## Import all pakages and file required

In [43]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

import joblib

In [5]:
xtrain = pd.read_csv('xtrain.csv')
xtest = pd.read_csv('xtest.csv')
ytrain = pd.read_csv('ytrain.csv')
ytest= pd.read_csv('ytest.csv')

In [6]:
yt = ytrain.to_numpy()
yte = ytest.to_numpy()

In [7]:
y_train = yt.ravel()
y_test = yte.ravel()

In [8]:
print(xtest.shape);
print(xtrain.shape);
print(y_test.shape);
print(y_train.shape)

(179, 10)
(712, 10)
(179,)
(712,)


In [9]:
print(xtest.columns);
print(xtrain.columns);

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'C', 'Q',
       'S'],
      dtype='object')
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'female', 'male', 'C', 'Q',
       'S'],
      dtype='object')


## Model building

In [10]:
rfc = RandomForestClassifier()

In [11]:
rfc.fit(xtrain, y_train)

RandomForestClassifier()

### predict

In [12]:
train_predict = rfc.predict(xtrain)

In [13]:
test_predict = rfc.predict(xtest)

### Evaluate Model

In [14]:
def evaluate_model(act, pred):
    print("Confusion Matrix \n", confusion_matrix(act, pred))
    print("Accuracy : ", accuracy_score(act, pred))
    print("Recall   : ", recall_score(act, pred))
    print("Precision: ", precision_score(act, pred))
    print("F1_score : ", f1_score(act, pred))

In [15]:
### Train data accuracy
print('----------------train predictions---------------')
evaluate_model(ytrain, train_predict)

### Test data accuracy
print('---------------------test predictions----------')
evaluate_model(ytest, test_predict)

----------------train predictions---------------
Confusion Matrix 
 [[433   3]
 [  8 268]]
Accuracy :  0.9845505617977528
Recall   :  0.9710144927536232
Precision:  0.988929889298893
F1_score :  0.979890310786106
---------------------test predictions----------
Confusion Matrix 
 [[101  12]
 [ 16  50]]
Accuracy :  0.8435754189944135
Recall   :  0.7575757575757576
Precision:  0.8064516129032258
F1_score :  0.7812499999999999


## model building using GridearchCV for parameter tunning

In [34]:
grid = param_grid = {"n_estimators" : [100, 150],
                    "max_depth" : [5,10],
                    "max_features" : [3, 2],
                   "min_samples_leaf" : [1, 2, 4]}

In [35]:
rf_grid = GridSearchCV(rfc, param_grid=grid)

In [36]:
rf_grid.fit(xtrain, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [5, 10], 'max_features': [3, 2],
                         'min_samples_leaf': [1, 2, 4],
                         'n_estimators': [100, 150]})

In [37]:
rf_grid.best_params_

{'max_depth': 5, 'max_features': 3, 'min_samples_leaf': 2, 'n_estimators': 150}

### predict

In [38]:
train_pred = rf_grid.predict(xtrain)
test_pred = rf_grid.predict(xtest)

### Evaluate model

In [39]:
### Train data accuracy
print('----------------train predictions---------------')
evaluate_model(ytrain, train_pred)

### Test data accuracy
print('---------------------test predictions----------')
evaluate_model(ytest, test_pred)

----------------train predictions---------------
Confusion Matrix 
 [[412  24]
 [ 86 190]]
Accuracy :  0.8455056179775281
Recall   :  0.6884057971014492
Precision:  0.8878504672897196
F1_score :  0.7755102040816327
---------------------test predictions----------
Confusion Matrix 
 [[104   9]
 [ 19  47]]
Accuracy :  0.8435754189944135
Recall   :  0.7121212121212122
Precision:  0.8392857142857143
F1_score :  0.7704918032786886


## Export best model

In [40]:
# the best model is
rf = RandomForestClassifier(max_depth= 5, max_features= 3, min_samples_leaf= 2, n_estimators= 150)

In [41]:
rf.fit(xtrain, y_train)

RandomForestClassifier(max_depth=5, max_features=3, min_samples_leaf=2,
                       n_estimators=150)

In [44]:
joblib.dump(rf_grid,'randomforestclassifier.joblib')

['randomforestclassifier.joblib']