In [50]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
import pandas_profiling

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix

In [96]:
train = pd.read_csv(r'train.csv')
test= pd.read_csv(r'test.csv')

In [52]:
train.shape

(891, 12)

In [53]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [54]:
train_X = train[train.columns.difference(['PassengerId', 'Survived','Name','Ticket','Cabin'])]
y = train['Survived']

In [55]:
categorical_feature = [feature for feature in train_X.columns if train[feature].dtypes=='O']
numerical_feature = [feature for feature in train_X.columns if train[feature].dtypes!='O']

In [56]:
df_cat = train_X[categorical_feature]
df_num = train_X[numerical_feature]

In [57]:
df_cat

Unnamed: 0,Embarked,Sex
0,S,male
1,C,female
2,S,female
3,S,female
4,S,male
...,...,...
886,S,male
887,S,female
888,S,female
889,C,male


In [58]:
def missings_treat(x):
    x = x.fillna(x.median())
    return x

df_num = df_num.apply(lambda x: missings_treat(x))

In [59]:
df_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     891 non-null    float64
 1   Fare    891 non-null    float64
 2   Parch   891 non-null    int64  
 3   Pclass  891 non-null    int64  
 4   SibSp   891 non-null    int64  
dtypes: float64(2), int64(3)
memory usage: 34.9 KB


- There are only 2 missings in categorical so in this case we ca

In [60]:
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Embarked  889 non-null    object
 1   Sex       891 non-null    object
dtypes: object(2)
memory usage: 14.0+ KB


In [61]:
Sex = pd.get_dummies(df_cat['Sex'], drop_first = True)
Sex

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [62]:
Embarked = pd.get_dummies(df_cat['Embarked'], drop_first = True)
Embarked

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [63]:
dataset = pd.concat([df_num, Sex,Embarked], axis = 1)
dataset.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,male,Q,S
0,22.0,7.25,0,3,1,1,0,1
1,38.0,71.2833,0,1,1,0,0,0
2,26.0,7.925,0,3,0,0,0,1
3,35.0,53.1,0,1,1,0,0,1
4,35.0,8.05,0,3,0,1,0,1


In [64]:
dataset.dropna(inplace = True)

In [65]:
scaling_feature=[feature for feature in dataset.columns if feature not in ['Survived']]
len(scaling_feature)

8

In [66]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[scaling_feature])

MinMaxScaler()

In [67]:
scaler.transform(dataset[scaling_feature])

array([[0.27117366, 0.01415106, 0.        , ..., 1.        , 0.        ,
        1.        ],
       [0.4722292 , 0.13913574, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.32143755, 0.01546857, 0.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.34656949, 0.04577135, 0.33333333, ..., 0.        , 0.        ,
        1.        ],
       [0.32143755, 0.0585561 , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.39683338, 0.01512699, 0.        , ..., 1.        , 1.        ,
        0.        ]])

In [68]:
# transform the train and test set, and add on the Id and SalePrice variables
data = pd.DataFrame(scaler.transform(dataset[scaling_feature]), columns=scaling_feature)

In [69]:
data

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,male,Q,S
0,0.271174,0.014151,0.000000,1.0,0.125,1.0,0.0,1.0
1,0.472229,0.139136,0.000000,0.0,0.125,0.0,0.0,0.0
2,0.321438,0.015469,0.000000,1.0,0.000,0.0,0.0,1.0
3,0.434531,0.103644,0.000000,0.0,0.125,0.0,0.0,1.0
4,0.434531,0.015713,0.000000,1.0,0.000,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...
886,0.334004,0.025374,0.000000,0.5,0.000,1.0,0.0,1.0
887,0.233476,0.058556,0.000000,0.0,0.000,0.0,0.0,1.0
888,0.346569,0.045771,0.333333,1.0,0.125,0.0,0.0,1.0
889,0.321438,0.058556,0.000000,0.0,0.000,1.0,0.0,0.0


In [70]:
train_X,test_X, train_y, test_y= train_test_split(data,y,test_size=0.3,random_state=1234)

print(train_X. shape)
print(test_X.shape)
print(train_y. shape)
print(test_y.shape)

(623, 8)
(268, 8)
(623,)
(268,)


In [71]:
test_X.isnull().sum()

Age       0
Fare      0
Parch     0
Pclass    0
SibSp     0
male      0
Q         0
S         0
dtype: int64

In [72]:
pargrid_ada = {'n_estimators': [50, 60, 70, 80, 90, 100],
                'max_features': [2,3,4],
              'max_depth': [2,3,4,5,6]}
gscv_Rf = GridSearchCV(estimator=RandomForestClassifier(), 
                        param_grid=pargrid_ada, 
                        cv=5,
                        verbose=True, n_jobs=-1)

In [73]:
gscv_results = gscv_Rf.fit(train_X, train_y)
gscv_results.best_score_

Fitting 5 folds for each of 90 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 352 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:    9.4s finished


0.8249548387096773

In [74]:
gscv_results.best_params_

{'max_depth': 6, 'max_features': 2, 'n_estimators': 80}

In [75]:
gscv_results.predict(train_X)

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,

In [76]:
radm_clf = RandomForestClassifier(oob_score=True,n_estimators=60 , max_depth = 3, max_features=3, n_jobs=-1)
radm_clf.fit( train_X, train_y )

RandomForestClassifier(max_depth=3, max_features=3, n_estimators=60, n_jobs=-1,
                       oob_score=True)

In [77]:
#Train data - AUC Score
print(metrics.roc_auc_score(train_y, pd.DataFrame(radm_clf.predict_proba(train_X))[1]))

#Test data - AUC Score
print(metrics.roc_auc_score(test_y, pd.DataFrame(radm_clf.predict_proba(test_X))[1]))

0.8831755874673629
0.8677947082447435


In [78]:
print(metrics.classification_report(train_y, radm_clf.predict(train_X)))

              precision    recall  f1-score   support

           0       0.81      0.96      0.88       383
           1       0.91      0.64      0.75       240

    accuracy                           0.83       623
   macro avg       0.86      0.80      0.81       623
weighted avg       0.85      0.83      0.83       623



In [79]:
print(metrics.classification_report(test_y, radm_clf.predict(test_X)))

              precision    recall  f1-score   support

           0       0.80      0.96      0.88       166
           1       0.91      0.62      0.74       102

    accuracy                           0.83       268
   macro avg       0.86      0.79      0.81       268
weighted avg       0.85      0.83      0.82       268



In [80]:
test_pred =  radm_clf.predict(test_X)

In [81]:
test_pred

array([1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0], dtype=int64)

In [82]:
confusion_matrix = confusion_matrix(test_y, pd.Series(test_pred))
print(confusion_matrix)

[[160   6]
 [ 39  63]]


In [150]:
test= pd.read_csv(r'test.csv')

In [151]:
val = test.copy()

In [152]:
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [153]:
val.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [154]:
val.drop(columns= ['PassengerId','Name','Cabin','Ticket'], inplace = True)

In [155]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [156]:
val.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [157]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [158]:
df_cat_test = val[categorical_feature]
df_num_test = val[numerical_feature]

In [159]:
df_cat_test.info()
df_num_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Embarked  418 non-null    object
 1   Sex       418 non-null    object
dtypes: object(2)
memory usage: 6.7+ KB


In [161]:
df_num_test = df_num_test.apply(lambda x: missings_treat(x))

In [162]:
Sex = pd.get_dummies(val['Sex'], drop_first = True)
Sex
Embarked = pd.get_dummies(val['Embarked'], drop_first = True)
Embarked

Unnamed: 0,male
0,1
1,0
2,1
3,1
4,0
...,...
413,1
414,0
415,1
416,1


In [164]:
df_test = pd.concat([df_num_test,Sex,Embarked], axis = 1)
df_test.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,male,Q,S
0,34.5,7.8292,0,3,0,1,1,0
1,47.0,7.0,0,3,1,0,0,1
2,62.0,9.6875,0,2,0,1,1,0
3,27.0,8.6625,0,3,0,1,0,1
4,22.0,12.2875,1,3,1,0,0,1


In [165]:
df_test.count()

Age       418
Fare      418
Parch     418
Pclass    418
SibSp     418
male      418
Q         418
S         418
dtype: int64

In [166]:
test_prediction =  radm_clf.predict(df_test)
test_prediction= pd.DataFrame([test.PassengerId,test_prediction]).T
test_prediction

In [169]:
test_prediction.columns = ['PassengerId','Predictions']
test_prediction.to_csv('Submission.csv')