In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score,f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import pickle

In [3]:
df_train = pd.read_csv('../../datasets/titanic/train.csv')
df_test = pd.read_csv('../../datasets/titanic/test.csv')

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
df_train['Age'].fillna(value=df_train['Age'].median(),inplace=True)
df_test['Age'].fillna(value=df_train['Age'].median(),inplace=True)

df_train['Embarked'].fillna(value=df_train['Embarked'].mode().iloc[0], inplace=True)
df_test['Embarked'].fillna(value=df_train['Embarked'].mode().iloc[0], inplace=True)

In [6]:
df_train['isCabin'] = (~df_train['Cabin'].isna()).apply(int)
df_test['isCabin'] = (~df_test['Cabin'].isna()).apply(int)

In [7]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
isCabin          0
dtype: int64

In [8]:
df_test['Fare'].fillna(value=df_train['Fare'].median(),inplace=True)

In [9]:
df_train['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [10]:
name1 = 'Braund, Mr. Owen Harris'
name2 = 'Graham, Miss. Margaret Edith'
def get_title(name):
    return name.split(',')[1].split('.')[0].strip()
get_title(name1), get_title(name2) # Mr, Miss

('Mr', 'Miss')

In [11]:
df_train['Title'] = df_train['Name'].apply(get_title)
df_test['Title'] = df_test['Name'].apply(get_title)

In [12]:
df_train['Title'] = df_train['Title'].apply(lambda X:X if X in ['Mr','Mrs','Miss','Master'] else 'other')
df_test['Title'] = df_test['Title'].apply(lambda X:X if X in ['Mr','Mrs','Miss','Master'] else 'other')

In [13]:
df_train['Age_Cat'] = df_train['Age'].apply(lambda x: '0-10' if 0 <= x <= 10 else ('11-25' if 11 <= x <= 25 else ('26-40' if 26 <= x <= 40 else ('41-60' if 41 <= x <= 60 else '>60'))))
df_test['Age_Cat'] = df_test['Age'].apply(lambda x: '0-10' if 0 <= x <= 10 else ('11-25' if 11 <= x <= 25 else ('26-40' if 26 <= x <= 40 else ('41-60' if 41 <= x <= 60 else '>60'))))

In [14]:
name1 = 'STON/O2. 3101282'
name2 = '113803'

def get_ticket_number(name):
    ticket_number = name.split(' ')[0]
    if ticket_number != name:
        return ticket_number
    else:
        return 'NaN'

def get_ticket_item(name):
    ticket_item = name.split(' ')[0]
    if ticket_item == name:
        return ticket_item
    else:
        return 'NaN'
get_ticket_item(name1), get_ticket_item(name2)


('NaN', '113803')

In [15]:
df_train['Ticket_number'] = df_train['Ticket'].apply(get_ticket_number)
df_train['Ticket_item'] = df_train['Ticket'].apply(get_ticket_item)

In [16]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,isCabin,Title,Age_Cat,Ticket_number,Ticket_item
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr,11-25,A/5,
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs,26-40,PC,
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,Miss,26-40,STON/O2.,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs,26-40,,113803.0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr,26-40,,373450.0


# Encoding

In [17]:
# Label Encoding 
df_train['Sex'] = df_train['Sex'].map({'female':1,'male':0})

In [18]:
df_test['Sex'] = df_test['Sex'].map({'female':1,'male':0})

In [19]:
# OneHotEncoding 
df_train[['C','Q','S']]= pd.get_dummies(df_train['Embarked'])
df_test[['C','Q','S']]= pd.get_dummies(df_test['Embarked'])

In [20]:
df_train[['Master','Miss','Mr','Mrs','other']] = pd.get_dummies(df_train['Title'])
df_test[['Master','Miss','Mr','Mrs','other']] = pd.get_dummies(df_test['Title'])

In [21]:
df_train['Age_Cat'] = df_train['Age_Cat'].map({'0-10':0,'11-25':1,'26-40':2,'41-60':3,'>60':4})
df_test['Age_Cat'] = df_test['Age_Cat'].map({'0-10':0,'11-25':1,'26-40':2,'41-60':3,'>60':4})

In [22]:
# merging two col 
df_train['Family'] = df_train['SibSp'] + df_train['Parch']
df_test['Family'] = df_test['SibSp'] + df_test['Parch']

# Feature selection - corr

In [23]:
df_train.corr()['Survived']

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'

# All Model Building & Test

In [24]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'isCabin', 'Title',
       'Age_Cat', 'Ticket_number', 'Ticket_item', 'C', 'Q', 'S', 'Master',
       'Miss', 'Mr', 'Mrs', 'other', 'Family'],
      dtype='object')

In [25]:
selected_columns = ['Pclass', 'Sex', 'Fare', 'isCabin','C','S','Miss', 'Mr', 'Mrs']

In [26]:
selected_columns1 =  ['Pclass', 'Sex', 'Fare', 'isCabin','Age_Cat',
       'C', 'S',  'Miss', 'Mr', 'Mrs', 'other', 'Family']

In [27]:
selected_columns2 = ['Pclass', 'Sex',  'Fare', 'isCabin','C', 'S', 'Miss', 'Mr', 'Mrs', 'Family']

In [28]:
X = df_train[selected_columns1]
y = df_train['Survived']

In [29]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=27)

In [30]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
pickle.dump(scaler,open('titanic_scaling.pkl','wb'))

In [32]:
DT_params={'max_depth':[1,2,3,4,5,6,7,8,9],
       'min_samples_leaf':[3,4,5,6,7,8,9]}

DT_gridcv = GridSearchCV(DecisionTreeClassifier(),param_grid=DT_params,cv=5,n_jobs=-1)
DT_gridcv.fit(X_train,y_train)

In [31]:
DT_gridcv.best_estimator_

In [32]:
RF_params={'n_estimators':[10,25,50,100,150,250],
       'max_depth':[1,2,3,4,5],
       'min_samples_leaf':[3,4,5,6,7],
       'max_features':[2,4,6,8]}

RF_gridcv = GridSearchCV(RandomForestClassifier(),param_grid=RF_params,cv=5,n_jobs=-1)
RF_gridcv.fit(X_train,y_train)

In [33]:
RF_gridcv.best_estimator_

In [34]:
models = {
    'Log reg':LogisticRegression(),
    'SVM-Lin':SVC(kernel='linear'),
    'SVM-Pol':SVC(kernel='poly'),
    'SVM-RBF':SVC(kernel='rbf'),
    'DT': DecisionTreeClassifier(max_depth=5, min_samples_leaf=4),
    'RF':RandomForestClassifier(max_depth=5, max_features=6, min_samples_leaf=5,
                       n_estimators=10, n_jobs=-1),
    'KNN-3':KNeighborsClassifier(n_neighbors=3),
    'KNN-5':KNeighborsClassifier(n_neighbors=5),
    'KNN-7':KNeighborsClassifier(n_neighbors=7),
    'AdB':AdaBoostClassifier()
}

In [35]:

print('Model','\tACC','Recall','Pre','F1-score',sep='\t')
print('-'*50)
for name,model in models.items():
    model.fit(X_train_scaled,y_train)
    yp = model.predict(X_test_scaled )
    acc = accuracy_score(y_test,yp)
    rec = recall_score(y_test,yp)
    pre = precision_score(y_test,yp)
    f1 = f1_score(y_test,yp)
    print(name,'',round(acc,2),round(rec,2),round(pre,2),round(f1,2),sep='\t')

Model		ACC	Recall	Pre	F1-score
--------------------------------------------------
Log reg		0.81	0.74	0.76	0.75
SVM-Lin		0.78	0.73	0.71	0.72
SVM-Pol		0.83	0.66	0.85	0.74
SVM-RBF		0.83	0.67	0.84	0.75
DT		0.8	0.69	0.77	0.73
RF		0.82	0.74	0.78	0.76
KNN-3		0.81	0.76	0.75	0.76
KNN-5		0.83	0.72	0.8	0.76
KNN-7		0.81	0.68	0.78	0.73
AdB		0.83	0.79	0.76	0.77


# Model Building

In [33]:
model = RandomForestClassifier(max_depth=5, max_features=6, min_samples_leaf=5,
                       n_estimators=10, n_jobs=-1)
model.fit(df_train[selected_columns1],df_train['Survived'])

# Pickling The Model file For Deployment

In [35]:
pickle.dump(model,open('titanic_model.pkl','wb'))

In [36]:
pickled_model=pickle.load(open('titanic_model.pkl','rb'))

In [38]:
predict = pickled_model.predict(df_test[selected_columns1])

In [39]:
predict

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [40]:
df_submit = pd.DataFrame({'PassengerId':df_test['PassengerId'],'Survived':predict})

In [41]:
df_submit.to_csv('sub_RF01.csv',index=False)

In [46]:
print(selected_columns1)

['Pclass', 'Sex', 'Fare', 'isCabin', 'Age_Cat', 'C', 'S', 'Miss', 'Mr', 'Mrs', 'other', 'Family']


In [45]:
df_test[selected_columns1].head()

Unnamed: 0,Pclass,Sex,Fare,isCabin,Age_Cat,C,S,Miss,Mr,Mrs,other,Family
0,3,0,7.8292,0,2,False,False,False,True,False,False,0
1,3,1,7.0,0,3,False,True,False,False,True,False,1
2,2,0,9.6875,0,4,False,False,False,True,False,False,0
3,3,0,8.6625,0,2,False,True,False,True,False,False,0
4,3,1,12.2875,0,1,False,True,False,False,True,False,2


# Make predictions

In [37]:
yp = model.predict(df_test[selected_columns1])

In [38]:
df_submit = pd.DataFrame({'PassengerId':df_test['PassengerId'],'Survived':yp})

In [40]:
df_submit.to_csv('sub_RF11.csv',index=False)