In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
df_test.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
selected_columns = list(df_train.columns)
selected_columns.remove('Survived')
selected_columns.remove('PassengerId')
selected_columns.remove('Name')
selected_columns.remove('Ticket')
selected_columns.remove('Cabin')
selected_columns

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Handle Missinng Data

In [7]:
# Imputation/ Fill missing data
df_train['Age'].mean() 

29.69911764705882

In [8]:
df_train['Age'].median()

28.0

In [9]:
df_train['Age'].mode()[0]

24.0

In [10]:
df_train['Age'].fillna(df_train['Age'].mean(), inplace=True)
df_train['Embarked'].fillna(df_train['Embarked'].mode()[0], inplace=True)

In [11]:
df_train[selected_columns].isna().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

# Handle Object Cols/ Feature Encoding

In [12]:
df_train[selected_columns].dtypes

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
dtype: object

In [13]:
df_train['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [14]:
df_train['Sex'] = df_train['Sex'].map({'male':0, 'female':1})
df_train['Embarked'] = df_train['Embarked'].map({'S':0, 'C':1, 'Q':2})

In [15]:
df_train[selected_columns].dtypes

Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object

# Building ML model

In [16]:
X = df_train[selected_columns]
y = df_train['Survived']

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_val, y_train, y_val= train_test_split(X,y,test_size=0.2, random_state=13)

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
#score using RF without hyperparameter tuning

model = RandomForestClassifier(random_state=13)
model.fit(X_train,y_train)
model.score(X_train,y_train), model.score(X_val,y_val)

(0.9831460674157303, 0.8212290502793296)

In [24]:
# Hyper parameter tuning using GridSearchCV, RandomSearchCV
params = {'n_estimators':[10,25,50,75,100],
          'min_samples_leaf':[2,3,4,5,6],
          'max_depth':[2,3,4],
          'min_samples_split':[4,5,6,7]}


In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
grid_cv = GridSearchCV(RandomForestClassifier(), params, cv=5, verbose=3, n_jobs=-1)

In [27]:
grid_cv.fit(X_train,y_train)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [2, 3, 4],
                         'min_samples_leaf': [2, 3, 4, 5, 6],
                         'min_samples_split': [4, 5, 6, 7],
                         'n_estimators': [10, 25, 50, 75, 100]},
             verbose=3)

In [29]:
grid_cv.best_estimator_

RandomForestClassifier(max_depth=4, min_samples_leaf=2, min_samples_split=6,
                       n_estimators=50)

In [30]:
#score using RF with hyperparameter tuning
model1 = grid_cv.best_estimator_
model1.fit(X_train,y_train)
model1.score(X_train,y_train), model.score(X_val,y_val)

(0.8398876404494382, 0.8212290502793296)

## Select final model with best accuracy on unknown data(X_val)

In [31]:
# Consider Final selected model is RF without hyperparameter tuning
final_model = RandomForestClassifier(random_state=13)
final_model.fit(X,y)

RandomForestClassifier(random_state=13)

In [32]:
#using model1 created using hyper parameter tuning

model1=RandomForestClassifier(random_state=13)
model1.fit(X,y)

RandomForestClassifier(random_state=13)

### Use final model to make predictions on test data(df_test)

In [33]:
df_test[selected_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB


In [34]:
# Handle missing data from X_test
df_test['Age'].fillna(df_train['Age'].mean(), inplace=True)
df_test['Fare'].fillna(df_train['Fare'].mean(), inplace=True)

In [35]:
df_test['Sex'] = df_test['Sex'].map({'male':0, 'female':1})
df_test['Embarked'] = df_test['Embarked'].map({'S':0, 'C':1, 'Q':2})

In [36]:
df_test[selected_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    int64  
 2   Age       418 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      418 non-null    float64
 6   Embarked  418 non-null    int64  
dtypes: float64(2), int64(5)
memory usage: 23.0 KB


In [37]:
X_test = df_test[selected_columns]

In [33]:
yp = final_model.predict(X_test)

In [34]:
yp

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [36]:
df_test['Survived'] = yp

In [41]:
df_test[['PassengerId','Survived']].to_csv('first.csv', index = False)

In [38]:
#prediction using model1(RF with hyper parameter tuning)

yp = model1.predict(X_test)

In [39]:
yp

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [40]:
df_test['Survived'] = yp

In [42]:
df_test[['PassengerId','Survived']].to_csv('firstmodel.csv', index = False)