In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# NOTE(review): `scipy.stats.stats` is a deprecated alias module (this import
# emits a DeprecationWarning); `pearsonr` is also exported by `scipy.stats`.
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

  from scipy.stats.stats import pearsonr


In [2]:
# Read the raw training and test tables from the working directory.
train_data = pd.read_csv('train.csv')
test_X = pd.read_csv('test.csv')

# Keep an untouched copy of the test table; its PassengerId column is read
# again later when the prediction table is assembled.
test_X_copy = test_X.copy()

# Separate the target column from the training features.
train_Y = train_data['Survived']
train_X = train_data.drop(columns='Survived')

In [3]:
# Preview the first five rows of the raw training features.
train_X.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Count missing values per column, then show the row count for comparison.
missing_per_column = train_X.isnull().sum()
print(missing_per_column)
print("Total rows = ", len(train_X))

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Total rows =  891


In [5]:
## Data Preprocessing

# Columns removed from the feature tables before modelling.
DROPPED_COLUMNS = ['PassengerId', 'Name', 'Ticket', 'Cabin']
# Integer codes used for the two categorical columns.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}


def preprocess_features(frame, fill_values):
    """Return a cleaned copy of a feature table.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw feature table containing at least the columns listed in
        ``DROPPED_COLUMNS`` plus ``Sex`` and ``Embarked``.
    fill_values : dict
        Mapping of column name -> value used to replace missing entries in
        that column. Columns not present in the mapping are not filled.

    Returns
    -------
    pandas.DataFrame
        New frame with ``DROPPED_COLUMNS`` removed, missing values filled and
        ``Sex`` / ``Embarked`` replaced by their integer codes. The input
        frame is not modified.
    """
    return (
        frame
        .drop(DROPPED_COLUMNS, axis=1)
        .fillna(fill_values)
        .assign(
            Sex=lambda df: df['Sex'].map(SEX_CODES),
            Embarked=lambda df: df['Embarked'].map(EMBARKED_CODES),
        )
    )


# Imputation values are learned from the training table only and then applied
# to both tables, so a given raw value is treated identically at fit and
# predict time (the same fit-on-train convention the Fare scaler follows).
fill_values = {
    'Age': train_X['Age'].mean(),
    'Embarked': train_X['Embarked'].mode()[0],
    'Fare': train_X['Fare'].mean(),
}

train_X = preprocess_features(train_X, fill_values)
test_X = preprocess_features(test_X, fill_values)

In [6]:
# Divide Age by 100 in both tables.
train_X[['Age']] = train_X[['Age']].div(100)
test_X[['Age']] = test_X[['Age']].div(100)

# Min-max scale Fare: the scaler is fitted on the training column only and
# the fitted transform is then applied to both tables.
scaler = MinMaxScaler()
scaler.fit(train_X[['Fare']])
train_X[['Fare']] = scaler.transform(train_X[['Fare']])
test_X[['Fare']] = scaler.transform(test_X[['Fare']])

In [7]:
# Preview the first five rows of the processed training features.
train_X.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,0.22,1,0,0.014151,0
1,1,1,0.38,1,0,0.139136,1
2,3,1,0.26,0,0,0.015469,0
3,1,1,0.35,1,0,0.103644,0
4,3,0,0.35,0,0,0.015713,0


In [8]:
# Print each feature's name followed by its Pearson correlation with the target.
for column_name, column_values in train_X.items():
    print(column_name)
    print(pearsonr(column_values, train_Y))

Pclass
PearsonRResult(statistic=-0.3384810359610166, pvalue=2.537047387977477e-25)
Sex
PearsonRResult(statistic=0.5433513806577593, pvalue=1.4060661308747188e-69)
Age
PearsonRResult(statistic=-0.06980851528714352, pvalue=0.03721708372681344)
SibSp
PearsonRResult(statistic=-0.03532249888573566, pvalue=0.29224392869800203)
Parch
PearsonRResult(statistic=0.08162940708348386, pvalue=0.014799245374711896)
Fare
PearsonRResult(statistic=0.2573065223849638, pvalue=6.120189341916209e-15)
Embarked
PearsonRResult(statistic=0.10681138570892018, pvalue=0.0014083124205682656)


In [9]:
# Remove the SibSp and Parch columns from both feature tables.
train_X = train_X.drop(columns=['SibSp', 'Parch'])
test_X = test_X.drop(columns=['SibSp', 'Parch'])

In [14]:
# Grid-search a random forest over tree count and depth with 5-fold CV.

# Fixed seed so the selected hyper-parameters, the scores below and the
# resulting predictions are the same on every Restart & Run All.
RANDOM_STATE = 42

# NOTE: despite the name `dtc`, this estimator is a random forest.
dtc = RandomForestClassifier(random_state=RANDOM_STATE)
grid = {'n_estimators': [2, 3, 4, 5, 6], 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10]}

gscv = GridSearchCV(dtc, grid, cv=5)
gscv.fit(train_X, train_Y)

print(gscv.best_params_)
# Mean cross-validated score of the best parameter combination.
print(gscv.best_score_)
# Score of the refitted best estimator on the data it was trained on; this is
# a training score, not a held-out estimate.
print(gscv.score(train_X, train_Y))

train_pred = gscv.predict(train_X)
# accuracy_score takes (y_true, y_pred) in that order.
print("Score =", accuracy_score(train_Y, train_pred))

{'max_depth': 10, 'n_estimators': 3}
0.8294520118008913
0.9158249158249159
Score = 0.9158249158249159


In [15]:
# Passenger ids come from the untouched copy of the test table, because the
# PassengerId column was dropped from test_X during preprocessing.
psg_id = test_X_copy['PassengerId']
# Predict with the best estimator found by the grid search.
predicted = gscv.predict(test_X)

In [16]:
# Assemble the prediction table: one row per test passenger, id then label.
predictions = pd.DataFrame({'PassengerId': psg_id, 'Survived': predicted})

predictions.head(n=5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,1


In [17]:
# Write the prediction table to disk without the index column.
predictions.to_csv(path_or_buf='dtc.csv', index=False)