In [None]:
import os

# Directory the Kaggle CLI should read its credentials from (relative to the
# notebook's working directory).
KAGGLE_CONFIG_DIR = './drive/MyDrive/Kaggle'
os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_CONFIG_DIR

In [None]:
# Download the Titanic competition data with the Kaggle CLI (shell escape).
# NOTE(review): later cells read ./train.csv and ./test.csv directly — confirm the
# CLI version in use writes those files rather than a zip archive.
!kaggle competitions download -c titanic

In [None]:
# Core numeric / tabular libraries.
import numpy as np
import pandas as pd

# Plotting setup: register pandas' matplotlib converters, render figures inline,
# and use seaborn's 'dark' style.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('dark')

In [None]:
# Read the train and test tables from the working directory, then show the
# training column names.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))
train_df.columns.values

array(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [None]:
import scipy.stats as ss
def cramers_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Builds the contingency table of ``x`` against ``y``, takes the chi-squared
    statistic from ``scipy.stats.chi2_contingency`` (scipy defaults), and applies
    the small-sample bias correction to both phi² and the table dimensions.

    Parameters
    ----------
    x, y : array-like / pandas.Series
        Equal-length sequences of category labels. Every distinct value is
        treated as its own category, so continuous inputs should be binned first.

    Returns
    -------
    float
        Association strength, 0 meaning none. Degenerate tables (fewer than two
        categories on either axis, fewer than two observations, or a corrected
        dimension that collapses to zero) return 0.0 instead of dividing by zero.
    """
    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape
    n_obs = contingency.to_numpy().sum()

    # With a single row/column (or a single observation) no association can be
    # measured and the corrections below would divide by zero.
    if min(n_rows, n_cols) < 2 or n_obs < 2:
        return 0.0

    chi2 = ss.chi2_contingency(contingency)[0]
    phi2 = chi2 / n_obs

    # Bias correction: shrink phi² and the effective table dimensions.
    phi2_corrected = max(0.0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))
    rows_corrected = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_corrected = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)

    denominator = min(cols_corrected - 1, rows_corrected - 1)
    # Happens when one axis has as many categories as there are observations.
    if denominator <= 0:
        return 0.0
    return np.sqrt(phi2_corrected / denominator)

In [None]:
# Take the honorific that sits between the comma and the first period of Name,
# then collapse rare titles into a few groups.
# Assigning the result back (instead of `train_df.Title.replace(..., inplace=True)`)
# avoids a chained in-place mutation, which is deprecated and has no effect on the
# frame when pandas Copy-on-Write is enabled.
train_title_groups = {
    'Mme': 'Miss', 'Ms': 'Miss', 'Lady': 'Miss', 'Mlle': 'Miss', 'the Countess': 'Miss', 'Dona': 'Miss',
    'Major': 'Mr', 'Col': 'Mr', 'Capt': 'Mr', 'Don': 'Mr', 'Sir': 'Mr', 'Jonkheer': 'Mr',
    'Dr': 'Others', 'Rev': 'Others',
}
train_df['Title'] = (
    train_df['Name']
    .apply(lambda name: name.split(',')[1].split('.')[0].strip())
    .replace(train_title_groups)
)

In [None]:
# Same title extraction and grouping for the test set.
# The result is assigned back rather than mutated through
# `test_df.Title.replace(..., inplace=True)`, which is a chained in-place update
# that is deprecated and silently does nothing under pandas Copy-on-Write.
test_title_groups = {
    'Mme': 'Miss', 'Ms': 'Miss', 'Lady': 'Miss', 'Mlle': 'Miss', 'the Countess': 'Miss', 'Dona': 'Miss',
    'Major': 'Mr', 'Col': 'Mr', 'Capt': 'Mr', 'Don': 'Mr', 'Sir': 'Mr', 'Jonkheer': 'Mr',
    'Dr': 'Others', 'Rev': 'Others',
}
test_df['Title'] = (
    test_df['Name']
    .apply(lambda name: name.split(',')[1].split('.')[0].strip())
    .replace(test_title_groups)
)

In [None]:
# Pearson correlations between the numeric columns only. Selecting them explicitly
# keeps this working on pandas >= 2.0, where corr() no longer drops string columns
# (Name, Sex, Title, ...) silently and raises instead.
train_df.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


## **PCLASS**

In [None]:
# Count missing Pclass values in the training set.
train_df['Pclass'].isna().sum()

0

In [None]:
# Association between survival and passenger class.
cramers_v(train_df['Survived'], train_df['Pclass'])

0.33668387622245516

## **TITLE**

In [None]:
# Count missing Title values in the training set.
train_df['Title'].isna().sum()

0

In [None]:
# Association between survival and the grouped title.
cramers_v(train_df['Survived'], train_df['Title'])

0.5668555996348488

## **SEX**

In [None]:
# Count missing Sex values in the training set.
train_df['Sex'].isna().sum()

0

In [None]:
# Association between survival and sex.
cramers_v(train_df['Survived'], train_df['Sex'])

0.5401999468101071

## **FAMILY**

In [None]:
# Family size: SibSp + Parch, plus one (presumably for the passenger themself).
train_df['Family'] = 1 + train_df['SibSp'] + train_df['Parch']
train_df['Family'].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: Family, dtype: int64

In [None]:
# Same family-size feature for the test set.
test_df['Family'] = 1 + test_df['SibSp'] + test_df['Parch']
test_df['Family'].value_counts()

1     253
2      74
3      57
4      14
5       7
11      4
7       4
6       3
8       2
Name: Family, dtype: int64

In [None]:
# Bin family size into (0,1], (1,4], (4,7], (7,11] -> Solo / Small / Big / TooBig.
# The size is recomputed from SibSp and Parch rather than read from the Family
# column this cell overwrites, so re-running the cell gives the same result
# instead of calling pd.cut on an already-categorical column.
train_family_size = train_df['SibSp'] + train_df['Parch'] + 1
train_df['Family'] = pd.cut(train_family_size, [0, 1, 4, 7, 11], labels=['Solo', 'Small', 'Big', 'TooBig'])
train_df['Family'].value_counts()

Solo      537
Small     292
Big        49
TooBig     13
Name: Family, dtype: int64

In [None]:
# Bin test-set family size into (0,1], (1,4], (4,7], (7,11] -> Solo / Small / Big / TooBig.
# The size is recomputed from SibSp and Parch rather than read from the Family
# column this cell overwrites, so the cell can be re-run safely.
test_family_size = test_df['SibSp'] + test_df['Parch'] + 1
test_df['Family'] = pd.cut(test_family_size, [0, 1, 4, 7, 11], labels=['Solo', 'Small', 'Big', 'TooBig'])
test_df['Family'].value_counts()

Solo      253
Small     145
Big        14
TooBig      6
Name: Family, dtype: int64

In [None]:
# Association between survival and the binned family size.
cramers_v(train_df['Survived'], train_df['Family'])

0.2857346463065578

## **TICKET**

In [None]:
# Count missing Ticket values in the training set.
train_df['Ticket'].isna().sum()

0

In [None]:
# Frequency of each raw ticket string.
train_df['Ticket'].value_counts()

1601          7
CA. 2343      7
347082        7
347088        6
CA 2144       6
             ..
2623          1
A./5. 2152    1
350407        1
111369        1
371362        1
Name: Ticket, Length: 681, dtype: int64

In [None]:
# Keep only the first character of each ticket as a coarse ticket category.
train_df['Ticket'] = [ticket[0] for ticket in train_df['Ticket']]
train_df['Ticket'].value_counts()

3    301
2    183
1    146
P     65
S     65
C     47
A     29
W     13
4     10
7      9
F      7
6      6
L      4
5      3
8      2
9      1
Name: Ticket, dtype: int64

In [None]:
# Same first-character ticket category for the test set.
test_df['Ticket'] = [ticket[0] for ticket in test_df['Ticket']]
test_df['Ticket'].value_counts()

3    128
2     95
1     64
P     33
S     33
C     30
A     13
W      6
F      6
7      4
6      3
9      1
4      1
L      1
Name: Ticket, dtype: int64

In [None]:
# Association between survival and the ticket's first character.
cramers_v(train_df['Survived'], train_df['Ticket'])

0.3368277080379243

## **FARE**

In [None]:
# Count missing Fare values in the training set.
train_df['Fare'].isna().sum()

0

In [None]:
# Count missing Fare values in the test set.
test_df['Fare'].isna().sum()

1

In [None]:
# Association between fare and port of embarkation.
# NOTE(review): Fare is continuous and every distinct value becomes its own
# category here, which likely inflates the statistic — consider binning Fare first.
cramers_v(train_df['Fare'], train_df['Embarked'])

0.7983506279145837

In [None]:
# Association between fare (unbinned) and passenger class.
cramers_v(train_df['Fare'], train_df['Pclass'])

0.8226552735806314

In [None]:
# Association between fare (unbinned) and the ticket's first character.
cramers_v(train_df['Fare'], train_df['Ticket'])

0.627651312368573

In [None]:
# Port of embarkation for the test row(s) whose Fare is missing.
test_df.loc[test_df['Fare'].isna(), 'Embarked']

152    S
Name: Embarked, dtype: object

In [None]:
# Passenger class for the test row(s) whose Fare is missing.
test_df.loc[test_df['Fare'].isna(), 'Pclass']

152    3
Name: Pclass, dtype: int64

In [None]:
# Ticket category for the test row(s) whose Fare is missing.
test_df.loc[test_df['Fare'].isna(), 'Ticket']

152    3
Name: Ticket, dtype: object

In [None]:
# Median fare among test passengers sharing the profile of the row with the
# missing fare (ticket category '3', third class, embarked at 'S').
same_profile = (test_df['Ticket'] == '3') & (test_df['Pclass'] == 3) & (test_df['Embarked'] == 'S')
guess_Fare = test_df.loc[same_profile, 'Fare'].median()

In [None]:
# Fill the missing test fare with the profile median computed above.
# The filled column is assigned back; `test_df.Fare.fillna(..., inplace=True)` is a
# chained in-place update that is deprecated and leaves the frame unchanged under
# pandas Copy-on-Write.
test_df['Fare'] = test_df['Fare'].fillna(guess_Fare)

In [None]:
# Frequency of each distinct fare in the training set.
train_df['Fare'].value_counts()

8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64

## **EMBARKED**

In [None]:
# Count missing Embarked values in the training set.
train_df['Embarked'].isna().sum()

2

In [None]:
# Frequency of each embarkation port in the training set.
train_df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [None]:
# Fill the missing ports with 'S', the most frequent value in the counts above.
# The filled column is assigned back; `train_df.Embarked.fillna(..., inplace=True)`
# is a chained in-place update that is deprecated and leaves the frame unchanged
# under pandas Copy-on-Write.
train_df['Embarked'] = train_df['Embarked'].fillna('S')

In [None]:
# Count missing Embarked values in the test set.
test_df['Embarked'].isna().sum()

0

In [None]:
# Association between survival and port of embarkation.
cramers_v(train_df['Survived'], train_df['Embarked'])

0.16408491889698326

In [None]:
# Association between port of embarkation and fare (unbinned).
cramers_v(train_df['Embarked'], train_df['Fare'])

0.7983506279145837

## **CABIN**

In [None]:
# Count missing Cabin values in the training set.
train_df['Cabin'].isna().sum()

687

## **AGE**

In [None]:
# Count missing Age values in the training set.
train_df['Age'].isna().sum()

177

In [None]:
# Association between survival and age.
# NOTE(review): Age is continuous and is passed unbinned, so each distinct age is
# treated as a separate category — consider binning before comparing values.
cramers_v(train_df['Survived'], train_df['Age'])

0.15456614283818954

In [None]:
# Association between age (unbinned) and the grouped title.
cramers_v(train_df['Age'], train_df['Title'])

0.37804028105785936

In [None]:
# Association between age and fare (both unbinned).
cramers_v(train_df['Age'], train_df['Fare'])

0.14462120455817193

In [None]:
# Association between age (unbinned) and the binned family size.
cramers_v(train_df['Age'], train_df['Family'])

0.21087848875019624

In [None]:
# Association between age (unbinned) and passenger class.
cramers_v(train_df['Age'], train_df['Pclass'])

0.2778415721999506

In [None]:
# Association between age (unbinned) and Parch.
cramers_v(train_df['Age'], train_df['Parch'])

0.2660040032906386

In [None]:
# Association between age (unbinned) and SibSp.
cramers_v(train_df['Age'], train_df['SibSp'])

0.22831559036973606

## **Model**

In [None]:
# Columns fed to the models, and the training design matrix / target.
features = ['Pclass', 'Sex', 'Title', 'Family', 'Ticket', 'Age', 'Fare', 'Embarked']
X_train, y_train = train_df[features], train_df['Survived']

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# The import cell above only brings in RandomForestRegressor, so the classifier
# used below was undefined on a fresh kernel (NameError). Import it here.
from sklearn.ensemble import RandomForestClassifier

numerical_cols = ['Fare', 'Age']
categorical_cols = ['Pclass', 'Sex', 'Title', 'Family', 'Ticket', 'Embarked']

# Numeric columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')
# Categorical columns: one-hot encode, ignoring categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocessing + random forest in one pipeline so that cross-validation refits
# the imputer and encoder inside each fold.
rfc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0, n_estimators=2000, max_depth=5))
])

# Fit on the full training set (used for the test predictions later), then report
# the mean 20-fold cross-validation score.
rfc.fit(X_train, y_train)
print(cross_val_score(rfc, X_train, y_train, cv=20).mean())

0.8237121212121211


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees behind the same preprocessor object as the random forest.
xgb_params = {
    'n_estimators': 10000,
    'max_depth': 4,
    'min_child_weight': 2,
    'gamma': 0.9,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'nthread': -1,
    'scale_pos_weight': 1,
}
xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(**xgb_params)),
])

xgb.fit(X_train, y_train)

# Mean score over 20 cross-validation folds.
xgb_cv_scores = cross_val_score(xgb, X_train, y_train, cv=20)
print('Cross validation score: {:.3f}'.format(xgb_cv_scores.mean()))

Cross validation score: 0.818


In [None]:
# Test design matrix with the same feature columns as training; preview it.
X_test = test_df.loc[:, features]
X_test.head()

Unnamed: 0,Pclass,Sex,Title,Family,Ticket,Age,Fare,Embarked
0,3,male,Mr,Solo,3,34.5,7.8292,Q
1,3,female,Mrs,Small,3,47.0,7.0,S
2,2,male,Mr,Solo,2,62.0,9.6875,Q
3,3,male,Mr,Solo,3,27.0,8.6625,S
4,3,female,Mrs,Small,3,22.0,12.2875,S


In [None]:
# Predict survival for the test passengers with the fitted random-forest pipeline.
predictions = rfc.predict(X_test)

In [None]:
# Two-column submission table (PassengerId, Survived), written without the index.
output = test_df[['PassengerId']].assign(Survived=predictions)
output.to_csv('my_submission.csv', index=False)

In [None]:
# Submit the predictions file to the Titanic competition with the Kaggle CLI.
!kaggle competitions submit -c titanic -f my_submission.csv -m "Tf"

100% 2.77k/2.77k [00:02<00:00, 1.31kB/s]
403 - Your team has used its submission allowance (10 of 10). This resets at midnight UTC (13 hours from now).
