In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msn

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import OrdinalEncoder

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

In [None]:
# Read the train and test splits from the Kaggle input directory.
train_data = pd.read_csv('../input/titanic/train.csv')
test_data = pd.read_csv('../input/titanic/test.csv')

# Show the column labels of each split, train first.
for split in (train_data, test_data):
    print(split.columns)

In [None]:
# Report (rows, columns) of each split, train first.
for split in (train_data, test_data):
    print(split.shape)

Merging the train and test data


In [None]:
# Stack the train and test rows into one frame so that every engineered
# feature is computed for both splits.
# ignore_index=True gives each row a unique label. Without it the test rows
# reuse the labels 0..len(test)-1 already taken by train rows, and any
# label-based alignment (column assignment, crosstab, reindex) becomes
# ambiguous or raises on the duplicated labels.
main_data = pd.concat([train_data, test_data], ignore_index=True)
print(main_data.head())
print(main_data.shape)

In [None]:
# Per-column completeness overview of the merged frame.
msn.bar(
    main_data,
    color='green',
    figsize=(20, 5),
)

In [None]:
# Mean of 'Survived' per gender, drawn on a narrow figure.
f, ax = plt.subplots(figsize=(2, 4))
# The new axes are the current axes, so passing them explicitly draws in the same place.
sns.barplot(x="Sex", y="Survived", data=main_data, label="Total", ax=ax)

In [None]:
# Survival rate per gender: number of survivors divided by the number of
# passengers of that gender with a non-null 'Survived' value.
survived_by_sex = main_data.groupby('Sex')['Survived']
main_data['sex_prob'] = survived_by_sex.transform('sum') / survived_by_sex.transform('count')

# Show which rate was assigned to which gender.
pd.crosstab(main_data['sex_prob'], main_data['Sex'], dropna=False)

**Survival according to the cabin:** `main_data['CabinProb'] = main_data.groupby('Cabin')['Survived'].transform('sum') / main_data.groupby('Cabin')['Survived'].transform('count')`

In [None]:
# Rows with no cabin value get the placeholder label 'Z'.
cabin_filled = main_data['Cabin'].fillna('Z')
main_data["Cabin"] = cabin_filled
# Cabin values mix letters and numbers; reducing them to the first letter is
# currently disabled, so the full cabin string is kept:
# main_data["Cabin"] = main_data["Cabin"].str[0:1]


In [None]:
# Mean of 'Survived' per cabin label on a wide figure.
f, ax = plt.subplots(figsize=(10, 5))

# The new axes are the current axes, so passing them explicitly draws in the same place.
sns.barplot(x="Cabin", y="Survived", data=main_data, label="Total", palette="tab10", ax=ax)

In [None]:
# Survival rate per cabin label: survivors / passengers with a non-null outcome.
survived_by_cabin = main_data.groupby('Cabin')['Survived']
main_data['cabinprob'] = survived_by_cabin.transform('sum') / survived_by_cabin.transform('count')
pd.crosstab(main_data['cabinprob'], main_data['Cabin'], dropna=False)

Extracting the survival probability from each person's title (salutation)

In [None]:
# Split each name on ', ' so that column 1 holds everything after the first
# comma (presumably "Title. Given names" — confirm against the data).
Name = main_data['Name'].str.split(', ', expand=True)
print(Name)
# The title is the text before the first '.' of that remainder.
main_data['Title'] = Name[1].str.split('.').str[0]


In [None]:
# Survival rate per title: survivors / passengers with a non-null outcome.
survived_by_title = main_data.groupby('Title')['Survived']
main_data['TitleProb'] = survived_by_title.transform('sum') / survived_by_title.transform('count')
pd.crosstab(main_data['TitleProb'], main_data['Title'], dropna=False)

**As we can see, some title groups contain only one person, so I regroup the titles, taking into account the sex of the people in each group and their probability of survival.**

In [None]:
# Merge rare or equivalent titles into four broader groups.
# Keys are the new group labels, values the raw titles folded into them.
title_groups = {
    'Rare': ['Rev', 'Don', 'Jonkheer', 'Capt'],
    'Dr+Col+Major+Sir': ['Dr', 'Col', 'Major', 'Sir'],
    'Miss': ['Lady', 'Ms', 'Miss'],
    'Mrs': ['Dona', 'Mlle', 'Mme', 'Mrs', 'the Countess'],
}
# Flatten to {raw title: group label} so a single replace() call does the mapping.
title_to_group = {raw: group for group, raws in title_groups.items() for raw in raws}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not update main_data when pandas Copy-on-Write is active.
main_data['Title'] = main_data['Title'].replace(title_to_group)

# Recompute the per-title survival rate on the regrouped titles.
survived_by_title = main_data.groupby('Title')['Survived']
main_data['TitleProb'] = survived_by_title.transform('sum') / survived_by_title.transform('count')
pd.crosstab(main_data.TitleProb, main_data.Title, dropna=False)

**Survival according to the Ticket**

In [None]:
# Keep only the first character of each ticket string.
main_data['Tickets'] = main_data['Ticket'].str.slice(0, 1)
main_data['Tickets']

In [None]:
# Survival rate per ticket prefix character: survivors / passengers with a non-null outcome.
survived_by_ticket = main_data.groupby('Tickets')['Survived']
main_data['Ticketss'] = survived_by_ticket.transform('sum') / survived_by_ticket.transform('count')
pd.crosstab(main_data['Ticketss'], main_data['Tickets'])


**Survival based on the number of siblings or "spouses" that the person has**

In [None]:
# Survival rate per SibSp value: survivors / passengers with a non-null outcome.
survived_by_sibsp = main_data.groupby('SibSp')['Survived']
main_data['SibSpProb'] = survived_by_sibsp.transform('sum') / survived_by_sibsp.transform('count')
pd.crosstab(main_data['SibSpProb'], main_data['SibSp'], dropna=False)


**Survival according to the number of parents or children**

In [None]:

# Survival rate per Parch value: survivors / passengers with a non-null outcome.
# A Parch value with no non-null outcomes yields 0 / 0, i.e. NaN.
survived_by_parch = main_data.groupby('Parch')['Survived']
main_data['ParchProb'] = survived_by_parch.transform('sum') / survived_by_parch.transform('count')
pd.crosstab(main_data['ParchProb'], main_data['Parch'], dropna=False)



In [None]:
# Replace missing ParchProb values with 0.
parch_prob_filled = main_data["ParchProb"].fillna(0)
main_data["ParchProb"] = parch_prob_filled

**Survival according to the class in which the person was traveling**

In [None]:
# Survival rate per passenger class: survivors / passengers with a non-null outcome.
survived_by_pclass = main_data.groupby('Pclass')['Survived']
main_data['PclassProb'] = survived_by_pclass.transform('sum') / survived_by_pclass.transform('count')
pd.crosstab(main_data['PclassProb'], main_data['Pclass'], dropna=False)

**Create a variable that measures survival by combining the ticket and the cabin:**

In [None]:

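# Disabled: the combined cabin/ticket feature is not computed.
# NOTE(review): 'CabinProb', 'Ticket2Prob' and 'Ticket2' are not created by the
# code cells of this notebook as shown — confirm the column names before re-enabling.
# main_data['CabTick'] = (main_data['CabinProb'] + main_data['Ticket2Prob'])/2
# pd.crosstab(main_data.CabTick, main_data.Ticket2, dropna=False)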



In [None]:


# Rows with a known outcome form the training set; rows without one are to be predicted.
has_outcome = main_data['Survived'].notna()
main_data_suv = main_data[has_outcome]
main_data_suv_nan = main_data[~has_outcome]



In [None]:
# Label column and the engineered rate features fed to the model.
target = 'Survived'
predictors = ['PclassProb', 'TitleProb', 'ParchProb']

# Features and labels from the rows with a known outcome.
X = main_data_suv.loc[:, predictors]
Y = main_data_suv.loc[:, target]
# Features of the rows whose outcome is to be predicted.
X_test = main_data_suv_nan.loc[:, predictors]

In [None]:
model = RandomForestClassifier(n_estimators=100, bootstrap=True, criterion='entropy',
                               min_samples_leaf=60, min_samples_split=4, random_state=16)

# Estimate generalisation with shuffled 10-fold cross-validation.
# cross_val_score fits clones of `model`, so the estimator itself is left
# unfitted by this step.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
# Score with classification accuracy directly. For 0/1 labels this equals
# 1 - mean_squared_error, so the reported mean is unchanged, but `scores`
# now holds per-fold accuracies rather than per-fold error rates.
scores = cross_val_score(model, X, Y, scoring='accuracy', cv=cv, n_jobs=1)
print(scores)
score = np.mean(scores)
print(score)

# Fit on all labelled rows; this fitted model is the one used for prediction.
model.fit(X, Y)

In [None]:
# Predict the outcome for the unlabelled rows.
predictions = model.predict(X_test)
# Take the ids from the 'PassengerId' column, not from the DataFrame index:
# the index is only a positional row label after the train/test concat and
# does not match the passenger ids the submission must carry.
output = pd.DataFrame({
    'PassengerId': main_data_suv_nan['PassengerId'].to_numpy(),
    'Survived': predictions,
})
# Predictions come back in the dtype of the training labels; write them as integers.
output['Survived'] = output['Survived'].astype('int')
output.to_csv('submission.csv', index=False)
# output.Survived
# output.Survived