In [110]:
import pandas as pd
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.Age = train.Age.fillna(train.Age.median())

In [4]:
train.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [5]:
train.Embarked = train.Embarked.fillna('S')

In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Here we 1st predicting the Cabin Value

In [7]:
df_train = train.copy()

In [8]:
# x_train data 
cabin_notnull_index = df_train[df_train['Cabin'].notnull()].index.tolist()

In [9]:
len(cabin_notnull_index)

204

In [10]:
# X_train data with all the notnull value index
df_cabin_notnull = df_train.iloc[cabin_notnull_index]
X_train = df_cabin_notnull.copy()

In [12]:
y_train = X_train.Cabin

In [13]:
y_train.shape

(204,)

In [14]:
y_train = y_train.astype(str).str[0]
y_train.value_counts()

C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

In [15]:
cabin_map = {'C':0,"B":1,"D":2,"E":3,'A':4,'F':5,"G":6,"T":7}
y_train = y_train.map(cabin_map)

In [17]:
y_train.value_counts()

0    59
1    47
2    33
3    32
4    15
5    13
6     4
7     1
Name: Cabin, dtype: int64

In [18]:
X_train = X_train.drop('Cabin',axis=1)

In [19]:
X_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,S
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,S


In [20]:
cabin_null_index = df_train[df_train['Cabin'].isnull()].index.tolist()
# X_test data with all the null value index
df_cabin_null = df_train.iloc[cabin_null_index]

In [23]:
X_test = df_cabin_null.copy()

In [24]:
X_test = X_test.drop('Cabin',axis=1)

In [25]:
X_test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S
5,6,0,3,"Moran, Mr. James",male,28.0,0,0,330877,8.4583,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,S


In [26]:
X_train.Sex = X_train.Sex.map({'male':0,'female':1})

X_test.Sex = X_test.Sex.map({'male':0,'female':1})

In [27]:
X_train['Title'] = X_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

X_test['Title'] = X_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [28]:
X_train['Title'] = X_train['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

X_train['Title'] = X_train['Title'].replace('Mlle', 'Miss')
X_train['Title'] = X_train['Title'].replace('Ms', 'Miss')
X_train['Title'] = X_train['Title'].replace('Mme', 'Mrs')

In [29]:
X_test['Title'] = X_test['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

X_test['Title'] = X_test['Title'].replace('Mlle', 'Miss')
X_test['Title'] = X_test['Title'].replace('Ms', 'Miss')
X_test['Title'] = X_test['Title'].replace('Mme', 'Mrs')

In [30]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

X_train.Title = X_train.Title.map(title_mapping)

X_test.Title = X_test.Title.map(title_mapping)

In [31]:
X_train = X_train.drop('Name',axis=1)

X_test = X_test.drop('Name',axis=1)

In [32]:
Embarked_map = {"S":0,'C':1,"Q":2}
X_train.Embarked = X_train.Embarked.map(Embarked_map)

X_test.Embarked = X_test.Embarked.map(Embarked_map)

In [34]:
X_train = X_train.drop('Ticket',axis=1)

X_test = X_test.drop('Ticket',axis=1)

X_train = X_train.drop('PassengerId',axis=1)

X_test = X_test.drop('PassengerId',axis=1)

In [36]:
X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
1,1,1,1,38.0,1,0,71.2833,1,3
3,1,1,1,35.0,1,0,53.1,0,3
6,0,1,0,54.0,0,0,51.8625,0,1
10,1,3,1,4.0,1,1,16.7,0,2
11,1,1,1,58.0,0,0,26.55,0,2


In [37]:
X_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,0,3,0,22.0,1,0,7.25,0,1
2,1,3,1,26.0,0,0,7.925,0,2
4,0,3,0,35.0,0,0,8.05,0,1
5,0,3,0,28.0,0,0,8.4583,2,1
7,0,3,0,2.0,3,1,21.075,0,4


## Predicting the Cabin value

In [38]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)
print(classifier.score(X_train,y_train))
y_predicted = classifier.predict(X_test)

0.9950980392156863


In [39]:
X_test['Cabin'] = y_predicted

In [40]:
X_test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,0,3,0,22.0,1,0,7.25,0,1,5
2,1,3,1,26.0,0,0,7.925,0,2,3
4,0,3,0,35.0,0,0,8.05,0,1,5
5,0,3,0,28.0,0,0,8.4583,2,1,5
7,0,3,0,2.0,3,1,21.075,0,4,5


In [41]:
X_test.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
Cabin       0
dtype: int64

In [42]:
X_test.shape

(687, 10)

In [43]:
X_train['Cabin'] = y_train

In [44]:
X_train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
1,1,1,1,38.0,1,0,71.2833,1,3,0
3,1,1,1,35.0,1,0,53.1,0,3,0
6,0,1,0,54.0,0,0,51.8625,0,1,3
10,1,3,1,4.0,1,1,16.7,0,2,6
11,1,1,1,58.0,0,0,26.55,0,2,0


In [45]:
X_train.shape

(204, 10)

In [46]:
X_train.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Title       0
Cabin       0
dtype: int64

In [47]:
train_data = pd.concat([X_train,X_test])

In [48]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
1,1,1,1,38.0,1,0,71.2833,1,3,0
3,1,1,1,35.0,1,0,53.1,0,3,0
6,0,1,0,54.0,0,0,51.8625,0,1,3
10,1,3,1,4.0,1,1,16.7,0,2,6
11,1,1,1,58.0,0,0,26.55,0,2,0


In [49]:
train_data = train_data.reset_index()

In [52]:
train_data = train_data.drop('index',axis=1)

In [53]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,1,1,1,38.0,1,0,71.2833,1,3,0
1,1,1,1,35.0,1,0,53.1,0,3,0
2,0,1,0,54.0,0,0,51.8625,0,1,3
3,1,3,1,4.0,1,1,16.7,0,2,6
4,1,1,1,58.0,0,0,26.55,0,2,0


In [54]:
train_data.shape

(891, 10)

In [55]:
train_data.to_csv('Cabin_pred_trainData.csv')

## for titanic test data

In [56]:
test = pd.read_csv('test.csv')

In [57]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [60]:
test.Age = test.Age.fillna(test.Age.median())

In [62]:
test.Fare = test.Fare.fillna(test.Fare.median())

In [63]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

## Here we 1st predicting the Cabin Value

In [64]:
df_test = test.copy()

In [65]:
# x_train data 
cabin_notnull_index = df_test[df_test['Cabin'].notnull()].index.tolist()

In [66]:
# X_train data with all the notnull value index
df_cabin_notnull = df_test.iloc[cabin_notnull_index]
X_train = df_cabin_notnull.copy()

In [67]:
y_train = X_train.Cabin

In [68]:
y_train = y_train.astype(str).str[0]
y_train.value_counts()

C    35
B    18
D    13
E     9
F     8
A     7
G     1
Name: Cabin, dtype: int64

In [69]:
cabin_map = {'C':0,"B":1,"D":2,"E":3,'A':4,'F':5,"G":6,"T":7}
y_train = y_train.map(cabin_map)

In [70]:
X_train = X_train.drop('Cabin',axis=1)

In [71]:
cabin_null_index = df_test[df_test['Cabin'].isnull()].index.tolist()
# X_test data with all the null value index
df_cabin_null = df_test.iloc[cabin_null_index]

In [72]:
X_test = df_cabin_null.copy()

In [73]:
X_test = X_test.drop('Cabin',axis=1)

In [74]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
12,904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23.0,1,0,21228,82.2667,S
14,906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance...",female,47.0,1,0,W.E.P. 5734,61.175,S
24,916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48.0,1,3,PC 17608,262.375,C
26,918,1,"Ostby, Miss. Helene Ragnhild",female,22.0,0,1,113509,61.9792,C
28,920,1,"Brady, Mr. John Bertram",male,41.0,0,0,113054,30.5,S


In [75]:
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


In [78]:
y_train[:5]

12    1
14    3
24    1
26    1
28    4
Name: Cabin, dtype: int64

In [79]:
X_train.Sex = X_train.Sex.map({'male':0,'female':1})

X_test.Sex = X_test.Sex.map({'male':0,'female':1})

In [80]:
X_train['Title'] = X_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

X_test['Title'] = X_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

In [81]:
X_train['Title'] = X_train['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

X_train['Title'] = X_train['Title'].replace('Mlle', 'Miss')
X_train['Title'] = X_train['Title'].replace('Ms', 'Miss')
X_train['Title'] = X_train['Title'].replace('Mme', 'Mrs')

In [82]:
X_test['Title'] = X_test['Title'].replace(['Lady', 'Countess','Capt', 'Col',
                                             'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

X_test['Title'] = X_test['Title'].replace('Mlle', 'Miss')
X_test['Title'] = X_test['Title'].replace('Ms', 'Miss')
X_test['Title'] = X_test['Title'].replace('Mme', 'Mrs')

In [83]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

X_train.Title = X_train.Title.map(title_mapping)

X_test.Title = X_test.Title.map(title_mapping)

In [84]:
X_train = X_train.drop('Name',axis=1)

X_test = X_test.drop('Name',axis=1)

In [85]:
Embarked_map = {"S":0,'C':1,"Q":2}
X_train.Embarked = X_train.Embarked.map(Embarked_map)

X_test.Embarked = X_test.Embarked.map(Embarked_map)

In [86]:
X_train = X_train.drop('Ticket',axis=1)

X_test = X_test.drop('Ticket',axis=1)

X_train = X_train.drop('PassengerId',axis=1)

X_test = X_test.drop('PassengerId',axis=1)

In [87]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
12,1,1,23.0,1,0,82.2667,0,3
14,1,1,47.0,1,0,61.175,0,3
24,1,1,48.0,1,3,262.375,1,3
26,1,1,22.0,0,1,61.9792,1,2
28,1,0,41.0,0,0,30.5,0,1


In [88]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,0,34.5,0,0,7.8292,2,1
1,3,1,47.0,1,0,7.0,0,3
2,2,0,62.0,0,0,9.6875,2,1
3,3,0,27.0,0,0,8.6625,0,1
4,3,1,22.0,1,1,12.2875,0,3


## Predicting the Cabin value

In [89]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)
print(classifier.score(X_train,y_train))
y_predicted = classifier.predict(X_test)

1.0


In [90]:
X_test['Cabin'] = y_predicted

In [91]:
X_train['Cabin'] = y_train

In [100]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
12,1,1,23.0,1,0,82.2667,0,3,1
14,1,1,47.0,1,0,61.175,0,3,3
24,1,1,48.0,1,3,262.375,1,3,1
26,1,1,22.0,0,1,61.9792,1,2,1
28,1,0,41.0,0,0,30.5,0,1,4


In [92]:
test_data = pd.concat([X_train,X_test])

In [93]:
test_data = test_data.reset_index()

In [94]:
test_data.head()

Unnamed: 0,index,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,12,1,1,23.0,1,0,82.2667,0,3,1
1,14,1,1,47.0,1,0,61.175,0,3,3
2,24,1,1,48.0,1,3,262.375,1,3,1
3,26,1,1,22.0,0,1,61.9792,1,2,1
4,28,1,0,41.0,0,0,30.5,0,1,4


In [95]:
test_data.shape

(418, 10)

In [97]:
test_data = test_data.drop('index',axis=1)

In [98]:
train_data.to_csv('Cabin_pred_testData.csv')

# ------------------------------------------------------------------------

## Now we can predict the Survived Value 

In [101]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,1,1,1,38.0,1,0,71.2833,1,3,0
1,1,1,1,35.0,1,0,53.1,0,3,0
2,0,1,0,54.0,0,0,51.8625,0,1,3
3,1,3,1,4.0,1,1,16.7,0,2,6
4,1,1,1,58.0,0,0,26.55,0,2,0


In [102]:
test_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,Cabin
0,1,1,23.0,1,0,82.2667,0,3,1
1,1,1,47.0,1,0,61.175,0,3,3
2,1,1,48.0,1,3,262.375,1,3,1
3,1,1,22.0,0,1,61.9792,1,2,1
4,1,0,41.0,0,0,30.5,0,1,4


In [104]:
X_train = train_data.iloc[:,1:]
y_train = train_data['Survived']
X_test = test_data

## traning the models

In [111]:
from xgboost import XGBClassifier

xgb = XGBClassifier(seed=0,reg_lambda=0,reg_alpha=1,n_estimators=180,min_child_weight=1,
                   max_depth=3,gamma=9,colsample_bytree=0.5)

xgb.fit(X_train,y_train)
print(xgb.score(X_train,y_train))

0.8742985409652076


In [116]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=10, max_features=5, n_estimators=10)
rf.fit(X_train,y_train)
print(rf.score(X_train,y_train))

0.9506172839506173


In [112]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(criterion='entropy', max_depth=20, min_samples_leaf=20)
dt.fit(X_train,y_train)
print(dt.score(X_train,y_train))

0.8552188552188552


In [113]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(metric='manhattan', n_neighbors=13)
knn.fit(X_train,y_train)
print(knn.score(X_train,y_train))

0.8069584736251403


In [114]:
from sklearn.svm import SVC

svm = SVC(C=50, gamma=0.01)

svm.fit(X_train,y_train)
print(svm.score(X_train,y_train))

0.9281705948372615


In [115]:
from sklearn.linear_model import LogisticRegression

lr =LogisticRegression(C=4.281332398719396, penalty='l1', solver='saga')

lr.fit(X_train,y_train)
print(lr.score(X_train,y_train))

0.6992143658810326


### We select the model RandomForest

In [117]:
classifier = RandomForestClassifier(max_depth=10, max_features=5, n_estimators=10)
classifier.fit(X_train,y_train)
print(classifier.score(X_train,y_train))
y_predicted = classifier.predict(X_test)

0.957351290684624


In [118]:
import pickle
pickle.dump(classifier,open('Last_RandomForest_Classifier.pkl','wb'))

In [119]:
pred = pd.DataFrame(y_predicted)
sub_df = pd.read_csv('gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)
datasets.columns=['PassengerId','Survived']
datasets.to_csv('Gender_submission_last_RandomForest.csv',index=False)

In [120]:
from sklearn.svm import SVC

svm = SVC(C=50, gamma=0.01)

svm.fit(X_train,y_train)
print(svm.score(X_train,y_train))
y_predicted = svm.predict(X_test)

0.9281705948372615


In [121]:
pred = pd.DataFrame(y_predicted)
sub_df = pd.read_csv('gender_submission.csv')
datasets = pd.concat([sub_df['PassengerId'],pred],axis=1)
datasets.columns=['PassengerId','Survived']
datasets.to_csv('Gender_submission_last_SVM.csv',index=False)