In [90]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import os

In [91]:
genderdf = pd.read_csv("D://titanic//gender_submission.csv")
test = pd.read_csv("D://titanic//test.csv")
train = pd.read_csv("D://titanic//train.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [92]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [93]:
#By looking at the data we dont need their passengerId,name, ticket number or their fare so we remove those columns
train = train.drop(['Name','Ticket','Fare'],axis = 'columns')

In [94]:
train.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,1,0,3,male,22.0,1,0,,S
1,2,1,1,female,38.0,1,0,C85,C
2,3,1,3,female,26.0,0,0,,S
3,4,1,1,female,35.0,1,0,C123,S
4,5,0,3,male,35.0,0,0,,S
5,6,0,3,male,,0,0,,Q
6,7,0,1,male,54.0,0,0,E46,S
7,8,0,3,male,2.0,3,1,,S
8,9,1,3,female,27.0,0,2,,S
9,10,1,2,female,14.0,1,0,,C


In [95]:
#Now we check for null values
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Cabin          687
Embarked         2
dtype: int64

In [96]:
train['Age'].fillna(int(train['Age'].mean()), inplace=True)

In [97]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age              0
SibSp            0
Parch            0
Cabin          687
Embarked         2
dtype: int64

In [98]:
#Cabin has a lot of null values and its not important either so we drop it too
train = train.drop('Cabin',axis="columns")

In [99]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,1,0,3,male,22.0,1,0,S
1,2,1,1,female,38.0,1,0,C
2,3,1,3,female,26.0,0,0,S
3,4,1,1,female,35.0,1,0,S
4,5,0,3,male,35.0,0,0,S
5,6,0,3,male,29.0,0,0,Q
6,7,0,1,male,54.0,0,0,S
7,8,0,3,male,2.0,3,1,S
8,9,1,3,female,27.0,0,2,S
9,10,1,2,female,14.0,1,0,C


In [100]:
#Since some of the data of age is in decimal format we are going round it up
train['Age'] = train['Age'].round()

In [101]:
train['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
5      29.0
6      54.0
7       2.0
8      27.0
9      14.0
10      4.0
11     58.0
12     20.0
13     39.0
14     14.0
15     55.0
16      2.0
17     29.0
18     31.0
19     29.0
20     35.0
21     34.0
22     15.0
23     28.0
24      8.0
25     38.0
26     29.0
27     19.0
28     29.0
29     29.0
30     40.0
31     29.0
32     29.0
33     66.0
34     28.0
35     42.0
36     29.0
37     21.0
38     18.0
39     14.0
40     40.0
41     27.0
42     29.0
43      3.0
44     19.0
45     29.0
46     29.0
47     29.0
48     29.0
49     18.0
50      7.0
51     21.0
52     49.0
53     29.0
54     65.0
55     29.0
56     21.0
57     28.0
58      5.0
59     11.0
60     22.0
61     38.0
62     45.0
63      4.0
64     29.0
65     29.0
66     29.0
67     19.0
68     17.0
69     26.0
70     32.0
71     16.0
72     21.0
73     26.0
74     32.0
75     25.0
76     29.0
77     29.0
78      1.0
79     30.0
80     22.0
81     29.0
82     29.0
83  

In [102]:
#Encoding male/female to 0 and 1 for training purposes
train['Sex_encoded'] = train['Sex'].map({'male': 0, 'female': 1})


In [103]:
train = train.drop('Sex',axis="columns")

In [104]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Embarked,Sex_encoded
0,1,0,3,22.0,1,0,S,0
1,2,1,1,38.0,1,0,C,1
2,3,1,3,26.0,0,0,S,1
3,4,1,1,35.0,1,0,S,1
4,5,0,3,35.0,0,0,S,0
5,6,0,3,29.0,0,0,Q,0
6,7,0,1,54.0,0,0,S,0
7,8,0,3,2.0,3,1,S,0
8,9,1,3,27.0,0,2,S,1
9,10,1,2,14.0,1,0,C,1


In [105]:
label_encoder = LabelEncoder()
train['Embarked_encoded'] = label_encoder.fit_transform(train['Embarked'])
train = train.drop('Embarked',axis="columns")

In [106]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Sex_encoded,Embarked_encoded
0,1,0,3,22.0,1,0,0,2
1,2,1,1,38.0,1,0,1,0
2,3,1,3,26.0,0,0,1,2
3,4,1,1,35.0,1,0,1,2
4,5,0,3,35.0,0,0,0,2
5,6,0,3,29.0,0,0,0,1
6,7,0,1,54.0,0,0,0,2
7,8,0,3,2.0,3,1,0,2
8,9,1,3,27.0,0,2,1,2
9,10,1,2,14.0,1,0,1,0


Now doing the same for test

In [107]:
test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [108]:
test = test.drop(['Name','Ticket','Fare'],axis = 'columns')
test['Age'].fillna(int(test['Age'].mean()), inplace=True)
test = test.drop('Cabin',axis="columns")
test['Sex_encoded'] = test['Sex'].map({'male': 0, 'female': 1})
test = test.drop('Sex',axis="columns")
test['Embarked_encoded'] = label_encoder.fit_transform(test['Embarked'])
test = test.drop('Embarked',axis="columns")

In [109]:
test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_encoded,Embarked_encoded
0,892,3,34.5,0,0,0,1
1,893,3,47.0,1,0,1,2
2,894,2,62.0,0,0,0,1
3,895,3,27.0,0,0,0,2
4,896,3,22.0,1,1,1,2
5,897,3,14.0,0,0,0,2
6,898,3,30.0,0,0,1,1
7,899,2,26.0,1,1,0,2
8,900,3,18.0,0,0,1,0
9,901,3,21.0,2,0,0,2


In [110]:
test['Age'] = test['Age'].round()

In [111]:
test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Sex_encoded,Embarked_encoded
0,892,3,34.0,0,0,0,1
1,893,3,47.0,1,0,1,2
2,894,2,62.0,0,0,0,1
3,895,3,27.0,0,0,0,2
4,896,3,22.0,1,1,1,2
5,897,3,14.0,0,0,0,2
6,898,3,30.0,0,0,1,1
7,899,2,26.0,1,1,0,2
8,900,3,18.0,0,0,1,0
9,901,3,21.0,2,0,0,2


In [112]:
train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Sex_encoded,Embarked_encoded
0,1,0,3,22.0,1,0,0,2
1,2,1,1,38.0,1,0,1,0
2,3,1,3,26.0,0,0,1,2
3,4,1,1,35.0,1,0,1,2
4,5,0,3,35.0,0,0,0,2
5,6,0,3,29.0,0,0,0,1
6,7,0,1,54.0,0,0,0,2
7,8,0,3,2.0,3,1,0,2
8,9,1,3,27.0,0,2,1,2
9,10,1,2,14.0,1,0,1,0


In [113]:
y = train["Survived"]

X = pd.get_dummies(train.iloc[:, 2:])
X_test = pd.get_dummies(test.iloc[:,1:])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
