# Titanic: Machine Learning from Disaster

In [1]:
import numpy as np      # Linear Algebra
import pandas as pd     # Data Processing
from sklearn.ensemble import RandomForestClassifier  # Random Forrest

In [2]:
# Load data sets
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [3]:
# Focus on Train:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
total = train.isnull().sum().sort_values(ascending=False)
percent_1 = train.isnull().sum()/train.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head

<bound method NDFrame.head of              Total     %
Cabin          687  77.1
Age            177  19.9
Embarked         2   0.2
Fare             0   0.0
Ticket           0   0.0
Parch            0   0.0
SibSp            0   0.0
Sex              0   0.0
Name             0   0.0
Pclass           0   0.0
Survived         0   0.0
PassengerId      0   0.0>

In [5]:
# Data Preprocessing
# Drop ‘PassengerId’ from the train set, because it does not contribute to a persons survival probability. 
train = train.drop(['PassengerId'], axis=1)

In [6]:
# Age null must get cleaned                        : Done
# cabin must be removed                            : Done
# Embarked null must get cleaned too               : Done

# Name object should be int or removed             : Done          
# Sex must be int                                  : Done
# Age must be int                                  : Done
# Ticket object should be removed                  : Done
# Fare float must be int                           : Done
# Cabin object must be taken care of               : Done
# Embarked object must be taken care of            : Done

In [7]:
# Cabin 687 missed data so I will drop it:
train = train.drop(['Cabin'], axis=1)
test = test.drop(['Cabin'], axis=1)

In [8]:
# check the Age column to be null-free:
Tot_Null  = train["Age"].isnull().sum()   # Number of all age data
Tot_N     = train["Age"].isnull().count() # Number of Null data
# Take care of nulls on Both train and test:
Data = [train, test]

for Dataset in Data:                       # Pointer
    mean = train["Age"].mean()             # Mean for train set
    std  = test["Age"].std()               # Standard deviation of Age test set
    isN  = Dataset["Age"].isnull().sum()
    rand_age = np.random.randint(mean-std,mean+std,size=isN) 
    copAge = Dataset["Age"].copy()
    copAge[np.isnan(copAge)] = rand_age
    Dataset["Age"] = copAge
    #train["Age"].astype(int)

# train["Age"].isnull().sum()


# Covert ages to int and then put them in age group: Group ages
for Dataset in Data:
    Dataset["Age"] = Dataset["Age"].astype(int)
    Dataset.loc[Dataset["Age"] <= 10, "Age"] = 0
    Dataset.loc[(Dataset["Age"]> 10) & (Dataset["Age"]<= 20), "Age"] = 1
    Dataset.loc[(Dataset["Age"]> 20) & (Dataset["Age"]<= 30), "Age"] = 2
    Dataset.loc[(Dataset["Age"]> 30) & (Dataset["Age"]<= 40), "Age"] = 3
    Dataset.loc[(Dataset["Age"]> 40) & (Dataset["Age"]<= 50), "Age"] = 4
    Dataset.loc[(Dataset["Age"]> 50) & (Dataset["Age"]<= 60), "Age"] = 5
    Dataset.loc[(Dataset["Age"]> 60) & (Dataset["Age"]<= 70), "Age"] = 6
    Dataset.loc[(Dataset["Age"]> 70), "Age"] = 7

# let's see how it's distributed 
train["Age"].value_counts()

2    290
3    228
1    150
4     95
0     64
5     42
6     18
7      4
Name: Age, dtype: int64

In [9]:
# Embarked object must be taken care of
# we have 2 missing data so we put most common value for them:
train["Embarked"].describe().unique()

array([889, 3, 'S', 644], dtype=object)

In [10]:
Common_value = "S"
Data = [train,test]
for Dataset in Data:
    Dataset["Embarked"] = Dataset["Embarked"].fillna(Common_value)

In [11]:
train.head(8)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,2,1,0,A/5 21171,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,3,1,0,PC 17599,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,2,0,0,STON/O2. 3101282,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,3,1,0,113803,53.1,S
4,0,3,"Allen, Mr. William Henry",male,3,0,0,373450,8.05,S
5,0,3,"Moran, Mr. James",male,3,0,0,330877,8.4583,Q
6,0,1,"McCarthy, Mr. Timothy J",male,5,0,0,17463,51.8625,S
7,0,3,"Palsson, Master. Gosta Leonard",male,0,3,1,349909,21.075,S


In [12]:
# Now we have to make object as int in Embarke: We know it has three uniques which are:
# S, C, Q
ports = {"S": 0, "C": 1, "Q": 2}
data = [train, test]

for dataset in data:
    dataset['Embarked'] = dataset['Embarked'].map(ports)

In [13]:
# Name data may be useless so I will drop it:
train = train.drop(['Name'], axis=1)
test = test.drop(['Name'], axis=1)

In [14]:
# Make Sex object as int:
genders = {"male": 0 , "female": 1}
Data = [train, test]

for Dataset in Data:
    Dataset['Sex'] = Dataset['Sex'].map(genders)

In [45]:
# Fare Float 
train["Fare"].describe().transpose()

count    891.000000
mean       1.523008
std        1.250743
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: Fare, dtype: float64

In [16]:
Data = [train, test]

for Dataset in Data:
    Dataset['Fare'] = Dataset['Fare'].fillna(0)
    Dataset['Fare'] = Dataset['Fare'].astype(int)

for Dataset in Data:
    Dataset.loc[ Dataset['Fare'] <= 7.91, 'Fare'] = 0
    Dataset.loc[(Dataset['Fare'] > 7.91) & (Dataset['Fare'] <= 14.454), 'Fare'] = 1
    Dataset.loc[(Dataset['Fare'] > 14.454) & (Dataset['Fare'] <= 31.0), 'Fare']   = 2
    Dataset.loc[(Dataset['Fare'] > 31) & (Dataset['Fare'] <= 99), 'Fare']   = 3
    Dataset.loc[(Dataset['Fare'] > 99) & (Dataset['Fare'] <= 250), 'Fare']   = 4
    Dataset.loc[ Dataset['Fare'] > 250, 'Fare'] = 5
    Dataset['Fare'] = Dataset['Fare'].astype(int)
    


In [17]:
train["Fare"].describe()

count    891.000000
mean       1.523008
std        1.250743
min        0.000000
25%        0.000000
50%        1.000000
75%        2.000000
max        5.000000
Name: Fare, dtype: float64

In [18]:
train["Fare"].astype(int)

0      0
1      3
2      0
3      3
4      1
5      1
6      3
7      2
8      1
9      2
10     2
11     2
12     1
13     2
14     0
15     2
16     2
17     1
18     2
19     0
20     2
21     1
22     1
23     3
24     2
25     2
26     0
27     5
28     0
29     0
      ..
861    1
862    2
863    3
864    1
865    1
866    1
867    3
868    1
869    1
870    0
871    3
872    0
873    1
874    2
875    0
876    1
877    0
878    0
879    3
880    2
881    0
882    1
883    1
884    0
885    2
886    1
887    2
888    2
889    2
890    0
Name: Fare, Length: 891, dtype: int64

In [19]:
train['Ticket'].describe()

count          891
unique         681
top       CA. 2343
freq             7
Name: Ticket, dtype: object

In [20]:
# We drop the ticket since it is not usefull:
train = train.drop(['Ticket'], axis=1)
test = test.drop(['Ticket'], axis=1)

# Building Machine Learning:

In [21]:
X_train = train.drop("Survived", axis=1)
Y_train = train["Survived"]
X_test  = test.drop("PassengerId", axis=1).copy()

In [22]:
#Random Forest:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_prediction = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

acc_random_forest

90.8

In [23]:
# K-Fold Cross Validation:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.75555556 0.8        0.79775281 0.85393258 0.82022472 0.87640449
 0.82022472 0.80898876 0.84269663 0.84090909]
Mean: 0.821668936556577
Standard Deviation: 0.03229045226062931


In [38]:
result = [test["PassengerId"].shape,Y_prediction]
result

[(418,),
 array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
        1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
        0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 

In [41]:
output = pd.DataFrame(columns=['PassengerId','Survived'],data=zip(test['PassengerId'].values,np.hstack(Y_prediction)))

In [43]:
output.reset_index(inplace=True,drop=True)

In [44]:
output.to_csv('my_submission.csv', index=False)