# Titanic

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
print(train.columns.values)

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']


In [11]:
print(train['Survived'].unique())

[0 1]


In [18]:
print(train['Pclass'].unique())

[3 1 2]


In [21]:
print(train['Embarked'].unique())

['S' 'C' 'Q' nan]


In [25]:
print(train['Sex'].unique())

['male' 'female']


In [8]:
for data in train['Name']:
    if str(data).__contains__("Dr."):
        print(data)

Minahan, Dr. William Edward
Moraweck, Dr. Ernest
Pain, Dr. Alfred
Stahelin-Maeglin, Dr. Max
Frauenthal, Dr. Henry William
Brewe, Dr. Arthur Jackson
Leader, Dr. Alice (Farnham)


In [17]:
train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

PassengerId - id of each passenger

Survived - Has 2 values - 0 and 1:

    Survived - 1
    Not Survived - 0

Pclass - Passenger's class:

    Upper Class - 1st
    Middle Class - 2nd
    Lower Class - 3rd

Name - Passenger's name

Sex - Passenger's Gender

Age - Passenger's Age in years

SibSp - Passenger's siblings and spouses

Parch - Passenger's parents and children

Ticket - Passenger's ticket number

Fare - Passenger's ticket price

Cabin - Passenger's cabin number

Embarked - Port of entry:

    C - Cherbourg
    Q - Queenstown
    S - Southampton


# Pre-process data

In [35]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

for data in [train, test]:
    #drop useless columns
    data.drop(['Name', 'Cabin', 'Ticket'], axis="columns", inplace=True)
    
    #fill missing data
    data['Age'].fillna(value=data['Age'].mean(), inplace=True)
    data['Fare'].fillna(value=data['Fare'].median(), inplace=True)
    # .mode = most occuring value in array ([0] is highest occurance)
    data['Embarked'].fillna(value=data['Embarked'].mode()[0], inplace=True)
    
    # use labelencoder to encode sex / embarked values into numerical values
    label = LabelEncoder()
    data['Sex']=label.fit_transform(data['Sex'])
    data['Embarked']=label.fit_transform(data['Embarked'])
    
    # use MinMaxScaler to scale the data
    minmax_scaler = MinMaxScaler()
    data = minmax_scaler.fit_transform(data)

In [36]:
# Check if there are any null values present
print(train.isnull().sum())
print(test.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [37]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.000000,1,0,7.2500,2
1,2,1,1,0,38.000000,1,0,71.2833,0
2,3,1,3,0,26.000000,0,0,7.9250,2
3,4,1,1,0,35.000000,1,0,53.1000,2
4,5,0,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.000000,0,0,13.0000,2
887,888,1,1,0,19.000000,0,0,30.0000,2
888,889,0,3,0,29.699118,1,2,23.4500,2
889,890,1,1,1,26.000000,0,0,30.0000,0


In [40]:
X_train = train.iloc[:,2:]
Y_train = train.iloc[:,1:2].to_numpy().reshape((-1,1))
features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

In [42]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,22.000000,1,0,7.2500,2
1,1,0,38.000000,1,0,71.2833,0
2,3,0,26.000000,0,0,7.9250,2
3,1,0,35.000000,1,0,53.1000,2
4,3,1,35.000000,0,0,8.0500,2
...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,2
887,1,0,19.000000,0,0,30.0000,2
888,3,0,29.699118,1,2,23.4500,2
889,1,1,26.000000,0,0,30.0000,0


In [45]:
Y_train

array([[0],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
    

In [47]:
X_test = test.iloc[:,1:]
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,34.50000,0,0,7.8292,1
1,3,0,47.00000,1,0,7.0000,2
2,2,1,62.00000,0,0,9.6875,1
3,3,1,27.00000,0,0,8.6625,2
4,3,0,22.00000,1,1,12.2875,2
...,...,...,...,...,...,...,...
413,3,1,30.27259,0,0,8.0500,2
414,1,0,39.00000,0,0,108.9000,0
415,3,1,38.50000,0,0,7.2500,2
416,3,1,30.27259,0,0,8.0500,2


# Models

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# model objects
rf = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)
lr = LogisticRegression(max_iter=10000)
gb = GradientBoostingClassifier()
nb = GaussianNB()
svc = SVC()

#      

In [69]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

rf.fit(x_train, y_train.ravel())
lr.fit(x_train, y_train.ravel())
gb.fit(x_train, y_train.ravel())
nb.fit(x_train, y_train.ravel())
svc.fit(x_train, y_train.ravel())

rf_predict = rf.predict(x_test)

lr_predict = lr.predict(x_test)

gb_predict = gb.predict(x_test)

nb_predict = nb.predict(x_test)

svc_predict = svc.predict(x_test)

print("Accuracy Score")

print("Random forest {:0.2f}%".format(accuracy_score(y_test, rf_predict)*100))
print("Logistic Regression {:0.2f}%".format(accuracy_score(y_test, lr_predict)*100))
print("Gradient Boost {:0.2f}%".format(accuracy_score(y_test, gb_predict)*100))
print("Naive Bayes {:0.2f}%".format(accuracy_score(y_test, nb_predict)*100))
print("Support Vector Classifier {:0.2f}%".format(accuracy_score(y_test, svc_predict)*100))

Accuracy Score
Random forest 83.05%
Logistic Regression 81.69%
Gradient Boost 82.37%
Naive Bayes 80.34%
Support Vector Classifier 66.44%


Final Training with the best model

In [54]:
rf.fit(X_train, Y_train.ravel())

In [65]:
rf_predict = rf.predict(X_test)
rf_predict = np.array(rf_predict)

In [66]:
output = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': rf_predict})
output.to_csv('submission.csv', index=False)

In [67]:
submission = pd.read_csv('submission.csv')
submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
