In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from pathlib import Path

In [2]:
# LOADING TITANIC DATASETS
# Both CSV files are read with pandas' default parsing options; the relative
# names are resolved against the current working directory.
titanicTrain, titanicTest = (pd.read_csv(csvName) for csvName in ('train.csv', 'test.csv'))

# Preview the test set as the cell output
titanicTest

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [3]:
# FILLING NAN VALUES WITH THE MEAN FOR AGE - TRAINING AND TESTING DATA
# Only the 'Age' column is imputed. A frame-wide fillna(meanAge) would also write the
# mean age into every other column that holds NaNs (the test preview shows 'Cabin'
# empty for most rows), silently corrupting those columns. Columns other than 'Age'
# therefore keep their original NaNs in the *NoNaN frames.
# The mean is learned from the training set and reused for the test set, so both sets
# go through exactly the same transformation.
meanAge = titanicTrain['Age'].mean()                       # MEAN AGE OF THE TRAINING PASSENGERS

# assign() returns a new frame, so the raw titanicTrain / titanicTest stay untouched
# and this cell can be re-run safely.
titanicTrainNoNaN = titanicTrain.assign(Age = titanicTrain['Age'].fillna(meanAge))
titanicTestNoNaN = titanicTest.assign(Age = titanicTest['Age'].fillna(meanAge))

# Fit the label encoder once on the training labels and reuse it on the test labels,
# so a given string always maps to the same integer: MAKING females:0 and males:1.
# transform() raises if the test set contains a label that was not seen in training.
sexEncoder = LabelEncoder().fit(titanicTrainNoNaN['Sex'])
titanicTrainNoNaN['Sex'] = sexEncoder.transform(titanicTrainNoNaN['Sex'])
titanicTestNoNaN['Sex'] = sexEncoder.transform(titanicTestNoNaN['Sex'])

In [4]:
# MODEL INPUTS: the same predictor columns are taken from both pre-processed sets;
# the target column 'Survived' is read from the training set only.
featureCols = ['Pclass', 'Sex', 'Age']
xTrain, xTest = titanicTrainNoNaN[featureCols], titanicTestNoNaN[featureCols]
yTrain = titanicTrainNoNaN['Survived']

In [5]:
# LOGISTIC REGRESSION
# CREATE THE ESTIMATOR WITH ITS DEFAULT SETTINGS, THEN FIT IT ON THE TRAINING DATA
logisticModel = LogisticRegression()
logisticModel.fit(xTrain, yTrain)

# PREDICTED SURVIVAL LABELS FOR THE TEST PASSENGERS
predLog = logisticModel.predict(xTest)

In [6]:
# SUBMISSION TABLE: 'PassengerId' followed by the logistic regression predictions
# in a 'Survived' column (appended as the second column of the one-column selection).
toSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = predLog)
toSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [7]:
# SAVE THE LOGISTIC REGRESSION SUBMISSION AS CSV (row index is not written)
filepathLog = Path('submitLog/toSubmit.csv')
logDir = filepathLog.parent
logDir.mkdir(exist_ok = True, parents = True)   # create the target folder when it is missing
toSubmit.to_csv(filepathLog, index = False)
toSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [8]:
# KNN MODEL
# BUILD A 5-NEAREST-NEIGHBOURS CLASSIFIER, FIT IT, AND PREDICT THE TEST SET
knnClassf = KNeighborsClassifier(n_neighbors = 5)
knnClassf.fit(xTrain, yTrain)
knnPred = knnClassf.predict(xTest)

# SUBMISSION TABLE: 'PassengerId' followed by the KNN predictions as 'Survived'
knnSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = knnPred)
knnSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [9]:
# SAVE THE KNN SUBMISSION AS CSV (row index is not written)
filepathKNN = Path('submitKNN/knnSubmit.csv')
knnDir = filepathKNN.parent
knnDir.mkdir(exist_ok = True, parents = True)   # create the target folder when it is missing
knnSubmit.to_csv(filepathKNN, index = False)
knnSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [10]:
# RANDOM FOREST MODEL - Score: 0.77751 (recorded from a run without a fixed seed)
# max_features: the requested 5 is capped at the number of columns in xTrain, because
# a split cannot consider more features than exist.
# NOTE(review): older scikit-learn releases are believed to reject an integer above
# the feature count - confirm against the installed version.
# random_state: fixes the forest's random sampling so re-running the cell reproduces
# the same predictions and the same submission file.
RFclassf = RandomForestClassifier(
    n_estimators = 100,
    max_depth = 3,
    max_features = min(5, xTrain.shape[1]),
    random_state = 20,
).fit(xTrain, yTrain)

rfPred = RFclassf.predict(xTest)                 # PREDICTED SURVIVAL LABELS FOR THE TEST SET

# SUBMISSION TABLE: 'PassengerId' followed by the predictions as 'Survived'
rfSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = rfPred)

# SAVE THE SUBMISSION AS CSV (row index is not written)
filepathRF = Path('rfSubmit/rfSubmit.csv')
filepathRF.parent.mkdir(parents = True, exist_ok = True)   # create the folder when it is missing
rfSubmit.to_csv(filepathRF, index = False)
rfSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [11]:
# DECISION TREE MODEL
# A seeded tree limited to 5 leaf nodes, with no explicit depth limit
dtClassf = DecisionTreeClassifier(max_leaf_nodes = 5, max_depth = None, random_state = 20)
dtClassf.fit(xTrain, yTrain)
dtPred = dtClassf.predict(xTest)

# SUBMISSION TABLE: 'PassengerId' followed by the tree predictions as 'Survived'
dtSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = dtPred)

# SAVE THE SUBMISSION AS CSV (row index is not written)
filepathdt = Path('dtSubmit/dtSubmit.csv')
dtDir = filepathdt.parent
dtDir.mkdir(exist_ok = True, parents = True)    # create the target folder when it is missing
dtSubmit.to_csv(filepathdt, index = False)
dtSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [12]:
# RANDOM FOREST MODEL - Score: 0.77511 (recorded from a run without a fixed seed)
# max_features: the requested 5 is capped at the number of columns in xTrain, because
# a split cannot consider more features than exist.
# NOTE(review): older scikit-learn releases are believed to reject an integer above
# the feature count - confirm against the installed version.
# random_state: fixes the forest's random sampling so re-running the cell reproduces
# the same predictions and the same submission file.
RFclassf = RandomForestClassifier(
    n_estimators = 900,
    max_depth = 7,
    max_features = min(5, xTrain.shape[1]),
    n_jobs = 1,
    random_state = 20,
).fit(xTrain, yTrain)

rfPred = RFclassf.predict(xTest)                 # PREDICTED SURVIVAL LABELS FOR THE TEST SET

# SUBMISSION TABLE: 'PassengerId' followed by the predictions as 'Survived'
rfSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = rfPred)

# SAVE THE SUBMISSION AS CSV (row index is not written)
filepathRF = Path('rfSubmit/rfSubmit2.csv')
filepathRF.parent.mkdir(parents = True, exist_ok = True)   # create the folder when it is missing
rfSubmit.to_csv(filepathRF, index = False)
rfSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [13]:
# RANDOM FOREST MODEL - Score: 0.78468 - FINAL MODEL (score recorded from a run without a fixed seed)
# max_features: the requested 5 is capped at the number of columns in xTrain, because
# a split cannot consider more features than exist.
# NOTE(review): older scikit-learn releases are believed to reject an integer above
# the feature count - confirm against the installed version.
# random_state: fixes the forest's random sampling so re-running the cell reproduces
# the same predictions and the same submission file.
RFclassf = RandomForestClassifier(
    n_estimators = 500,
    max_depth = 5,
    max_features = min(5, xTrain.shape[1]),
    n_jobs = 1,
    random_state = 20,
).fit(xTrain, yTrain)

rfPred = RFclassf.predict(xTest)                 # PREDICTED SURVIVAL LABELS FOR THE TEST SET

# SUBMISSION TABLE: 'PassengerId' followed by the predictions as 'Survived'
rfSubmit = titanicTestNoNaN[['PassengerId']].assign(Survived = rfPred)

# SAVE THE SUBMISSION AS CSV (row index is not written)
filepathRF = Path('rfSubmit/rfSubmit3.csv')
filepathRF.parent.mkdir(parents = True, exist_ok = True)   # create the folder when it is missing
rfSubmit.to_csv(filepathRF, index = False)
rfSubmit

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
