In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from sklearn import preprocessing as pp

In [185]:
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [186]:
from sklearn.svm import SVC

In [187]:
from sklearn import tree

In [188]:
import pydotplus
from IPython.display import Image

In [206]:
def get_final_output(clf, X, y, X_test, out_file):
    """Fit ``clf`` on the training data and export its predictions as CSV.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place by this call.
    X, y
        Training features and labels, passed straight to ``clf.fit``.
    X_test : pandas.DataFrame
        Rows to predict. Its index supplies the ``PassengerId`` column.
    out_file : str or None
        Destination CSV path. When falsy, nothing is written.

    Returns
    -------
    None
    """
    clf.fit(X, y)
    predictions = clf.predict(X_test)

    # Two-column table: the identifier taken from the test index, then the
    # predicted label for that row.
    submission = pd.DataFrame(
        {"PassengerId": X_test.index, "Survived": predictions},
        columns=["PassengerId", "Survived"],
    )

    if not out_file:
        return
    submission.to_csv(out_file, encoding='utf-8', index=False)

In [227]:
def report_train_test_accuracy(clf, X, y, test_size, random_state):
    """Return the accuracy of ``clf`` on a single hold-out split.

    ``X`` and ``y`` are divided with ``train_test_split`` using the given
    ``test_size`` and ``random_state``. ``clf`` is fitted (in place) on the
    training portion, and the result of ``metrics.accuracy_score`` on the
    held-out portion is returned.
    """
    split = train_test_split(X, y, test_size=test_size, random_state=random_state)
    fit_features, holdout_features, fit_labels, holdout_labels = split

    clf.fit(fit_features, fit_labels)
    holdout_predictions = clf.predict(holdout_features)
    return metrics.accuracy_score(holdout_labels, holdout_predictions)

### Load and strip the input data

In [236]:
# Read both CSV files, using the first column of each as the row index.
train, test = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ("train.csv", "test.csv")
)
print(train.shape, test.shape)

(891, 11) (418, 10)


In [237]:
# Preview the first rows of the training frame as loaded, before any columns are dropped or encoded.
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [238]:
# Columns removed from both train and test before modelling.
cols_to_drop = ["Name", "Ticket", "Cabin"]

In [239]:
# Remove the unused columns from both frames, rebinding each name to the trimmed result.
train = train.drop(cols_to_drop, axis=1)
test = test.drop(cols_to_drop, axis=1)

In [240]:
# Encode Sex as an integer code in both frames; values absent from the
# mapping come back as NaN from Series.map.
gender_mapping = dict(female=0, male=1)
for frame in (train, test):
    frame['Sex'] = frame['Sex'].map(gender_mapping)

In [241]:
# Encode the Embarked letter as an integer code (S -> 0, C -> 1, Q -> 2) in
# both frames; missing or unmapped values come back as NaN.
embarked_mapping = {port: code for code, port in enumerate(['S', 'C', 'Q'])}
train = train.assign(Embarked=train['Embarked'].map(embarked_mapping))
test = test.assign(Embarked=test['Embarked'].map(embarked_mapping))

In [242]:
# Summary statistics of the encoded test frame; a `count` below the row total marks a column with missing values.
test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,332.0,418.0,418.0,417.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,0.464115
std,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0,7.8958,0.0
50%,3.0,1.0,27.0,0.0,0.0,14.4542,0.0
75%,3.0,1.0,39.0,1.0,0.0,31.5,1.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


### Dealing with missing values

In [243]:
# Replace missing Age and Fare values in the test frame with that column's
# own mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on `test.Age` / `test.Fare`: the in-place form mutates
# an intermediate Series and is not guaranteed to write through to `test`
# (it is a no-op under pandas copy-on-write).
test["Age"] = test["Age"].fillna(test["Age"].mean())
test["Fare"] = test["Fare"].fillna(test["Fare"].mean())

In [244]:
# Re-run the summary after imputation to confirm every column now reports the full row count.
test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,0.464115
std,0.841838,0.481622,12.634534,0.89676,0.981429,55.8405,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,23.0,0.0,0.0,7.8958,0.0
50%,3.0,1.0,30.27259,0.0,0.0,14.4542,0.0
75%,3.0,1.0,35.75,1.0,0.0,31.5,1.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [245]:
# Discard, in place, every row that still contains a missing value, then
# report the resulting shapes of both frames.
for frame in (train, test):
    frame.dropna(inplace=True)
print(train.shape, test.shape)

(712, 8) (418, 7)


In [246]:
# Detach the label column from the training frame; what is left of `train`
# is the feature matrix. `X` and `X_final` are aliases, not copies.
y = train.pop("Survived")
X, X_final = train, test

In [247]:
# k-nearest-neighbours classifier using 20 neighbours.
# NOTE(review): no feature scaling is applied anywhere above, so distances mix columns on very different ranges — confirm this is intended.
knn = KNeighborsClassifier(n_neighbors=20)

In [248]:
# Fit k-NN on the full training set and write its test-set predictions to CSV.
# The helper defined in this notebook is `get_final_output`; the previous
# name `get_out_put` is not defined anywhere and raises NameError on a
# fresh kernel.
get_final_output(knn, X, y, X_final, "titanic_results_04.csv")

In [249]:
# Logistic regression with the library's default hyperparameters.
logreg = LogisticRegression()

In [250]:
# Fit logistic regression on the full training set and write its test-set
# predictions to CSV.
# The helper defined in this notebook is `get_final_output`; the previous
# name `get_out_put` is not defined anywhere and raises NameError on a
# fresh kernel.
get_final_output(logreg, X, y, X_final, "titanic_results_05.csv")

In [254]:
# Hold-out accuracy of k-NN: 40% of the rows are held out, split seed 9.
print(report_train_test_accuracy(knn, X, y, test_size=0.4, random_state=9))

0.719298245614


In [255]:
# Hold-out accuracy of logistic regression on the same 40% / seed-9 split.
print(report_train_test_accuracy(logreg, X, y, test_size=0.4, random_state=9))

0.79298245614


In [257]:
from sklearn.cross_validation import cross_val_score

In [258]:

# Mean accuracy of logistic regression across 10 cross-validation folds.
logreg_cv_scores = cross_val_score(logreg, X, y, cv=10, scoring='accuracy')
print(logreg_cv_scores.mean())

0.788054437738


In [263]:
# Rebind `knn` to a 70-neighbour model (replacing the earlier 20-neighbour
# one) and report its mean accuracy across 10 cross-validation folds.
knn = KNeighborsClassifier(n_neighbors=70)
knn_cv_scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
print(knn_cv_scores.mean())

0.692483791639


In [264]:
# Decision tree with default hyperparameters.
# random_state is pinned so the fitted tree, and therefore the scores printed
# in the following cells, are reproducible across kernel restarts; 9 matches
# the seed passed to the hold-out splits elsewhere in this notebook.
tree1 = tree.DecisionTreeClassifier(random_state=9)

In [265]:
# Mean accuracy of the decision tree across 10 cross-validation folds.
tree_cv_scores = cross_val_score(tree1, X, y, cv=10, scoring='accuracy')
print(tree_cv_scores.mean())

0.775492957746


In [266]:
# Hold-out accuracy of the decision tree on the same 40% / seed-9 split.
print(report_train_test_accuracy(tree1, X, y, test_size=0.4, random_state=9))

0.712280701754


In [267]:
# Fit the decision tree on the full training set and write its test-set
# predictions to CSV.
# The helper defined in this notebook is `get_final_output`; the previous
# name `get_out_put` is not defined anywhere and raises NameError on a
# fresh kernel.
get_final_output(tree1, X, y, X_final, "titanic_results_dTree.csv")