In [73]:
import numpy as np
import pandas as pd
import os

In [74]:
from sklearn import preprocessing as pp

In [75]:
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [76]:
from sklearn.svm import SVC

In [77]:
from sklearn import tree

In [78]:
from sklearn.cross_validation import cross_val_score

In [79]:
#import pydotplus
#from IPython.display import Image

In [80]:
def get_final_output(clf, X, y, X_test, out_file):
    clf.fit(X, y)
    y_ = clf.predict(X_test)
    y_results = pd.DataFrame()
    
    y_results["PassengerId"] = X_test.index
    y_results["Survived"] = y_
    
    if(out_file):
        y_results.to_csv(out_file, encoding='utf-8', index=False)

In [81]:
def report_train_accuracy(clf, X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    
    #Confusion Matrix
    y_test = y
    confusion_mat = metrics.confusion_matrix(y_test, y_pred)
    
    TN = confusion_mat[0, 0]
    TP = confusion_mat[1, 1]
    FP = confusion_mat[0, 1]
    FN = confusion_mat[1, 0]
    
    
    print("Confusion Matrix : ")
    print(confusion_mat)
 
    print("Sensitivity :")
    print(metrics.recall_score(y_test, y_pred))
    
    print("Precision : ")
    print(TP/float(TP+FP))
    
    print("Specificity : ")
    print(TN/float(TN+FP))
    
    print("False Positive Rate : ")
    print(FP/float(TN+FP))    
    return metrics.accuracy_score(y, y_pred)

In [82]:
def report_train_test_accuracy(clf, X, y, test_size, random_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state = random_state)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    #Confusion Matrix
    confusion_mat = metrics.confusion_matrix(y_test, y_pred)
    
    TN = confusion_mat[0, 0]
    TP = confusion_mat[1, 1]
    FP = confusion_mat[0, 1]
    FN = confusion_mat[1, 0]
    
    
    print("Confusion Matrix : ")
    print(confusion_mat)
 
    print("Sensitivity :")
    print(metrics.recall_score(y_test, y_pred))
    
    print("Precision : ")
    print(TP/float(TP+FP))
    
    print("Specificity : ")
    print(TN/float(TN+FP))
    
    print("False Positive Rate : ")
    print(FP/float(TN+FP))    

    return metrics.accuracy_score(y_test, y_pred)

### Strip input data

In [83]:
train = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)
print(train.shape, test.shape)

(891, 11) (418, 10)


In [84]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
cols_to_drop = ["Name", "Ticket", "Cabin"]

In [86]:
train.drop(cols_to_drop, axis= 1, inplace=True)
test.drop(cols_to_drop, axis= 1, inplace=True)

In [87]:
gender_mapping = {'female':0, 'male':1}
train['Sex'] = train['Sex'].map(gender_mapping)
test['Sex'] = test['Sex'].map(gender_mapping)

In [88]:
embarked_mapping = {'S':0, 'C':1, 'Q': 2}
train['Embarked'] = train['Embarked'].map(embarked_mapping)
test['Embarked'] = test['Embarked'].map(embarked_mapping)

In [89]:
test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,332.0,418.0,418.0,417.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,0.464115
std,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,21.0,0.0,0.0,7.8958,0.0
50%,3.0,1.0,27.0,0.0,0.0,14.4542,0.0
75%,3.0,1.0,39.0,1.0,0.0,31.5,1.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


###   Dealing with missing values 

In [90]:
test.Age.fillna(inplace=True, value=test.Age.mean())
test.Fare.fillna(inplace=True, value=test.Fare.mean())

In [91]:
test.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.636364,30.27259,0.447368,0.392344,35.627188,0.464115
std,0.841838,0.481622,12.634534,0.89676,0.981429,55.8405,0.685516
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,23.0,0.0,0.0,7.8958,0.0
50%,3.0,1.0,30.27259,0.0,0.0,14.4542,0.0
75%,3.0,1.0,35.75,1.0,0.0,31.5,1.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [92]:
train.dropna(inplace=True)
test.dropna(inplace=True)
print(train.shape, test.shape)

(712, 8) (418, 7)


In [93]:
y = train.Survived
train.drop("Survived", axis= 1, inplace=True)
X = train
X_final = test

## Logistic regression evaluation

In [94]:
logreg = LogisticRegression()

In [95]:
print(report_train_test_accuracy(logreg, X, y, 0.4, 9))

Confusion Matrix : 
[[153  25]
 [ 34  73]]
Sensitivity :
0.682242990654
Precision : 
0.744897959184
Specificity : 
0.859550561798
False Positive Rate : 
0.140449438202
0.79298245614


In [96]:
logreg = LogisticRegression()
print(report_train_accuracy(logreg, X.append(X), y.append(y)))

Confusion Matrix : 
[[730 118]
 [170 406]]
Sensitivity :
0.704861111111
Precision : 
0.774809160305
Specificity : 
0.860849056604
False Positive Rate : 
0.139150943396
0.797752808989


In [72]:
logreg = LogisticRegression()
print(report_train_test_accuracy(logreg, X.append(X), y.append(y), 0.4, 9))

Confusion Matrix : 
[[287  51]
 [ 77 155]]
Sensitivity :
0.668103448276
Precision : 
0.752427184466
Specificity : 
0.849112426036
False Positive Rate : 
0.150887573964
0.775438596491


## KNN evaluation

In [85]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')

In [86]:
print(report_train_accuracy(knn, X, y))

0.98595505618


In [88]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance')
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

0.685620389001


In [22]:
get_out_put(knn, X, y, X_final, "titanic_results_04.csv")

NameError: name 'get_out_put' is not defined

In [23]:
logreg = LogisticRegression()

In [24]:
get_out_put(logreg, X, y, X_final, "titanic_results_05.csv")

NameError: name 'get_out_put' is not defined

In [25]:
print(report_train_test_accuracy(knn, X, y, 0.4, 9))

0.719298245614


In [26]:
print(report_train_test_accuracy(logreg, X, y, 0.4, 9))

0.79298245614


0.788054437738


In [38]:
logreg = LogisticRegression()
print(report_train_accuracy(logreg, X, y))

0.794943820225


In [29]:
knn = KNeighborsClassifier(n_neighbors=70)
print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

0.692483791639


In [47]:
knn = KNeighborsClassifier(n_neighbors=4)
print(report_train_accuracy(knn, X, y))

0.787921348315


In [30]:
tree1 = tree.DecisionTreeClassifier()

In [31]:
print(cross_val_score(tree1, X, y, cv=10, scoring='accuracy').mean())

0.764284037559


In [56]:
X[X.Fare == 7.9250]

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,0,26.0,0,0,7.925,0.0
69,3,0,17.0,4,2,7.925,0.0
105,3,1,37.0,2,0,7.925,0.0
116,3,1,21.0,0,0,7.925,0.0
174,3,1,21.0,0,0,7.925,0.0
217,3,0,27.0,0,0,7.925,0.0
383,3,1,32.0,0,0,7.925,0.0
393,3,1,28.0,2,0,7.925,0.0
401,3,1,39.0,0,0,7.925,0.0
415,3,1,44.0,0,0,7.925,0.0


In [55]:
tree1 = tree.DecisionTreeClassifier()
print(report_train_accuracy(tree1, X, y))
print(X[X.duplicated() == True])
print(y[X.duplicated() == True])
len(X[X.duplicated() == True])

print(X.columns)

0.98595505618
             Pclass  Sex    Age  SibSp  Parch     Fare  Embarked
PassengerId                                                     
134               2    0  29.00      1      0  26.0000       0.0
174               3    1  21.00      0      0   7.9250       0.0
214               2    1  30.00      0      0  13.0000       0.0
239               2    1  19.00      0      0  10.5000       0.0
289               2    1  42.00      0      0  13.0000       0.0
314               3    1  28.00      0      0   7.8958       0.0
321               3    1  22.00      0      0   7.2500       0.0
344               2    1  25.00      0      0  13.0000       0.0
346               2    0  24.00      0      0  13.0000       0.0
356               3    1  28.00      0      0   9.5000       0.0
373               3    1  19.00      0      0   8.0500       0.0
388               2    0  36.00      0      0  13.0000       0.0
419               2    1  30.00      0      0  13.0000       0.0
443        

In [32]:
print(report_train_test_accuracy(tree1, X, y, 0.4, 9))

0.712280701754


In [33]:
get_out_put(tree1, X, y, X_final, "titanic_results_dTree.csv")

NameError: name 'get_out_put' is not defined