In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import svm, linear_model, datasets, neighbors
from patsy import dmatrices
import random
from sklearn.neural_network import MLPClassifier
import sklearn.ensemble as ske

In [2]:
#importing train data 

#Formatted the surnames and tickets
#surnames: using the surnames, grouped people. Used the grouping for the formula
#tickets: removed the non-albhabet-numermic characters and used the first 3characters for the formula

df = pd.read_csv("data/train.csv")
df = df.drop(['Ticket', 'Cabin', 'Name', 'Surname', 'S_Ticket'], axis = 1)
df = df.dropna()

#ToDos
#(a) have a single boolean variable for relatives on the ship
#(b) age need not be a continuous variable, we could combine them as child, male, female (child < 16, has greater chance of survival)
#(c) need to take care of the missing values (drop, replace with frequently occuring, mean)
#(d) Fare variable is already correlated with Class, Embarked variable

In [None]:
#shuffling the data
df = df.sample(frac=1).reset_index(drop=True)

#using dmatrices to get data for the ml algorithms
formula = 'Survived ~ C(Pclass) + C(Sex)+ Age + SibSp + Parch + Fare + Surname_In + Ticket_In + C(Embarked)'
y,x = dmatrices(formula, data=df, return_type='dataframe')

In [11]:
#cross validation within the train data
n_train = int(0.15*len(y))
x_train = x[n_train:]
y_train = y[n_train:]

x_test = x[:n_train]
y_test = y[:n_train]

y_train = np.asarray(y_train).ravel()
y_test = np.asarray(y_test).ravel()

In [12]:
#Logistic Regression
acc_sum = 0
logistic = linear_model.LogisticRegression(C = 1, max_iter = 200, tol = 1e-5, class_weight = {1: 1})
for i in range(5):
    acc_sum += logistic.fit(x_train, y_train).score(x_test, y_test)
accuracy_LR = acc_sum/5

In [13]:
#KNN
kneighbors = neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=60)

acc_sum = 0
logistic = linear_model.LogisticRegression(C = 1, max_iter = 200, tol = 1e-5, class_weight = {1: 1})
for i in range(5):
    acc_sum += kneighbors.fit(x_train, y_train).score(x_test, y_test)
accuracy_KNN = acc_sum/5

In [14]:
#NeuralNets
acc_sum = 0
neural_classifier = MLPClassifier(max_iter = 300)

for i in range(5):
    acc_sum += neural_classifier.fit(x_train, y_train).score(x_test, y_test)
accuracy_NN = acc_sum/5

In [15]:
#Random Forest
acc_sum = 0
for i in range(5):
    acc_sum += ske.RandomForestClassifier(n_estimators=100).fit(x_train, y_train).score(x_test, y_test)
accuracy_RF = acc_sum/5

In [16]:
#SVM
y_svm,x_svm = dmatrices(formula, data=df, return_type='matrix')
#need to experiment with different features
#'Intercept' (column 0), 'C(Pclass)' (columns 1:3), 'C(Sex)' (column 3), 'C(Embarked)' (columns 4:6)
#'Age' (column 6), 'SibSp' (column 7), 'Parch' (column 8), 'Fare' (column 9)

feature_1 = 3
feature_2 = 2

x_svm = np.asarray(x_svm)
x_svm = x_svm[:,[feature_1, feature_2]]  

y_svm = np.asarray(y_svm)
y_svm = y_svm.flatten()

#cross validation within the train data
n_train = int(0.15*len(y))
x_svm_train = x_svm[n_train:]
y_svm_train = y_svm[n_train:]

x_svm_test = x_svm[:n_train]
y_svm_test = y_svm[:n_train]

# create a list of the types of kernels we will use for your analysis

accuracy_SVM_linear = svm.SVC(kernel='linear', C = 2).fit(x_svm_train, y_svm_train).score(x_svm_test, y_svm_test)
accuracy_SVM_rbf = svm.SVC(kernel='rbf', gamma=3).fit(x_svm_train, y_svm_train).score(x_svm_test, y_svm_test)
accuracy_SVM_poly = svm.SVC(kernel='poly', gamma=2).fit(x_svm_train, y_svm_train).score(x_svm_test, y_svm_test)

In [17]:
#Print and compare the results
print("Logistic Regression: ", accuracy_LR)
print("K Nearest Neighbors: ", accuracy_KNN)
print("Neural Nets: ", accuracy_NN)
print("Random Forest: ", accuracy_RF)
print("SVM - ", "(a)linear: ", accuracy_SVM_linear, ", (b)rbf: ", accuracy_SVM_rbf, ", (c)poly: ", accuracy_SVM_poly)

Logistic Regression:  0.745283018868
K Nearest Neighbors:  0.660377358491
Neural Nets:  0.660377358491
Random Forest:  0.807547169811
SVM -  (a)linear:  0.764150943396 , (b)rbf:  0.764150943396 , (c)poly:  0.764150943396


In [10]:
#Building formal models
df_predict = pd.read_csv("data/test.csv")
df_predict['Survived'] = 1
df_predict = df_predict.drop(['Ticket', 'Cabin', 'Name','Surname', 'S_Ticket'], axis = 1)
df_predict = df_predict.sample(frac=1).reset_index(drop=True)
df_predict = df_predict.fillna(method = 'pad')


#training on the best models from above: RNN, SVM(poly), SVM(rbf), LogisticRegression

#LogisticRegression
y_predict,x_predict = dmatrices(formula, data=df_predict, return_type='dataframe')
x_predict
logistic = linear_model.LogisticRegression(C = 1, max_iter = 200, tol = 1e-5, class_weight = {1: 1})
logistic.fit(x, y)
y_predict = logistic.predict(x_predict)
np.savetxt("prediction_lr.csv", y_predict.astype(int), delimiter=",")


#RNN
y_predict,x_predict = dmatrices(formula, data=df_predict, return_type='dataframe')

y = np.asarray(y).ravel()
y_predict = np.asarray(y_predict).ravel()
rnn = ske.RandomForestClassifier(n_estimators=100).fit(x, y)
y_predict = rnn.predict(x_predict)
np.savetxt("prediction_rf.csv", y_predict.astype(int), delimiter=",")


#SVM
y_svm_p,x_svm_p = dmatrices(formula, data=df_predict, return_type='matrix')
x_svm_p = np.asarray(x_svm_p)
x_svm_p = x_svm_p[:,[feature_1, feature_2]]  

y_svm_p = np.asarray(y_svm_p)
y_svm_p = y_svm_p.flatten()

rbf = svm.SVC(kernel='rbf', gamma=3).fit(x_svm, y_svm)
poly = svm.SVC(kernel='poly', gamma=2).fit(x_svm, y_svm)

y_rbf = rbf.predict(x_svm_p)
y_poly = poly.predict(x_svm_p)

np.savetxt("prediction_svm_rbf.csv", y_rbf.astype(int), delimiter=",")
np.savetxt("prediction_svm_poly.csv", y_poly.astype(int), delimiter=",")

  y = column_or_1d(y, warn=True)
