In [153]:
%matplotlib inline
import pandas, matplotlib, numpy

from scipy.stats import linregress
from sklearn.svm import SVC


#Thought the output sample were actual answers for contest
def compare_computed_prediction_with_true_answer(
    path_to_answer="", answer_predictions=pandas.DataFrame):
    n_good_answers = 0
    answer_df = pandas.read_csv(path_to_answer)
    len_answers = len(answer_df["Survived"])
    for i in range(len_answers):
        if answer_df["Survived"][i] == answer_predictions["Survived"][i]:
            n_good_answers += 1
    return float(n_good_answers)/(len_answers+1)


unmodified_data_df = pandas.read_csv('data/train.csv').fillna(0)
test_data_df = pandas.read_csv('data/test.csv')

#First clean
unmodified_data_df = unmodified_data_df
test_data_df = test_data_df.fillna(0)
avg_age_value = 0.5*(numpy.mean(unmodified_data_df["Age"]) + 
                     numpy.average(unmodified_data_df["Age"]))

unmodified_data_df["Sex"] = unmodified_data_df["Sex"].map({"female": 0, "male": 1})
unmodified_data_df["Age"].replace(0, avg_age_value)
test_data_df["Sex"] = test_data_df["Sex"].map({"female": 0, "male": 1})
test_data_df["Age"].replace(0, avg_age_value)

#print(unmodified_data_df)
#print(test_data_df)

#label for training
train_label = numpy.array(unmodified_data_df["Survived"]).transpose()

#Let's split these dataframe
train_p_class = unmodified_data_df["Pclass"]
train_name = unmodified_data_df["Name"]
train_sex = unmodified_data_df["Sex"]
train_age = unmodified_data_df["Age"]
train_sibsp = unmodified_data_df["SibSp"]

test_p_class = test_data_df["Pclass"]
test_name = test_data_df["Name"]
test_sex = test_data_df["Sex"]
test_age = test_data_df["Age"]
test_sibsp = test_data_df["SibSp"]

training_inputs = numpy.array([train_p_class, train_sex, train_age, train_sibsp]).transpose()
test_inputs = numpy.array([test_p_class, test_sex, test_age, test_sibsp]).transpose()

#training the model (model is suppoort vector machine used for classification)
first_model = SVC(gamma='auto')
first_model.fit(training_inputs, train_label)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [154]:
#This time lets try with more attributes
train_parch = unmodified_data_df["Parch"]
train_fare = unmodified_data_df["Fare"]
train_cabin = unmodified_data_df["Cabin"]
train_embarked = unmodified_data_df["Embarked"]
train_cabin = train_cabin.notnull().astype("int")
train_embarked = train_embarked.replace(["S", "Q", "C"], [1, 2, 3])

test_parch = test_data_df["Parch"]
test_fare = test_data_df["Fare"]
test_cabin = test_data_df["Cabin"]
test_embarked = test_data_df["Embarked"]
test_cabin = test_cabin.notnull().astype("int")
test_embarked = test_embarked.replace(["S", "Q", "C"], [1, 2, 3])

#check how each attribute is linked to outcome
print("p_class: ", linregress(train_p_class, train_label).rvalue)
print("sex: ", linregress(train_sex, train_label).rvalue)
print("age: ", linregress(train_age, train_label).rvalue)
print("sibsp: ", linregress(train_sibsp, train_label).rvalue)
print("parch: ", linregress(train_parch, train_label).rvalue)
print("fare: ", linregress(train_fare, train_label).rvalue)
#print("cabin: ", linregress(train_cabin, train_label).rvalue)
print("embarked: ", linregress(train_embarked, train_label).rvalue)

p_class:  -0.338481035961
sex:  -0.543351380658
age:  0.0105392158713
sibsp:  -0.0353224988857
parch:  0.0816294070835
fare:  0.257306522385
embarked:  0.163516651425


In [155]:
training_inputs = numpy.array([train_p_class, 
                               train_sex, 
                               #train_age, 
                               #train_sibsp, 
                               #train_parch, 
                               train_fare, 
                               #train_cabin, 
                               #train_embarked
                              ]).transpose()

test_inputs = numpy.array([test_p_class, 
                           test_sex, 
                           #test_age, 
                           #test_sibsp, 
                           #test_parch, 
                           test_fare, 
                           #test_cabin, 
                           #test_embarked
                          ]).transpose()

#training the new model
second_model = SVC(gamma='auto')
second_model.fit(training_inputs, train_label)

passengers_id = test_data_df["PassengerId"]
answers = second_model.predict(test_inputs)

answer_df = pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

print(compare_computed_prediction_with_true_answer(
    "data/true_answer/gender_submission.csv", answer_df))

answer_df.to_csv("data/outputs/svc_output.csv", index=False)

0.8878281622911695


In [156]:
from sklearn.neighbors import KNeighborsClassifier
neighbors_classifier = KNeighborsClassifier(n_neighbors=10)

neighbors_classifier.fit(training_inputs, train_label)
answers = neighbors_classifier.predict(test_inputs)
answer_df = pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

print(compare_computed_prediction_with_true_answer(
    "data/true_answer/gender_submission.csv", answer_df))

answer_df.to_csv("data/outputs/knn_output.csv", index=False)

0.8210023866348448


In [157]:
from sklearn import tree

classification_tree = tree.DecisionTreeClassifier()
classification_tree = classification_tree.fit(training_inputs, train_label)
answers = classification_tree.predict(test_inputs)
answer_df = pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

print(compare_computed_prediction_with_true_answer(
    "data/true_answer/gender_submission.csv", answer_df))

answer_df.to_csv("data/outputs/decision_tree_output.csv", index=False)
#score for classification tree = 0.75119

0.863961813842482


In [158]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

classification_forest = RandomForestClassifier(100)
classification_forest.fit(training_inputs, train_label)
answers = classification_forest.predict(test_inputs)
answer_df =  pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

answer_df.to_csv("data/outputs/random_forest_output_100_trees", index=False)
#100 trees give score of = 0.75590

In [159]:
classification_forest = RandomForestClassifier(50)
classification_forest.fit(training_inputs, train_label)
answers = classification_forest.predict(test_inputs)
answer_df =  pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

answer_df.to_csv("data/outputs/random_forest_output_50_trees", index=False)
#50 trees give score of 0.7655

In [160]:
classification_forest = RandomForestClassifier(75)
classification_forest.fit(training_inputs, train_label)
answers = classification_forest.predict(test_inputs)
answer_df =  pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

answer_df.to_csv("data/outputs/random_forest_output_75_trees", index=False)
#75 trees give score of 0.77511

In [161]:
classification_forest = RandomForestClassifier(75, max_depth=5)
classification_forest.fit(training_inputs, train_label)
answers = classification_forest.predict(test_inputs)
answer_df =  pandas.DataFrame(
    {"PassengerId": passengers_id, 
     "Survived": answers})

answer_df.to_csv("data/outputs/random_forest_output_75_trees_max_depth_5", index=False)
#reducing number of max layers to 5 brings score to 0.77033