In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import random
import numpy as np
import pandas as pd
from sklearn import datasets, svm, cross_validation, tree, preprocessing, metrics
import sklearn.ensemble as ske
import tensorflow as tf
from tensorflow.contrib import skflow

In [None]:
# Read the 'titanic3' sheet of the workbook into a DataFrame.
# Cells holding the literal text 'NA' are treated as missing values.
titanic_df = pd.read_excel(
    'titanic3.xls',
    'titanic3',
    index_col=None,
    na_values=['NA'],
)

In [None]:
# Preview the first five rows of the loaded frame.
titanic_df.head(5)

In [None]:
# Mean of the 'survived' column (the overall survival rate, assuming the
# column is a 0/1 flag -- TODO confirm against the data).
titanic_df.survived.mean()

In [None]:
# Per-column means, one row per passenger class.
titanic_df.groupby(by='pclass').mean()

In [None]:
# Per-column means for every (passenger class, sex) combination.
class_sex_grouping = titanic_df.groupby(by=['pclass', 'sex']).mean()
class_sex_grouping

In [None]:
# Bar chart of the mean 'survived' value for each (class, sex) group.
class_sex_grouping.survived.plot(kind='bar')

In [None]:
# Bucket ages into 10-wide intervals with edges at 0, 10, ..., 80.
group_by_age = pd.cut(titanic_df.age, bins=np.arange(0, 90, 10))
# Per-column means within each age bucket, then plot the 'survived' mean.
age_grouping = titanic_df.groupby(by=group_by_age).mean()
age_grouping.survived.plot(kind='bar')

In [None]:
# Number of non-missing entries in each column, checked before any cleaning.
titanic_df.count(axis=0)

In [None]:
# Remove the 'boat', 'body' and 'cabin' columns from the working frame.
# NOTE: re-running this cell on the already-reduced frame raises KeyError,
# because the columns are gone after the first run.
titanic_df = titanic_df.drop(['boat', 'body', 'cabin'], axis=1)

In [None]:
# Replace missing 'home.dest' entries with the placeholder string 'NA' so the
# row-wise dropna that follows does not discard rows only for lacking it.
titanic_df = titanic_df.fillna({'home.dest': 'NA'})

In [None]:
# Keep only rows that have no missing value in any remaining column.
titanic_df = titanic_df.dropna(axis=0, how='any')

In [None]:
# Non-missing entries per column after cleaning; every column should now
# report the same number.
titanic_df.count(axis=0)

In [None]:
def preprocess_titanic_df(df, sex_categories=('female', 'male'),
                          embarked_categories=('C', 'Q', 'S')):
    """Return a model-ready copy of a Titanic passenger frame.

    The 'sex' and 'embarked' columns are replaced by integer codes and the
    free-text 'name', 'ticket' and 'home.dest' columns are dropped. The input
    frame is not modified.

    Codes are the position of each value in the corresponding category tuple,
    so they are the same on every call. (Fitting a fresh label encoder per
    call, as this function used to, gives different codes to a subset that is
    missing a category, e.g. a held-out sample with no 'Q' passengers.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'sex', 'embarked', 'name', 'ticket' and 'home.dest'.
    sex_categories, embarked_categories : sequence of str
        Ordered vocabularies for the two encoded columns. The defaults are
        the values expected in the titanic3 data, in sorted order, which is
        the order a label encoder fit on the full data would produce --
        TODO confirm against the workbook.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with encoded 'sex'/'embarked' and the text columns
        removed.

    Raises
    ------
    ValueError
        If either encoded column holds a value (including a missing value)
        that is not in its vocabulary.
    """
    processed_df = df.copy()
    vocabularies = (('sex', sex_categories), ('embarked', embarked_categories))
    for column, categories in vocabularies:
        codes = pd.Categorical(processed_df[column], categories=list(categories)).codes
        # pandas marks anything outside the vocabulary (or NaN) with code -1.
        unseen = codes < 0
        if unseen.any():
            bad_values = sorted(set(processed_df[column][unseen].astype(str)))
            raise ValueError(
                "Unexpected value(s) in column %r: %s" % (column, ', '.join(bad_values))
            )
        # Widen from the compact categorical dtype to plain 64-bit integers.
        processed_df[column] = codes.astype('int64')
    return processed_df.drop(['name', 'ticket', 'home.dest'], axis=1)

In [None]:
# Encode the cleaned passenger frame into the numeric form used for modelling.
processed_df = preprocess_titanic_df(df=titanic_df)

In [None]:
# Feature matrix: every processed column except the target.
X = processed_df.drop('survived', axis=1).values
# Target vector: the 'survived' column.
y = processed_df.survived.values

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed (same value as the
# shuffle-split validator used later) so a fresh Restart & Run All yields the
# same partition and therefore comparable scores.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=0)

In [None]:
# Decision tree limited to depth 10. random_state is pinned so that repeated
# runs of the notebook grow the same tree.
clf_dt = tree.DecisionTreeClassifier(max_depth=10, random_state=0)

In [None]:
# Train the tree on the training split and report its score on the test split
# (fit() returns the estimator, so the two calls can be chained).
clf_dt.fit(X_train, y_train).score(X_test, y_test)

In [None]:
# 20 random 80/20 shuffle splits over all rows of X, seeded for repeatability.
shuffle_validator = cross_validation.ShuffleSplit(
    len(X),
    n_iter=20,
    test_size=0.2,
    random_state=0,
)


def test_classifier(clf):
    """Cross-validate ``clf`` on the notebook-level X/y and print a summary.

    Scores come from ``cross_val_score`` using ``shuffle_validator``; the
    printed line shows their mean and standard deviation. Nothing is returned.
    """
    fold_scores = cross_validation.cross_val_score(clf, X, y, cv=shuffle_validator)
    mean_accuracy, spread = fold_scores.mean(), fold_scores.std()
    print("Accuracy: %0.4f (+/- %0.2f)" % (mean_accuracy, spread))

In [None]:
# Cross-validated accuracy of the decision tree.
test_classifier(clf=clf_dt)

In [None]:
# Random forest of 50 trees. The forest is randomised, so random_state is
# pinned to make the reported accuracy repeatable between runs.
clf_rf = ske.RandomForestClassifier(n_estimators=50, random_state=0)
test_classifier(clf_rf)

In [None]:
# Gradient boosting with 50 stages. random_state is pinned so repeated runs
# of the notebook report the same accuracy.
clf_gb = ske.GradientBoostingClassifier(n_estimators=50, random_state=0)
test_classifier(clf_gb)

In [None]:
# Voting ensemble over the three classifiers defined above.
eclf = ske.VotingClassifier(
    estimators=[
        ('dt', clf_dt),
        ('rf', clf_rf),
        ('gb', clf_gb),
    ],
)
test_classifier(eclf)

In [None]:
# Two-class neural network with hidden layers of 20, 40 and 20 units.
tf_clf_dnn = skflow.TensorFlowDNNClassifier(
    hidden_units=[20, 40, 20],
    n_classes=2,
    batch_size=256,
    steps=1000,
    learning_rate=0.05,
)
tf_clf_dnn.fit(X_train, y_train)
# Accuracy of the network's predictions on the test split.
metrics.accuracy_score(y_true=y_test, y_pred=tf_clf_dnn.predict(X_test))

In [None]:
# Take the first 20 rows of each passenger class (1, 2 and 3) as a sample.
passengers_set_1, passengers_set_2, passengers_set_3 = (
    titanic_df[titanic_df.pclass == pclass].iloc[:20, :].copy()
    for pclass in (1, 2, 3)
)
# Stack the three samples; the original row labels are kept.
passenger_set = pd.concat([passengers_set_1, passengers_set_2, passengers_set_3])
# Encode the sample with the same preprocessing used for the model features.
testing_set = preprocess_titanic_df(passenger_set)

In [None]:
# Build the training frame BEFORE extracting arrays from it. This used to be
# done in a later cell, so a fresh top-to-bottom run failed with NameError on
# `training_set`.
#
# Training data = every cleaned passenger that is not in the held-out sample.
# passenger_set was sliced from titanic_df and keeps its row labels, so the
# sample is excluded by index. This avoids the concat + drop_duplicates trick,
# which would also discard genuinely duplicated rows.
# Assumes titanic_df's index labels are unique (default index from read_excel
# with index_col=None) -- TODO confirm if the loading step changes.
training_set = preprocess_titanic_df(titanic_df.drop(passenger_set.index))

# Rebind the train/test arrays to the new hold-out partition.
X_train = training_set.drop(['survived'], axis=1).values
y_train = training_set['survived'].values
X_test = testing_set.drop(['survived'], axis=1).values
y_test = testing_set['survived'].values

In [None]:
# Fit the network on the X_train/y_train arrays currently in scope.
# NOTE(review): whether a second fit() call restarts or continues training
# depends on the estimator's settings -- confirm.
tf_clf_dnn.fit(X_train, y_train)
# Accuracy on the current X_test/y_test arrays.
metrics.accuracy_score(y_true=y_test, y_pred=tf_clf_dnn.predict(X_test))

In [None]:
# Predicted labels for the rows of X_test.
prediction = tf_clf_dnn.predict(X_test)
# Rows of the sample whose recorded 'survived' value differs from the
# prediction. Assumes predict() returns one label per row, in the same order
# as passenger_set -- TODO confirm.
passenger_set.loc[passenger_set['survived'] != prediction]