In [36]:
import tensorflow as tf

In [37]:
# pandas
import pandas as pd

In [38]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

In [39]:
train_df = pd.read_csv("/data-sets/titanic/train.csv")
test_df    = pd.read_csv("/data-sets/titanic/test.csv")
test_validation = pd.read_csv("/data-sets/titanic/gender_submission.csv")

In [40]:
test_df = test_df.join(test_validation.set_index("PassengerId"),on="PassengerId", how="left")

In [41]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
Survived       418 non-null int64
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [42]:
all_df = pd.concat([train_df,test_df], axis=0)

In [43]:
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       1309 non-null int64
Ticket         1309 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 132.9+ KB


In [44]:
#convert title and lastname to integer
def split_name(fullName):
    firstName, lastName = fullName.split(",")
    titleIndex = lastName.find(".")
    title, lastName = lastName.split(".", 1)
    return [title,lastName]

def get_title(name):
    title = split_name(name['Name'])[0]
    return hash(title)%256
    
def get_lastName(name):
    lastName = split_name(name['Name'])[1]
    return hash(lastName)%1024

#convert sex to 1,0
def conv_sex(row):
    if row['Sex']=='male':
        return 1
    else:
        return 0

In [45]:
type(all_df['Name'])

pandas.core.series.Series

In [46]:
all_df['Name'] = all_df['Name'].astype(str)
all_df['Title'] = all_df[['Name']].apply(get_title,axis=1)
all_df['LastName'] = all_df[['Name']].apply(get_lastName,axis=1)
all_df = all_df.drop(['Name'], axis=1)

In [47]:
all_df['Sex'] = all_df['Sex'].astype(str)
all_df['Sex'] = all_df.apply(conv_sex, axis=1)

In [48]:
# get average, std, and number of NaN values in titanic_df
average_age_titanic   = all_df["Age"].mean()
std_age_titanic       = all_df["Age"].std()
count_nan_age_titanic = all_df["Age"].isnull().sum()

In [49]:
rand = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)


In [50]:
all_df["Age"][np.isnan(all_df["Age"])] = rand

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [51]:
#extract ticket class info from ticket field
def get_ticketClass(row):
    ticketInfo = row["Ticket"].split(" ",1)
    if len(ticketInfo) > 1: 
        return hash(ticketInfo[0])%256+1
    else:
        return 0

In [52]:
all_df['TicketClass'] = all_df[['Ticket']].apply(get_ticketClass, axis=1)

In [53]:
all_df = all_df.drop(['Ticket'],axis=1)

In [54]:
#Extract Cabin number
def get_cabinNum(row):
    if pd.isnull(row['Cabin']):
        return 0
    else:
        cabinInfo = row["Cabin"].split(" ")
        return len(cabinInfo)

In [55]:
all_df['CabinNum'] = all_df[['Cabin']].apply(get_cabinNum, axis=1)
all_df = all_df.drop(['Cabin'],axis=1)

In [56]:
#Extract Cabin number
def get_embarked(row):
    return hash(row["Embarked"])%256

In [57]:
all_df['Embarked'] = all_df[['Embarked']].apply(get_embarked, axis=1)

In [58]:
#Fare
average_fare_titanic   = test_df["Fare"].mean()
std_fare_titanic       = test_df["Fare"].std()
count_nan_fare_titanic = test_df["Fare"].isnull().sum()

In [59]:
rand_fare = np.random.randint(average_fare_titanic - std_fare_titanic, average_fare_titanic + std_fare_titanic, size = count_nan_fare_titanic)
all_df["Fare"][np.isnan(all_df["Fare"])] = rand_fare


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [60]:
all_features = all_df.drop("Survived",axis=1)

In [61]:
all_outputs = all_df['Survived']

In [62]:
# Normalize the inputs so they have ~0 mean, and 1 Standard Deviation
# make trainning easiler
all_features = (all_features - all_features.mean(axis=0)) / all_features.std(axis=0)

In [63]:
all_df.drop("PassengerId",axis=1,inplace=True)

In [64]:
from sklearn.model_selection import train_test_split

In [65]:
X = all_features
y = all_outputs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [66]:
# start with logistic ml
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
predictions = logmodel.predict(X_test)

In [67]:
# print report
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.88      0.88      0.88       269
          1       0.80      0.81      0.80       163

avg / total       0.85      0.85      0.85       432



In [68]:
# start with SVM
from sklearn.svm import SVC
model = SVC()
model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [69]:
# add Grid Search to find best parama
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001], 'kernel': ['rbf']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ....... kernel=rbf, C=0.1, gamma=1, score=0.621160, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ....... kernel=rbf, C=0.1, gamma=1, score=0.623288, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=1 ......................................
[CV] ....... kernel=rbf, C=0.1, gamma=1, score=0.623288, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.1 ....................................
[CV] ..... kernel=rbf, C=0.1, gamma=0.1, score=0.836177, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.1 ....................................
[CV] ..... kernel=rbf, C=0.1, gamma=0.1, score=0.863014, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.1 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ..... kernel=rbf, C=0.1, gamma=0.1, score=0.845890, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] .... kernel=rbf, C=0.1, gamma=0.01, score=0.836177, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] .... kernel=rbf, C=0.1, gamma=0.01, score=0.866438, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.01 ...................................
[CV] .... kernel=rbf, C=0.1, gamma=0.01, score=0.842466, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] ... kernel=rbf, C=0.1, gamma=0.001, score=0.621160, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] ... kernel=rbf, C=0.1, gamma=0.001, score=0.623288, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.001 ..................................
[CV] ... kernel=rbf, C=0.1, gamma=0.001, score=0.623288, total=   0.0s
[CV] kernel=rbf, C=0.1, gamma=0.0001 .................................
[CV] .

[CV] ... kernel=rbf, C=1000, gamma=0.01, score=0.825939, total=   0.1s
[CV] kernel=rbf, C=1000, gamma=0.01 ..................................
[CV] ... kernel=rbf, C=1000, gamma=0.01, score=0.825342, total=   0.1s
[CV] kernel=rbf, C=1000, gamma=0.01 ..................................
[CV] ... kernel=rbf, C=1000, gamma=0.01, score=0.818493, total=   0.1s
[CV] kernel=rbf, C=1000, gamma=0.001 .................................
[CV] .. kernel=rbf, C=1000, gamma=0.001, score=0.836177, total=   0.0s
[CV] kernel=rbf, C=1000, gamma=0.001 .................................
[CV] .. kernel=rbf, C=1000, gamma=0.001, score=0.869863, total=   0.0s
[CV] kernel=rbf, C=1000, gamma=0.001 .................................
[CV] .. kernel=rbf, C=1000, gamma=0.001, score=0.845890, total=   0.0s
[CV] kernel=rbf, C=1000, gamma=0.0001 ................................
[CV] . kernel=rbf, C=1000, gamma=0.0001, score=0.836177, total=   0.1s
[CV] kernel=rbf, C=1000, gamma=0.0001 ................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    2.6s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kernel': ['rbf'], 'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [70]:
# print report
from sklearn.metrics import classification_report,confusion_matrix
grid_predictions = grid.predict(X_test)
print(confusion_matrix(y_test,grid_predictions))
print(classification_report(y_test,predictions))

[[238  31]
 [ 31 132]]
             precision    recall  f1-score   support

          0       0.88      0.88      0.88       269
          1       0.80      0.81      0.80       163

avg / total       0.85      0.85      0.85       432



In [71]:
# start with tensorflow version 1

input_vector_size = X_train.shape[1]
output_size = 1
number_of_hidden_neurons = 5

y_train = np.expand_dims(y_train, 1)
y_test = np.expand_dims(y_test, 1)

# === We build the graph here!
titanic_graph_1 = tf.Graph()

with titanic_graph_1.as_default():
    
    # We create "None" size placeholders to let us put variable sized "Batches" of data at a time
    x = tf.placeholder("float", shape = [None, input_vector_size])
    y = tf.placeholder("float", shape =[None, output_size])

    # We're going to use an initializer to generate random values for our weights
    initializer = tf.contrib.layers.xavier_initializer()

    # Hidden layer weights, connecting input to hidden neurons
    hidden_weights = tf.Variable(initializer(shape=[input_vector_size, number_of_hidden_neurons]))
    
    # Output layer weights, connecting hidden neurons to output
    output_weights = tf.Variable(initializer(shape=[number_of_hidden_neurons, output_size]))
    # Biases for the hidden neurons
    bias = tf.Variable(tf.zeros([output_size]))
    
    # Biases for the output 
    bias1 = tf.Variable(tf.zeros([number_of_hidden_neurons]))
    
    # Hidden layer logits and activation
    hidden = tf.nn.tanh(tf.matmul(x, hidden_weights) + bias1)
    
    # Output layer 
    output_layer = (tf.matmul(hidden, output_weights) + bias)
    
    # Squared Error function
    error = tf.reduce_mean(tf.pow((y-output_layer), 2))
    
    # We will use Adam Optimizer for network optimization
    optimizer = tf.train.AdamOptimizer().minimize(error)
    
    # Our initialization operation
    init = tf.global_variables_initializer()

In [72]:
# We create our sessions
sess_1 = tf.Session(graph=titanic_graph_1)

# Make sure to run the initialization
sess_1.run(init)

Total_ephoch = 20000

train_error = []
valid_error = []

# Train loop for the model
for i in range(Total_ephoch):
    
    #Session runs train_op to minimize loss
    sess_1.run(optimizer, feed_dict={x: X_train, y: y_train})
    
    train_error.append(sess_1.run(error, feed_dict={x: X_train, y: y_train}))
    valid_error.append(sess_1.run(error, feed_dict={x: X_test, y: y_test}))
    
    if i%1000 == 0:
        print ("validation error:", valid_error[i])


validation error: 2.30426
validation error: 0.120678
validation error: 0.110402
validation error: 0.106732
validation error: 0.105907
validation error: 0.107708
validation error: 0.108566
validation error: 0.11017
validation error: 0.112583
validation error: 0.111674
validation error: 0.111655
validation error: 0.111214
validation error: 0.112143
validation error: 0.114493
validation error: 0.116316
validation error: 0.117358
validation error: 0.118238
validation error: 0.118545
validation error: 0.118698
validation error: 0.11895
