In [351]:
#loading all the packages that will be needed
import numpy as np
import pandas as pd
import csv as csv
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

In [352]:
#load full dataset
full_data = pd.read_csv('/home/tmorrill/Documents/Titanic/titanic_full.csv', header=0)

In [353]:
#explore the data with a bar chart for gender
male = sum(sum([full_data.sex == "male"]))
female = sum(sum([full_data.sex == "female"]))
x = range(2)
y = [female, male]
labels = ["Females","Males"]
plt.bar(x,y, width=0.8, align='center')
plt.xticks(x, labels)
plt.xlabel('Gender')
plt.ylabel('Count of Gender')
plt.title('Count of Gender')
plt.show()

In [354]:
#explore the distribution of ages
ages = full_data.age
bins = [0,10,20,30,40,50,60,70,80,90]

plt.hist(ages, bins, histtype='bar', rwidth=0.8, color='c')
plt.xlabel('Ages')
plt.ylabel('Count')
plt.show()

In [355]:
#explore the distribution of fares
fares = full_data.fare
bins_fares = [0,5,10,15,20,25,30,40,60,80,120,160,200,240,280]

plt.hist(fares, bins_fares, histtype='bar', rwidth=0.8, color='c')
plt.xlabel('Fares')
plt.ylabel('Count')
plt.show()

In [356]:
#explore the data with a bar chart for pclass
first_class = sum(sum([full_data.pclass == 1]))
second_class = sum(sum([full_data.pclass == 2]))
third_class = sum(sum([full_data.pclass == 3]))

x = range(3)
y = [first_class, second_class, third_class]
labels = ["First","Second", "Third"]
plt.bar(x,y, width=0.8, align='center')
plt.xticks(x, labels)
plt.xlabel('Class')
plt.ylabel('Count of Class')
plt.title('Class Breakdown')
plt.show()

In [357]:
#convert the objects to integer classifiers
#create a new variable, Gender, converting sex to 0 and 1
full_data['Gender'] = full_data['sex'].map( {'female':0, 'male':1} ).astype(int)

In [358]:
#fill in the missing values of embarked with the most common port of embarkment
if len(full_data.embarked[ full_data.embarked.isnull() ]) > 0:
    full_data.embarked[ full_data.embarked.isnull() ] = full_data.embarked.dropna().mode().values

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [359]:
#find the unique embarked values
port = list(enumerate(np.unique(full_data['embarked'])))

#change the port of embarkment to an integer classifier
full_data['embarkment'] = full_data['embarked'].map( {'C':0, 'Q':1, 'S':2} ).astype(int)

In [360]:
#fill in missing age values with the median age
median_age = full_data['age'].dropna().median()
if len(full_data.age[ full_data.age.isnull() ] > 0):
    full_data.loc[ (full_data.age.isnull()), 'age'] = median_age

In [361]:
#replace the null fare values with the median of that class
if len(full_data.fare[ full_data.fare.isnull() ]) > 0:
#creates a array of 3 zeros
    median_fare = np.zeros(3)
#defines the median for each class    
    for i in range(0,3):
        median_fare[i] = full_data[ full_data.pclass == i+1 ]['fare'].dropna().median()
#for null values, assigns the median value for that particular class
    for i in range(0,3):
        full_data.loc[ (full_data.fare.isnull()) & (full_data.pclass == i+1), 'fare'] = median_fare[i]

In [362]:
#drop the data that we won't be using
full_data = full_data.drop(['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest', 'body'], axis =1)

In [363]:
#split the training and test data at 70% and 30%, respectively
from sklearn.cross_validation import train_test_split

data_train, data_test = train_test_split(full_data, test_size=0.30)

In [364]:
#create a list of the true values so that the model can be scored
true_values = data_test['survived']

In [365]:
#convert to a numpy array
true_values1 = true_values.values

In [366]:
#drop the survived column from the test data
data_test = data_test.drop(['survived'], axis=1)

In [367]:
#create a list of model features to be evaluated for their importance
feature_names = list(data_test.columns.values)

In [368]:
#convert the data back to numpy arrays
train_data = data_train.values
test_data = data_test.values

In [369]:
#create the forest variable with 100 trees
forest = RandomForestClassifier(n_estimators=100)
#fit the model with the training data
forest = forest.fit(train_data[0::,1::], train_data[0::,0] )

In [370]:
#predict survived/not survived using the test data
output = forest.predict(test_data).astype(int)

In [371]:
#evaluate the results of the model
model_results = forest.score(test_data, true_values1, sample_weight=None)
model_results

0.80152671755725191

In [372]:
#identify the most predictive/important variables
importances = list(forest.feature_importances_)
np.asarray(importances)
#combine the list of variable names with their importance scores
mergedlist = zip(feature_names,importances)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(7):
    print("%d. %s (%f)" % (f + 1, feature_names[f], importances[indices[f]]))

Feature ranking:
1. age (0.270027)
2. sibsp (0.269456)
3. parch (0.256993)
4. pclass (0.078352)
5. fare (0.052165)
6. Gender (0.041506)
7. embarkment (0.031499)


In [373]:
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature Importances")
plt.bar(range(7), importances, color="b", align="center")
plt.xticks(range(7), feature_names)
plt.xlim([-1, 7])
plt.xlabel('Variables')
plt.ylabel('Feature Importance')
plt.show()