# from the Kaggle learning competition

https://www.kaggle.com/c/titanic

Note: You will need the train and test data sets from this competition in the same folder as this Jupyter notebook to proceed.

In [None]:
"""
Original Author : AstroDave
Date : 23rd September 2012
Revised: 15 April 2014
please see packages.python.org/milk/randomforests.html for more
""" 

import pandas as pd
import numpy as np
import csv as csv
import matplotlib.pyplot as plt

#show plots in the notebook
%matplotlib inline

# TRAINING DATA
# Load the train file into a dataframe; header=0 takes the column names
# from the first row of train.csv.
train_df = pd.read_csv('train.csv', header=0)

# Show the first few rows of the frame as this cell's output.
train_df.head()

In [None]:
# Encode the passenger's sex as an integer classifier column.
# female = 0, male = 1
gender_codes = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].map(gender_codes).astype(int)

# Preview the newly added column.
train_df['Gender'].head()

In [None]:
# Convert the remaining categorical column to numeric codes and fill in the
# missing values so the training frame is complete.

# Embarked takes the values 'C', 'Q', 'S'.
# Note this is not ideal: in translating categories to numbers, Port "2" is
# not 2 times greater than Port "1", etc.

# All missing Embarked -> just make them embark from the most common place.
# The write goes through train_df.loc[...] rather than the chained form
# train_df.Embarked[mask] = ..., which may assign into a temporary copy and
# leave train_df unchanged. mode() can return several values on a tie, so
# .iloc[0] picks a single scalar to fill with.
embarked_missing = train_df['Embarked'].isnull()
if embarked_missing.any():
    most_common_port = train_df['Embarked'].dropna().mode().iloc[0]
    train_df.loc[embarked_missing, 'Embarked'] = most_common_port

# Determine all values of Embarked and number them in sorted order.
Ports = list(enumerate(np.unique(train_df['Embarked'])))
# Dictionary in the form  port name : index  (reused later for the test data).
Ports_dict = { name : i for i, name in Ports }
# Convert all Embarked strings to int.
train_df['Embarked'] = train_df['Embarked'].map( lambda x: Ports_dict[x]).astype(int)

# All the ages with no data -> make them the median of all known Ages.
median_age = train_df['Age'].dropna().median()
train_df['Age'] = train_df['Age'].fillna(median_age)

# Remove the columns that are not used as numeric features: Name, Ticket,
# Cabin, PassengerId, and Sex (already copied into Gender).
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

In [None]:
# Show the first few rows of the cleaned training frame.
train_df.head()

In [None]:
# More about the data set: its dimensions and column names (printed),
# followed by per-column summary statistics as the cell output.
print(train_df.shape, train_df.columns, sep='\n')
train_df.describe()

In [None]:
# Looking at some correlations: how strongly each column correlates with
# the survival flag, largest (most positive) value first.
corr_feature = 'Survived'
# DataFrame.sort() has been removed from pandas; sort_values() replaces it.
# Selecting the column first means only that one Series is sorted.
# Note that corr_df is a Series (one value per column) despite its name.
corr_df = train_df.corr()[corr_feature].sort_values(ascending=False)
print(corr_df.head())
print()
print(corr_df.tail())

In [None]:
# Histogram of the survival flag, with the x-axis padded around 0 and 1.
survival_flags = train_df.Survived
survival_flags.hist()
plt.xlim(-0.5, 1.5)
plt.title('Survival outcomes')
plt.xlabel('Survived (1) or died (0)')
plt.ylabel('number of passengers')

In [None]:
# Differences in survival by class: passenger counts tabulated by outcome
# (rows) and ticket class (columns), drawn as a grouped bar chart.
survival_by_class = pd.crosstab(train_df['Survived'], train_df['Pclass'])
survival_by_class.plot(kind='bar')
plt.title('Survived split by class')

In [None]:
# Differences in survival by gender: passenger counts tabulated by outcome
# (rows) and the 0/1 Gender code (columns), drawn as a grouped bar chart.
survival_by_gender = pd.crosstab(train_df['Survived'], train_df['Gender'])
survival_by_gender.plot(kind='bar')
plt.title('Survived split by gender')

In [None]:
# A scatterplot showing the relationship between age (x) and fare (y),
# as one example of comparing two features.
passenger_ages = train_df['Age']
passenger_fares = train_df['Fare']
plt.scatter(passenger_ages, passenger_fares)

# The matplotlib library offers many, many other plotting options.

In [None]:
# Basic model predicting survival from a chosen set of feature columns.
# The score is measured on the very rows the model was fitted to, so it is
# optimistic (overfitting).
from sklearn import svm
from sklearn.linear_model import LogisticRegression

# Feature matrix -- swap in one of the alternatives to experiment.
feature_columns = ['Pclass']
X = train_df[feature_columns]
#X = train_df[['Gender']] 
#X = train_df[['Gender','Pclass','Parch','SibSp','Age','Embarked','Fare']] 

# Target vector.
y = train_df['Survived']

# Classifier -- LogisticRegression is imported as a drop-in alternative.
model = svm.SVC()
#model = LogisticRegression()

model.fit(X, y)
training_accuracy = model.score(X, y)
print("Model accuracy (with overfitting) =", training_accuracy)

In [None]:
# Cross validation: mean accuracy over 10 folds of the X / y defined in the
# model cell.
# The sklearn.cross_validation module has been removed from scikit-learn;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# NOTE(review): this scores a fresh LogisticRegression, not the `model`
# object fitted in the model cell -- confirm that is intended.
scores = cross_val_score(LogisticRegression(), X, y,
                         scoring='accuracy', cv=10)
#print(scores)
print("model cross-validation accuracy=",scores.mean())

# (Optional) The following cells are for submitting your model to the Kaggle competition website. After preparing your model above, apply it to the test data as shown below.

In [None]:
# TEST DATA cleanup
# Load the test file into a dataframe.
test_df = pd.read_csv('test.csv', header=0)

# The test data needs the same transformations as the training data so that
# the columns are the same.
# Convert all strings to integer classifiers:
# female = 0, male = 1
test_df['Gender'] = test_df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)

# Embarked takes the values 'C', 'Q', 'S'.
# All missing Embarked -> just make them embark from the most common place.
# The write goes through test_df.loc[...] rather than the chained form
# test_df.Embarked[mask] = ..., which may assign into a temporary copy and
# leave test_df unchanged. mode() can return several values on a tie, so
# .iloc[0] picks a single scalar to fill with.
embarked_missing = test_df['Embarked'].isnull()
if embarked_missing.any():
    most_common_port = test_df['Embarked'].dropna().mode().iloc[0]
    test_df.loc[embarked_missing, 'Embarked'] = most_common_port
# Again convert all Embarked strings to int, reusing the Ports_dict built
# from the training data so both frames share one encoding. A port that is
# not in Ports_dict raises KeyError here.
test_df['Embarked'] = test_df['Embarked'].map( lambda x: Ports_dict[x]).astype(int)

# All the ages with no data -> make them the median of all known Ages.
# Note: this is the median of the test set, and it rebinds median_age.
median_age = test_df['Age'].dropna().median()
test_df['Age'] = test_df['Age'].fillna(median_age)

# All the missing Fares -> assume the median fare of their respective class.
# transform('median') yields, for every row, the median Fare of that row's
# Pclass (missing fares are ignored when computing the median).
class_median_fare = test_df.groupby('Pclass')['Fare'].transform('median')
test_df['Fare'] = test_df['Fare'].fillna(class_median_fare)

# Collect the test data's PassengerIds before dropping the column.
ids = test_df['PassengerId'].values
# Remove the columns that are not used as numeric features: Name, Ticket,
# Cabin, PassengerId, and Sex (already copied into Gender).
test_df = test_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)

In [None]:
# Show the first few rows of the cleaned test frame.
test_df.head()

In [None]:
# The data is now ready to go: fit a random forest to the training data,
# then predict on the test data.
from sklearn.ensemble import RandomForestClassifier

# Convert both frames back to plain numpy arrays.
train_data = train_df.values
test_data = test_df.values

print('Training...')
# Column 0 of train_data is used as the target and every other column as a
# feature (assumes Survived is the first column of train_df -- TODO confirm).
forest = RandomForestClassifier(n_estimators=100)
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

print('Predicting...')
predicted_labels = forest.predict(test_data)
output = predicted_labels.astype(int)
print('Done.')

In [None]:
# (optional) preparing the CSV file to submit to Kaggle

# Write a header row followed by one (PassengerId, Survived) row per test
# passenger. The with-block closes the file even if writing raises, and
# newline="" is what the csv module requires of files it writes to, so that
# no extra blank lines appear between rows on Windows.
with open("myfirstforest.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))