In [1]:
#loading all the packages that will be needed
import numpy as np
import pandas as pd
import csv as csv
from sklearn.ensemble import RandomForestClassifier

In [2]:
#load full dataset
full_data = pd.read_csv('/home/tmorrill/Documents/Titanic/titanic_full.csv', header=0)

In [3]:
#explore the column labels and data types
#convert the objects to integer classifiers
full_data.dtypes

survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
pclass         int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [4]:
#create a new variable, Gender, converting sex to 0 and 1
full_data['Gender'] = full_data['sex'].map( {'female':0, 'male':1} ).astype(int)

In [5]:
full_data['Gender']

0       0
1       1
2       0
3       1
4       0
5       1
6       0
7       1
8       0
9       1
10      1
11      0
12      0
13      0
14      1
15      1
16      1
17      0
18      0
19      1
20      1
21      0
22      1
23      0
24      0
25      1
26      1
27      0
28      0
29      1
       ..
1279    0
1280    1
1281    1
1282    1
1283    1
1284    1
1285    1
1286    0
1287    1
1288    1
1289    1
1290    0
1291    1
1292    1
1293    1
1294    1
1295    1
1296    1
1297    1
1298    1
1299    1
1300    0
1301    1
1302    1
1303    1
1304    0
1305    0
1306    1
1307    1
1308    1
Name: Gender, dtype: int64

In [6]:
full_data['age']

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
5       48.0000
6       63.0000
7       39.0000
8       53.0000
9       71.0000
10      47.0000
11      18.0000
12      24.0000
13      26.0000
14      80.0000
15          NaN
16      24.0000
17      50.0000
18      32.0000
19      36.0000
20      37.0000
21      47.0000
22      26.0000
23      42.0000
24      29.0000
25      25.0000
26      25.0000
27      19.0000
28      35.0000
29      28.0000
         ...   
1279    14.0000
1280    22.0000
1281    22.0000
1282        NaN
1283        NaN
1284        NaN
1285    32.5000
1286    38.0000
1287    51.0000
1288    18.0000
1289    21.0000
1290    47.0000
1291        NaN
1292        NaN
1293        NaN
1294    28.5000
1295    21.0000
1296    27.0000
1297        NaN
1298    36.0000
1299    27.0000
1300    15.0000
1301    45.5000
1302        NaN
1303        NaN
1304    14.5000
1305        NaN
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, dtype: float6

In [7]:
#fill in missing age values with the median age
median_age = full_data['age'].dropna().median()
if len(full_data.age[ full_data.age.isnull() ] > 0):
    full_data.loc[ (full_data.age.isnull()), 'age'] = median_age

In [8]:
full_data.age

0       29.0000
1        0.9167
2        2.0000
3       30.0000
4       25.0000
5       48.0000
6       63.0000
7       39.0000
8       53.0000
9       71.0000
10      47.0000
11      18.0000
12      24.0000
13      26.0000
14      80.0000
15      28.0000
16      24.0000
17      50.0000
18      32.0000
19      36.0000
20      37.0000
21      47.0000
22      26.0000
23      42.0000
24      29.0000
25      25.0000
26      25.0000
27      19.0000
28      35.0000
29      28.0000
         ...   
1279    14.0000
1280    22.0000
1281    22.0000
1282    28.0000
1283    28.0000
1284    28.0000
1285    32.5000
1286    38.0000
1287    51.0000
1288    18.0000
1289    21.0000
1290    47.0000
1291    28.0000
1292    28.0000
1293    28.0000
1294    28.5000
1295    21.0000
1296    27.0000
1297    28.0000
1298    36.0000
1299    27.0000
1300    15.0000
1301    45.5000
1302    28.0000
1303    28.0000
1304    14.5000
1305    28.0000
1306    26.5000
1307    27.0000
1308    29.0000
Name: age, dtype: float6

In [9]:
#fill in the missing values of embarked with the most common port of embarkment
if len(full_data.embarked[ full_data.embarked.isnull() ]) > 0:
    full_data.embarked[ full_data.embarked.isnull() ] = full_data.embarked.dropna().mode().values
 
#mode_embarkment = full_data['embarked'].dropna().mode()

#if len(full_data.embarked[ full_data.embarked.isnull() ]) > 0:
#    full_data.embarked.loc[ (full_data.embarked.isnull()), 'embarked'] = mode_embarkment


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [10]:
full_data.embarked.isnull().any().any()

False

In [11]:
#find the unique values
port = list(enumerate(np.unique(full_data['embarked'])))

port

#change the port of embarkment to an integer classifier
full_data['embarkment'] = full_data['embarked'].map( {'C':0, 'Q':1, 'S':2} ).astype(int)

#alternatively, we could have used the following code:
# set up a dictionary in the form  Ports : index
#Ports_dict = { name : i for i, name in Ports }              
# Convert all Embark strings to int
#train_df.Embarked = train_df.Embarked.map( lambda x: Ports_dict[x]).astype(int)     

In [12]:
#drop the data that we won't be using
full_data = full_data.drop(['name', 'sex', 'ticket', 'cabin', 'embarked', 'boat', 'home.dest'], axis =1)

In [13]:
full_data.dtypes

survived        int64
age           float64
sibsp           int64
parch           int64
pclass          int64
fare          float64
body          float64
Gender          int64
embarkment      int64
dtype: object

In [14]:
#drop the data that we won't be using
full_data = full_data.drop(['body'], axis=1)

In [15]:
full_data.dtypes

survived        int64
age           float64
sibsp           int64
parch           int64
pclass          int64
fare          float64
Gender          int64
embarkment      int64
dtype: object

In [16]:
full_data.sibsp.isnull().any().any()
full_data.parch.isnull().any().any()
full_data.pclass.isnull().any().any()
full_data.fare.isnull().any().any()

True

In [17]:
full_data

Unnamed: 0,survived,age,sibsp,parch,pclass,fare,Gender,embarkment
0,1,29.0000,0,0,1,211.3375,0,2
1,1,0.9167,1,2,1,151.5500,1,2
2,0,2.0000,1,2,1,151.5500,0,2
3,0,30.0000,1,2,1,151.5500,1,2
4,0,25.0000,1,2,1,151.5500,0,2
5,1,48.0000,0,0,1,26.5500,1,2
6,1,63.0000,1,0,1,77.9583,0,2
7,0,39.0000,0,0,1,0.0000,1,2
8,1,53.0000,2,0,1,51.4792,0,2
9,0,71.0000,0,0,1,49.5042,1,0


In [18]:
#replace the null fare values with the median of that class
if len(full_data.fare[ full_data.fare.isnull() ]) > 0:
#creates a array of 3 zeros
    median_fare = np.zeros(3)
#defines the median for each class    
    for i in range(0,3):
        median_fare[i] = full_data[ full_data.pclass == i+1 ]['fare'].dropna().median()
#for null values, assigns the median value for that particular class
    for i in range(0,3):
        full_data.loc[ (full_data.fare.isnull()) & (full_data.pclass == i+1), 'fare'] = median_fare[i]

In [19]:
full_data.fare.isnull().any().any()

False

In [20]:
#split the training and test data at 70% and 30%, respectively
from sklearn.cross_validation import train_test_split

data_train, data_test = train_test_split(full_data, test_size=0.30)

In [21]:
data_train

Unnamed: 0,survived,age,sibsp,parch,pclass,fare,Gender,embarkment
384,0,28.0000,0,0,2,0.0000,1,2
968,0,36.0000,1,0,3,15.5500,1,2
1284,0,28.0000,0,0,3,8.0500,1,2
749,0,28.0000,1,1,3,14.4000,0,2
814,0,25.0000,0,0,3,7.7417,1,1
0,1,29.0000,0,0,1,211.3375,0,2
271,1,24.0000,1,0,1,82.2667,1,2
1120,1,25.0000,1,0,3,7.7750,1,2
1184,0,28.0000,2,0,3,21.6792,1,0
920,0,28.0000,0,0,3,7.7500,1,1


In [22]:
#create a list of the true values so that the model can be scored
true_values = data_test['survived']

In [23]:
true_values

49      1
260     1
955     0
917     1
1258    1
1210    0
228     0
1077    1
402     1
469     1
992     0
580     1
1103    0
589     1
97      1
700     0
306     0
1040    1
1016    0
404     0
17      1
368     0
111     1
1254    1
50      1
609     0
531     0
989     0
465     1
874     1
       ..
1010    0
471     1
387     1
328     0
1029    0
569     0
137     1
607     1
135     0
332     0
266     0
1176    0
75      0
521     1
526     1
22      1
857     1
1061    1
53      0
664     1
832     0
1096    0
660     1
342     1
379     1
629     0
172     0
645     1
94      1
321     0
Name: survived, dtype: int64

In [24]:
#convert to a numpy array
true_values1 = true_values.values

In [25]:
true_values1

array([1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0,

In [26]:
#drop the survived column from the test data
data_test = data_test.drop(['survived'], axis=1)

In [27]:
#convert the data back to numpy arrays
train_data = data_train.values
test_data = data_test.values

In [28]:
#create the forest variable with 100 trees
forest = RandomForestClassifier(n_estimators=100)
#fit the model with the training data
forest = forest.fit(train_data[0::,1::], train_data[0::,0] )

In [29]:
#predict survived/not survived using the test data
output = forest.predict(test_data).astype(int)

In [30]:
#evaluate the results of the model
model_results = forest.score(test_data, true_values1, sample_weight=None)
model_results

0.78371501272264632