In [266]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
import pandas as pd
import random as rand

# Import the data
X = pd.read_csv("train.csv")
# Create test set
y = X.pop("Survived")

In [267]:
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,,0.0,0.0,7.9104
50%,446.0,3.0,,0.0,0.0,14.4542
75%,668.5,3.0,,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [268]:
# Impute Age
# After 'Sex', 'Age' is the most important variable for determining survival on the Titanic.
# This lead me to believe that imputing its missing values should be given some more careful consideration.
# Younger children back in the day were referred to as 'Master' and 'Miss'
# The ceiling age for people referred to in this way was about 17
# The floor age was <1yr old, so choosing 7 here for a low-end age for these naming conventions is arbitrary

# Seed the random number generator for consistent results
rand.seed(1)
idx = 0
for name in X["Name"]:
    if pd.isnull(X["Age"][idx]):
        if 'Master' in name:
            X["Age"][idx] = rand.randrange(7, 17)
        elif 'Miss' in name:
            X["Age"][idx] = rand.randrange(7, 17)
        else:
            X["Age"][idx] = 29 #29 is the mean age of all passengers            
    idx += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [269]:
# Get just the numeric variables by selecting only the variables that are not "object" datatypes.
numeric_variables = list(X.dtypes[X.dtypes != "object"].index)
X[numeric_variables].head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.25
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.925
3,4,1,35.0,1,0,53.1
4,5,3,35.0,0,0,8.05


In [270]:
# Here is a simple function to show descriptive stats on the categorical variables
def describe_categorical(X):
    """
    Just like .describe(), but returns the results for
    categorical variables only.
    """
    from IPython.display import display, HTML
    display(HTML(X[X.columns[X.dtypes == "object"]].describe().to_html()))

In [271]:
describe_categorical(X)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Graham, Mr. George Edward",male,CA. 2343,C23 C25 C27,S
freq,1,577,7,4,644


In [272]:
# Drop variables I won't be using
# I dropped the 'Cabin' variable as well because there were so many missing values and no real way to impute their values
X.drop(["Name", "Ticket", "PassengerId", "Cabin"], axis=1, inplace=True)

In [273]:
# One hot encode categorical attributes
categorical_variables = ['Sex', 'Embarked']

for variable in categorical_variables:
    # Fill missing data with the word "Missing"
    X[variable].fillna("Missing", inplace=True)
    # Create array of dummies
    dummies = pd.get_dummies(X[variable], prefix=variable)
    # Update X to include dummies and drop the main variable
    X = pd.concat([X, dummies], axis=1)
    X.drop([variable], axis=1, inplace=True)

## Parameter tests

Parameters to test

 * ###Parameters that will make your model better
  * <b>n_estimators</b>: The number of trees in the forest. Choose as high of a number as your computer can handle.
  * <b>max_features</b>: The number of features to consider when looking for the best split. Try ["auto", "None", "sqrt", "log2", 0.9, and 0.2]
  * <b>min_samples_leaf</b>: The minimum number of samples in newly created leaves.Try [1, 2, 3]. If 3 is the best, try higher numbers such as 1 through 10.
 * ###Parameters that will make it easier to train your model
  * <b>n_jobs</b>: Determines if multiple processors should be used to train and test the model. Always set this to -1 and %%timeit vs. if it is set to 1. It should be much faster (especially when many trees are trained).

### n_jobs

In [274]:
%%timeit
model = RandomForestRegressor(1000, oob_score=True, n_jobs=1, random_state=42)
model.fit(X, y)

1 loop, best of 3: 3.69 s per loop


In [275]:
%%timeit
model = RandomForestRegressor(1000, oob_score=True, n_jobs=-1, random_state=42)
model.fit(X, y)

1 loop, best of 3: 3.51 s per loop


### n_estimators

In [276]:
results = []
n_estimator_options = [30, 50, 100, 200, 500, 1000, 2000]

for trees in n_estimator_options:
    model = RandomForestRegressor(trees, oob_score=True, n_jobs=-1, random_state=42)
    model.fit(X, y)
    print (trees, "trees")
    roc = roc_auc_score(y, model.oob_prediction_)
    print ("C-stat: ", roc)
    results.append(roc)
    print ("")

(30, 'trees')
('C-stat: ', 0.84535146305350495)

(50, 'trees')
('C-stat: ', 0.85233119227942356)

(100, 'trees')
('C-stat: ', 0.85443762715836347)

(200, 'trees')
('C-stat: ', 0.85672514619883033)

(500, 'trees')
('C-stat: ', 0.85976096890678422)

(1000, 'trees')
('C-stat: ', 0.86151855047454706)

(2000, 'trees')
('C-stat: ', 0.86125757624175803)



### max_features

In [277]:
results = []
max_features_options = ["auto", None, "sqrt", "log2", 0.9, 0.2]

for max_features in max_features_options:
    model = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs=-1, random_state=42, max_features=max_features)
    model.fit(X, y)
    print (max_features, "option")
    roc = roc_auc_score(y, model.oob_prediction_)
    print ("C-stat: ", roc)
    results.append(roc)
    print ("")

('auto', 'option')
('C-stat: ', 0.86151855047454706)

(None, 'option')
('C-stat: ', 0.86151855047454706)

('sqrt', 'option')
('C-stat: ', 0.86069301973817369)

('log2', 'option')
('C-stat: ', 0.86069301973817369)

(0.9, 'option')
('C-stat: ', 0.86112708912536351)

(0.2, 'option')
('C-stat: ', 0.85859457386635984)



### min_samples_leaf

In [278]:
results = []
min_samples_leaf_options = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for min_samples in min_samples_leaf_options:
    model = RandomForestRegressor(n_estimators=1000, 
                                  oob_score=True, 
                                  n_jobs=-1, 
                                  random_state=42, 
                                  max_features="auto", 
                                  min_samples_leaf=min_samples)
    model.fit(X, y)
    print (min_samples, "min samples")
    roc = roc_auc_score(y, model.oob_prediction_)
    print ("C-stat: ", roc)
    results.append(roc)
    print ("")    

(1, 'min samples')
('C-stat: ', 0.86151855047454706)

(2, 'min samples')
('C-stat: ', 0.8683757815911971)

(3, 'min samples')
('C-stat: ', 0.87315587085503688)

(4, 'min samples')
('C-stat: ', 0.87507589556769871)

(5, 'min samples')
('C-stat: ', 0.87573898315917309)

(6, 'min samples')
('C-stat: ', 0.87531822878386023)

(7, 'min samples')
('C-stat: ', 0.87438617795247076)

(8, 'min samples')
('C-stat: ', 0.87343814910682893)

(9, 'min samples')
('C-stat: ', 0.87175513160557749)

(10, 'min samples')
('C-stat: ', 0.87062601859840849)



## Final model

In [279]:
model = RandomForestRegressor(n_estimators=1000, 
                              oob_score=True, 
                              n_jobs=-1, 
                              random_state=42, 
                              max_features="auto", 
                              min_samples_leaf=5)
model.fit(X, y)
roc = roc_auc_score(y, model.oob_prediction_)
print ("C-stat: ", roc)

('C-stat: ', 0.87573898315917309)


### In summary, all I did in addition to the lesson in order to achieve an AUC score >0.874 was to drop the 'Cabin' attribute from the model and use a more categorical system for predicting the age of those people whose 'Age' attribute was missing. Missing values in data are a bane and should be carefully filled in or removed completely in order to achieve a more accurate model.