In [15]:
import numpy as np
import pandas
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import RandomizedSearchCV

In [2]:
# Load the training set; 'train.csv' is resolved relative to the kernel's working directory.
data = pandas.read_csv('train.csv')

In [3]:
def convert_age_to_days(age_string):
    """Convert an age description such as '2 years' or '3 weeks' into days.

    Parameters
    ----------
    age_string : str or any
        Text of the form '<integer> <unit>', where the unit starts with
        d(ay), w(eek), m(onth) or y(ear) in any letter case. Anything that
        is not a string (for example the float NaN pandas uses for a missing
        cell) is handed back untouched so it can be dealt with downstream.

    Returns
    -------
    int, None, or the original non-string value
        The age in days, counting 7 days per week, 30 per month and 365 per
        year. None when the text is blank or the unit is not recognised.

    Raises
    ------
    ValueError
        If the text does not consist of exactly two whitespace-separated
        tokens, or the first token of a recognised unit is not an integer.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses, which would otherwise slip through unconverted.
    if not isinstance(age_string, str):
        return age_string
    # A blank cell carries no age information; report it as missing instead
    # of failing on the tuple unpacking below.
    if not age_string.strip():
        return None
    num, unit = age_string.split()
    days_per_unit = {
        'd': 1,    # not expected in the data, but handled just in case
        'w': 7,
        'm': 30,
        'y': 365,
    }
    # Only the first letter matters, so 'year'/'years'/'Years' all match.
    factor = days_per_unit.get(unit[0].lower())
    if factor is None:
        return None
    return int(num) * factor

In [4]:
def add_more_columns(data):
    """Add the derived feature columns to ``data`` in place.

    Reads the 'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Name',
    'Breed' and 'Color' columns and writes one new column per feature.
    Nothing is returned; the frame passed in is modified.

    'Sex' is True for Female, False for Male and None otherwise.
    'Sterilisation' is False when the first word of 'SexuponOutcome' is
    'Intact', True for any other first word, and None when the value is
    'Unknown' or not a string. The remaining columns are plain booleans,
    except 'AgeInDays', which is whatever ``convert_age_to_days`` yields.
    """
    def is_known_text(value):
        # True only for real strings other than the 'Unknown' placeholder.
        return value != 'Unknown' and type(value) == type('')

    def second_word(value):
        # e.g. 'Neutered Male' -> 'Male'
        if is_known_text(value):
            return value.split()[1]
        return None

    def female_flag(word):
        if word == 'Female':
            return True
        if word == 'Male':
            return False
        return None

    def sterilised_flag(value):
        if is_known_text(value):
            return not value.split()[0] == 'Intact'
        return None

    def lowered_contains(needle):
        # Case-insensitive substring predicate for columns without gaps.
        return lambda text: needle in text.lower()

    def name_contains(needle):
        # Same as above, but a missing name simply does not match.
        def check(name):
            if pandas.isnull(name):
                return False
            return needle in name.lower()
        return check

    name_keywords = (
        ('NameMax', 'max'),
        ('NameBella', 'bella'),
    )
    breed_keywords = (
        ('Domestic', 'domestic'),
        ('Shorthair', 'shorthair'),
        ('Longhair', 'longhair'),
        ('Siamese', 'siamese'),
        ('PitBull', 'pit bull'),
        ('Australian', 'australian'),
        ('Retriever', 'retriever'),
        ('Shepherd', 'shepherd'),
        ('Terrier', 'terrier'),
        ('Chihuahua', 'chihuahua'),
    )
    color_keywords = (
        ('White', 'white'),
        ('Tabby', 'tabby'),
        ('Tiger', 'tiger'),
        ('Blue', 'blue'),
        ('Brown', 'brown'),
        ('Orange', 'orange'),
        ('Red', 'red'),
        ('Yellow', 'yellow'),
        ('Tan', 'tan'),
        ('Tricolor', 'tricolor'),
    )

    # species, sex, sterilisation, age
    data['Cat'] = data['AnimalType'].apply(lambda kind: kind.lower() == 'cat')
    data['Sex'] = data['SexuponOutcome'].apply(second_word).apply(female_flag)
    data['Sterilisation'] = data['SexuponOutcome'].apply(sterilised_flag)
    data['AgeInDays'] = data['AgeuponOutcome'].apply(convert_age_to_days)

    # names
    data['HasName'] = data['Name'].apply(lambda name: not pandas.isnull(name))
    for column, needle in name_keywords:
        data[column] = data['Name'].apply(name_contains(needle))

    # breeds
    data['Mix'] = data['Breed'].apply(lambda breed: breed.endswith('Mix'))
    for column, needle in breed_keywords:
        data[column] = data['Breed'].apply(lowered_contains(needle))

    # colors: Black1 = mentions black anywhere, Black2 = exactly black
    data['Black1'] = data['Color'].apply(lowered_contains('black'))
    data['Black2'] = data['Color'].apply(lambda color: color.lower() == 'black')
    for column, needle in color_keywords:
        data[column] = data['Color'].apply(lowered_contains(needle))
    data['2colors'] = data['Color'].apply(lambda color: '/' in color)

In [5]:
def select_columns(data):
    """Return the model's feature matrix as a NumPy array.

    Works on a copy of ``data`` so the caller's frame is left alone, keeps
    only the engineered feature columns in the fixed order listed below,
    and returns the underlying values (one row per input row).
    """
    feature_names = (
        # animal attributes
        'Cat', 'Sex', 'Sterilisation', 'AgeInDays',
        # name flags
        'HasName', 'NameMax', 'NameBella',
        # breed flags
        'Mix', 'Domestic', 'Shorthair', 'Longhair', 'Siamese',
        'PitBull', 'Australian', 'Retriever', 'Shepherd', 'Terrier', 'Chihuahua',
        # colour flags
        'Black1', 'Black2', 'White', 'Tabby', 'Tiger', 'Blue', 'Brown',
        'Orange', 'Red', 'Yellow', 'Tan', 'Tricolor', '2colors',
    )
    snapshot = data.copy()
    return pandas.DataFrame(snapshot, columns=feature_names).values

In [6]:
# add_more_columns mutates `data` in place (it returns nothing), so re-running
# this cell recomputes and overwrites the derived columns on the same frame.
add_more_columns(data)
# Feature matrix as a NumPy array, restricted to the engineered columns.
numeric_data = select_columns(data)

In [7]:
def enum_outcome(outcome):
    """Map an outcome label to its integer class code.

    Parameters
    ----------
    outcome : str
        One of 'Return_to_owner', 'Adoption', 'Transfer', 'Euthanasia'
        or 'Died'.

    Returns
    -------
    int
        4, 3, 2, 1 or 0 respectively.

    Raises
    ------
    ValueError
        If ``outcome`` is not one of the five known labels. ValueError is a
        subclass of Exception, so code catching Exception keeps working; the
        message names the offending value to make bad rows easy to find.
    """
    codes = {
        'Return_to_owner': 4,
        'Adoption': 3,
        'Transfer': 2,
        'Euthanasia': 1,
        'Died': 0,
    }
    try:
        return codes[outcome]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which are equally unknown.
        raise ValueError('Unknown outcome type: %r' % (outcome,)) from None

In [8]:
# Preview only: the feature matrix with missing entries replaced by 0.
# The result is displayed but not stored.
pandas.DataFrame(numeric_data).fillna(0).values

array([[False, False, True, ..., False, False, True],
       [True, True, True, ..., False, False, False],
       [False, False, True, ..., False, False, True],
       ..., 
       [False, False, True, ..., True, False, True],
       [True, False, False, ..., False, False, False],
       [True, False, False, ..., False, False, True]], dtype=object)

In [9]:
# Integer class code (0-4) per row; enum_outcome raises on any unrecognised OutcomeType.
labels = data['OutcomeType'].apply(enum_outcome)
# Replace missing entries with 0 so the estimators receive a fully populated matrix.
# NOTE(review): 0 compares equal to False, so an unknown Sex / Sterilisation becomes
# indistinguishable from Male / Intact, and an unknown age becomes 0 days -- confirm
# this is intended.
features = pandas.DataFrame(numeric_data).fillna(0).values

In [10]:
# Sanity check: (number of rows, 31), one column per name listed in select_columns.
features.shape

(26729, 31)

In [11]:
# Hold out 20% of the rows. train_test_split returns a train/test pair for each
# array in the order given, hence labels first and features second.
# NOTE: sklearn.cross_validation and sklearn.grid_search are legacy modules
# (removed in scikit-learn 0.20); newer versions provide the same names in
# sklearn.model_selection.
label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(labels, features, test_size=0.2, random_state=0)
rf_classifier = RandomForestClassifier(random_state=42)
params = {
    'max_depth': list(range(2,10)) + [None],   # None = grow trees without a depth limit
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 100, 500, 1000],
}
# Seed the search as well as the forest: the candidate parameter sets are drawn
# at random, so without a seed a Restart & Run All can report a different winner.
rscv = RandomizedSearchCV(rf_classifier, params, random_state=42)
rscv.fit(feat_train, label_train)

print(rscv.best_params_)
# Score of the refit best estimator on the training split, then on the held-out split.
print(rscv.score(feat_train, label_train), rscv.score(feat_test, label_test))

{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 1000}
0.652995370154 0.637111859334


In [14]:
# Repeat the random-forest search over 20 different train/test splits to see how
# stable the selected hyper-parameters and scores are.
# The search space does not depend on the split, so it is built once.
params = {
    'max_depth': list(range(2,10)) + [None],   # None = grow trees without a depth limit
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 100, 500, 1000, 2000, 5000],
}
for i in range(20):
    # The same seed i drives the split, the forest and the parameter sampling,
    # so each iteration can be reproduced on its own.
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(labels, features, test_size=0.2, random_state=i)
    # n_jobs=7 is a hard-coded worker count; adjust it to the machine at hand.
    rf_classifier = RandomForestClassifier(random_state=i, n_jobs=7)
    # Without random_state the sampled candidates differ between runs.
    rscv = RandomizedSearchCV(rf_classifier, params, random_state=i)
    rscv.fit(feat_train, label_train)

    print(rscv.best_params_)
    # Score on the training split, then on the held-out split.
    print(rscv.score(feat_train, label_train), rscv.score(feat_test, label_test))

{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 2000}
0.651779450966 0.636737747849
{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 5000}
0.639526726839 0.635241301908
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 1000}
0.652013281579 0.635241301908
{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 500}
0.646354580742 0.640852974186
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 2000}
0.651592386475 0.634493078938
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 100}
0.654959547304 0.632996632997
{'criterion': 'gini', 'max_depth': 7, 'n_estimators': 1000}
0.629518776598 0.634118967452
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 100}
0.651218257494 0.639543583988
{'criterion': 'gini', 'max_depth': 9, 'n_estimators': 100}
0.653509797503 0.640478862701
{'criterion': 'entropy', 'max_depth': 9, 'n_estimators': 2000}
0.653322733012 0.636924803591
{'criterion': 'entropy', 'max_depth': 8, 'n_estimators': 500}
0.640274984801 0.6367

In [19]:
# Same experiment with a single decision tree: 20 train/test splits, one
# randomized hyper-parameter search per split.
# The search space does not depend on the split, so it is built once.
params = {
    'max_depth': list(range(2,100)) + [None],   # None = grow the tree without a depth limit
    'criterion': ['gini', 'entropy'],
}
for i in range(20):
    # The same seed i drives the split, the tree and the parameter sampling,
    # so each iteration can be reproduced on its own.
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(labels, features, test_size=0.2, random_state=i)
    tree_classifier = DecisionTreeClassifier(random_state=i)
    # Only a random subset of the depth/criterion combinations is tried, so an
    # unseeded search can pick a different depth on every run.
    rscv = RandomizedSearchCV(tree_classifier, params, random_state=i)
    rscv.fit(feat_train, label_train)

    print(rscv.best_params_)
    # Score on the training split, then on the held-out split; a large gap
    # between the two indicates over-fitting.
    print(rscv.score(feat_train, label_train), rscv.score(feat_test, label_test))

{'criterion': 'gini', 'max_depth': 7}
0.652948604031 0.648335203891
{'criterion': 'entropy', 'max_depth': 3}
0.608380489174 0.61316872428
{'criterion': 'gini', 'max_depth': 10}
0.674741617173 0.62962962963
{'criterion': 'gini', 'max_depth': 5}
0.645325726044 0.63898241676
{'criterion': 'gini', 'max_depth': 9}
0.664827199177 0.631313131313
{'criterion': 'entropy', 'max_depth': 18}
0.773932563251 0.588290310513
{'criterion': 'gini', 'max_depth': 15}
0.733853996165 0.619528619529
{'criterion': 'entropy', 'max_depth': 14}
0.721554505916 0.614852225963
{'criterion': 'gini', 'max_depth': 7}
0.650703830145 0.648335203891
{'criterion': 'gini', 'max_depth': 25}
0.810690735631 0.58024691358
{'criterion': 'gini', 'max_depth': 51}
0.815975307487 0.559857837636
{'criterion': 'gini', 'max_depth': 12}
0.696955525417 0.620463898242
{'criterion': 'gini', 'max_depth': 10}
0.675676939625 0.632061354284
{'criterion': 'entropy', 'max_depth': 4}
0.630033203947 0.630564908343
{'criterion': 'gini', 'max_depth