In [4]:
import numpy as np
import pandas
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
#import theanets
#from keras.models import Sequential
#from keras.layers import Dense, Activation
#from keras.wrappers.scikit_learn import KerasClassifier

# Adding more features and converting them to boolean or numeric

In [9]:
data = pandas.read_csv('train.csv')

In [10]:
def convert_age_to_days(age_string):
    if type(age_string) != type(''):
        return age_string
    num, unit = age_string.split()
    if unit[0] == 'd':   # I don't expect it, but just in case
        return int(num)
    if unit[0] == 'w':
        return int(num) * 7
    if unit[0] == 'm':
        return int(num) * 30
    if unit[0] == 'y':
        return int(num) * 365

In [11]:
def add_more_columns(data):
    data['Cat'] = data['AnimalType'].apply(lambda x: x.lower() == 'cat')
    data['Sex'] = data['SexuponOutcome'].apply(lambda x: x.split()[1] if (x != 'Unknown' and type(x) == type('')) else None)
    data['Sex'] = data['Sex'].apply(lambda x: True if x == 'Female' else (False if x == 'Male' else None))
    data['Sterilisation'] = data['SexuponOutcome'].apply(lambda x: not x.split()[0]=='Intact' if (x != 'Unknown' and type(x) == type('')) else None)
    data['AgeInDays'] = data['AgeuponOutcome'].apply(convert_age_to_days)
    data['Mix'] = data['Breed'].apply(lambda x: x.endswith('Mix'))
    data['HasName'] = data['Name'].apply(lambda x: not pandas.isnull(x))
    data['Black1'] = data['Color'].apply(lambda x: 'black' in x.lower())
    data['Black2'] = data['Color'].apply(lambda x: x.lower() == 'black')

In [12]:
add_more_columns(data)

In [13]:
data.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color',
       'Cat', 'Sex', 'Sterilisation', 'AgeInDays', 'Mix', 'HasName', 'Black1',
       'Black2'],
      dtype='object')

In [14]:
def select_columns(data):
    result = data.copy()
    result = pandas.DataFrame(result, columns=('Cat', 'Sex', 'Sterilisation', 'AgeInDays', 'Mix', 'HasName', 'Black1', 'Black2'))
    return result.values

In [15]:
filtered_data = select_columns(data)

In [16]:
filtered_data

array([[False, False, True, ..., True, False, False],
       [True, True, True, ..., True, False, False],
       [False, False, True, ..., True, False, False],
       ..., 
       [False, False, True, ..., True, False, False],
       [True, False, False, ..., False, True, True],
       [True, False, False, ..., False, False, False]], dtype=object)

In [17]:
data.groupby('OutcomeType').count()

Unnamed: 0_level_0,AnimalID,Name,DateTime,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Cat,Sex,Sterilisation,AgeInDays,Mix,HasName,Black1,Black2
OutcomeType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Adoption,10769,9091,10769,1966,10769,10769,10769,10769,10769,10769,10769,10769,10769,10769,10769,10769,10769
Died,197,77,197,181,197,197,197,197,197,197,172,172,197,197,197,197,197
Euthanasia,1555,740,1555,1554,1555,1555,1553,1555,1555,1555,1454,1454,1553,1555,1555,1555,1555
Return_to_owner,4786,4633,4786,0,4786,4785,4786,4786,4786,4786,4773,4773,4786,4786,4786,4786,4786
Transfer,9422,4497,9422,9416,9422,9422,9406,9422,9422,9422,8467,8467,9406,9422,9422,9422,9422


In [18]:
def enum_outcome(outcome):
    if outcome == 'Return_to_owner':
        return 4
    if outcome == 'Adoption':
        return 3
    if outcome == 'Transfer':
        return 2
    if outcome == 'Euthanasia':
        return 1
    if outcome == 'Died':
        return 0
    raise Exception

In [19]:
enum_outcomes = data['OutcomeType'].apply(enum_outcome).values

In [20]:
enum_outcomes

array([4, 1, 3, ..., 3, 2, 2])

In [21]:
imp = sklearn.preprocessing.Imputer()
filtered_data = imp.fit_transform(filtered_data)

# Testing various predictors

In [22]:
def very_stupid_predictor(row):
    return 4 if (row[2] and row[5]) else 1      # sterilised and has a name

In [23]:
very_stupid_result = np.apply_along_axis(very_stupid_predictor, 1, filtered_data)
sum(very_stupid_result == enum_outcomes) / enum_outcomes.size

0.18766134161397732

In [24]:
for i in range(5):
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(enum_outcomes, filtered_data, test_size=0.2, random_state=i)
    rf_classifier = RandomForestClassifier(n_estimators=1000, n_jobs=12, criterion='entropy')
    rf_classifier.fit(feat_train, label_train)
    print(rf_classifier.score(feat_train, label_train), rf_classifier.score(feat_test, label_test))

0.672075948183 0.637111859334
0.672122714306 0.634493078938
0.671000327363 0.639169472503
0.671374456344 0.638047138047
0.673759528597 0.629816685372


In [25]:
for i in range(5):
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(enum_outcomes, filtered_data, test_size=0.2, random_state=i)
    rf_classifier = RandomForestClassifier(n_estimators=1000, n_jobs=12, criterion='gini')
    rf_classifier.fit(feat_train, label_train)
    print(rf_classifier.score(feat_train, label_train), rf_classifier.score(feat_test, label_test))

0.672075948183 0.635241301908
0.672122714306 0.635054246165
0.671000327363 0.63823419379
0.671374456344 0.635615413393
0.673759528597 0.627572016461


In [99]:
scaler = sklearn.preprocessing.StandardScaler()
filtered_and_scaled_data = scaler.fit_transform(filtered_data)
filtered_and_scaled_data

array([[-0.84495371, -0.98306611,  0.62804797, ...,  0.6355954 ,
        -0.65496275, -0.30625512],
       [ 1.18349678,  1.06063673,  0.62804797, ...,  0.6355954 ,
        -0.65496275, -0.30625512],
       [-0.84495371, -0.98306611,  0.62804797, ...,  0.6355954 ,
        -0.65496275, -0.30625512],
       ..., 
       [-0.84495371, -0.98306611,  0.62804797, ...,  0.6355954 ,
        -0.65496275, -0.30625512],
       [ 1.18349678, -0.98306611, -1.66018535, ..., -1.57332794,
         1.52680438,  3.2652515 ],
       [ 1.18349678, -0.98306611, -1.66018535, ..., -1.57332794,
        -0.65496275, -0.30625512]])

In [100]:
for i in range(5):
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(enum_outcomes, filtered_and_scaled_data, test_size=0.2, random_state=i)
    svc_classifier = GridSearchCV(SVC(random_state=42, max_iter=100), {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C':[1, 10]})
    svc_classifier.fit(feat_train, label_train)
    print(svc_classifier.score(feat_train, label_train), svc_classifier.score(feat_test, label_test))



0.324790721601 0.325102880658




0.391666276949 0.392255892256




0.375298134032 0.373363262252




0.491231352009 0.491021324355




0.389327970818 0.391507669285


In [108]:
nn_experiment = theanets.main.Experiment(theanets.Classifier, layers=(2, 3, 1))

In [104]:
or i in range(5):
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(enum_outcomes, filtered_and_scaled_data, test_size=0.2, random_state=i)
    nn_experiment.train()

In [109]:
nn_experiment.train?

In [130]:
def create_keras_model():
    model = Sequential()
    model.add(Dense(8, input_dim=8))
    model.add(Dense(8))
    #model.add(Dense(8))
    #model.add(Dense(64))
    #model.add(Dense(64))
    model.add(Dense(5))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
    return model

In [29]:
for i in range(5):
    label_train, label_test, feat_train, feat_test = sklearn.cross_validation.train_test_split(enum_outcomes, filtered_data, test_size=0.2, random_state=i)
    nn_classifier = KerasClassifier(build_fn=create_keras_model)
    nn_classifier.fit(feat_train, label_train)
    print(nn_classifier.score(feat_train, label_train), nn_classifier.score(feat_test, label_test))

Exception: A target array with shape (21383, 5) was passed for an output of shape (None, 64) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.

In [65]:
label_train.shape, feat_train.shape

((21383,), (21383, 8))

In [114]:

#nn_classifier = create_keras_model()

In [131]:
nn_classifier = KerasClassifier(build_fn=create_keras_model)
nn_classifier.fit(feat_train, label_train)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa1a32a4eb8>

In [121]:
print(nn_classifier.score(feat_train, label_train), nn_classifier.score(feat_test, label_test))

0.352523032317 0.35241301908


In [86]:
feat_train

array([[ 1.,  1.,  1., ...,  1.,  0.,  0.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.],
       [ 1.,  0.,  0., ...,  1.,  1.,  1.],
       ..., 
       [ 1.,  1.,  0., ...,  1.,  1.,  1.],
       [ 1.,  1.,  1., ...,  1.,  0.,  0.],
       [ 1.,  1.,  0., ...,  1.,  0.,  0.]])

In [87]:
rf_classifier.fit(feat_train, label_train)
print(rf_classifier.score(feat_train, label_train), rf_classifier.score(feat_test, label_test))

0.672075948183 0.636363636364
