Random Forest using Kaggle Animal Shelter Dataset
https://www.kaggle.com/c/shelter-animal-outcomes

In [1]:
import pandas as pd
import os
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split

In [2]:
data_dir = '/Users/christopherallison/Documents/Coding/Data'

win_data_dir = u'C:\\Users\\Owner\\Documents\\Data'

In [3]:
# Data cleanup
# TRAIN DATA
train_df = pd.read_csv(os.path.join(win_data_dir, 'shelter_train.csv'), header=0)        # Load the train file into a dataframe

In [4]:
train_df.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

In [5]:
outcomes = train_df.OutcomeType.unique()

In [6]:
from sklearn import preprocessing

In [7]:
encoder = preprocessing.LabelEncoder()

In [8]:
encoder.fit(outcomes)

LabelEncoder()

In [9]:
encoded_y = encoder.transform(outcomes)

In [10]:
encoded_y

array([3, 2, 0, 4, 1], dtype=int64)

In [11]:
list(encoder.inverse_transform([1, 2, 3]))

['Died', 'Euthanasia', 'Return_to_owner']

In [12]:
# convert integers into dummy variables (one hot encoding)
from keras.utils import np_utils

dummy_y = np_utils.to_categorical(encoded_y)

Using Theano backend.


In [13]:
dummy_y

array([[ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.],
       [ 0.,  1.,  0.,  0.,  0.]])

In [14]:
# Swap the outcome labels stored in the frame for their integer codes.

train_df['OutcomeType'] = encoder.transform(train_df['OutcomeType'])

In [15]:
train_df.head(50)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,3,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,2,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,0,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,4,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,4,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
5,A677334,Elsa,2014-04-25 13:04:00,4,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
6,A699218,Jimmy,2015-03-28 13:11:00,4,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
7,A701489,,2015-04-30 17:02:00,4,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
8,A671784,Lucy,2014-02-04 17:17:00,0,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
9,A677747,,2014-05-03 07:48:00,0,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


In [16]:
train_df.OutcomeSubtype.unique()

array([nan, 'Suffering', 'Foster', 'Partner', 'Offsite', 'SCRP',
       'Aggressive', 'Behavior', 'Rabies Risk', 'Medical', 'In Kennel',
       'In Foster', 'Barn', 'Court/Investigation', 'Enroute', 'At Vet',
       'In Surgery'], dtype=object)

In [17]:
# Should convert all of these using LabelEncoder

# Number every distinct subtype (NaN included) in order of first appearance.
outcome_sub = {subtype: code for code, subtype in enumerate(train_df.OutcomeSubtype.unique())}

In [18]:
train_df['OutcomeSubtype'] = train_df.OutcomeSubtype.map( outcome_sub ).astype(int)

In [19]:
train_df.AnimalType.unique()

array(['Dog', 'Cat'], dtype=object)

In [20]:
train_df['AnimalType'] = train_df.AnimalType.map( {'Dog': 0, 'Cat': 1} ).astype(int)

In [21]:
train_df.SexuponOutcome.unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown', nan], dtype=object)

In [22]:
# Number every distinct sex / neuter status (NaN included) in order of first appearance.
sex_outcome = {status: code for code, status in enumerate(train_df.SexuponOutcome.unique())}

In [23]:
sex_outcome

{'Unknown': 4,
 'Intact Female': 3,
 nan: 5,
 'Intact Male': 2,
 'Spayed Female': 1,
 'Neutered Male': 0}

In [24]:
train_df['SexuponOutcome'] = train_df.SexuponOutcome.map( sex_outcome ).astype(int)

In [25]:
train_df.AgeuponOutcome.unique()

array(['1 year', '2 years', '3 weeks', '1 month', '5 months', '4 years',
       '3 months', '2 weeks', '2 months', '10 months', '6 months',
       '5 years', '7 years', '3 years', '4 months', '12 years', '9 years',
       '6 years', '1 weeks', '11 years', '4 weeks', '7 months', '8 years',
       '11 months', '4 days', '9 months', '8 months', '15 years',
       '10 years', '1 week', '0 years', '14 years', '3 days', '6 days',
       '5 days', '5 weeks', '2 days', '16 years', '1 day', '13 years', nan,
       '17 years', '18 years', '19 years', '20 years'], dtype=object)

In [26]:
# Approximate number of days in each age unit, keyed by the singular unit name.
period_dict = {'year': 365, 'month': 30, 'week': 7, 'day': 1}

def convert_age_to_days(age, default=30):
    """Convert an age string such as '2 years' or '3 weeks' to a number of days.

    Parameters
    ----------
    age : object
        Expected to look like '<integer> <unit>', where the unit is a key of
        ``period_dict`` with an optional trailing 's'. Anything else, including
        NaN for a missing value, is treated as unparseable.
    default : int, optional
        Value returned for unparseable input. Defaults to 30, the fallback this
        function has always used.

    Returns
    -------
    int
        The age in days, or ``default`` when ``age`` cannot be parsed.
    """
    try:
        number, period = str(age).split()
        # 'years' -> 'year', 'weeks' -> 'week', ...
        period = period.rstrip("s")
        return int(number) * period_dict[period]
    except (ValueError, KeyError):
        # ValueError: wrong number of tokens (e.g. 'nan') or a non-integer count.
        # KeyError: a unit that is not listed in period_dict.
        return default
    

In [27]:
train_df['AgeuponOutcome'] = train_df.AgeuponOutcome.apply(convert_age_to_days)

In [28]:
train_df.AgeuponOutcome[:10]

0    365
1    365
2    730
3     21
4    730
5     30
6     21
7     21
8    150
9    365
Name: AgeuponOutcome, dtype: int64

In [29]:
train_df.Breed.unique()

array(['Shetland Sheepdog Mix', 'Domestic Shorthair Mix', 'Pit Bull Mix',
       ..., 'Vizsla/Boxer', 'German Shepherd/Australian Kelpie',
       'Boxer/German Shepherd'], dtype=object)

In [30]:
# Could do something with a countvectorizer for Breed & Color, but we'll stick to simple here

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

TypeError: no supported conversion for types: (dtype('O'),)

In [None]:
colour_counts = count_vect.fit_transform(train_df.Color)

In [None]:
count_vect.vocabulary_.get(u'black')

In [None]:
# NOTE(review): fit_transform returns a sparse document-term matrix (one row per
# animal, one column per token), not one scalar per row. Storing it in a single
# column is presumably what produced the TypeError recorded earlier in this
# notebook -- confirm. `colour_counts` above already holds the result of the same call.
train_df['ColorVect'] = count_vect.fit_transform(train_df.Color)

In [None]:
# Breed gets its own vectorizer: refitting count_vect here would overwrite the
# vocabulary it learned from Color.
breed_vect = CountVectorizer()
# fit_transform returns a sparse document-term matrix (one row per animal, one
# column per token). It is not a per-row scalar, so it does not fit in a single
# DataFrame column; keep it in its own variable, as colour_counts does for Color.
breed_counts = breed_vect.fit_transform(train_df.Breed)

In [None]:
'''breed = {}

for i, b in enumerate(train_df.Breed.unique()):
    breed[b] = i
    
train_df['Breed'] = train_df.Breed.map( breed ).astype(int)'''

In [None]:
'''train_df.Color.unique()[:10]'''

In [None]:
'''color = {}

for i, c in enumerate(train_df.Color.unique()):
    color[c] = i
    
train_df['Color'] = train_df.Color.map( color ).astype(int)'''

In [None]:
# We could also do something very cool with text analysis for names, but we'll keep it simple and go length
# NOTE: a missing name is NaN, and str(NaN) is 'nan', so unnamed animals get
# length 3 here rather than 0.
# NOTE: the column is overwritten in place, so running this cell a second time
# measures the digit count of the previous result. Run it once per load.

train_df['Name'] = train_df.Name.apply(lambda x: len(str(x)))

In [None]:
# Preview only: the interpolated frame is displayed but not assigned, so
# train_df itself is left unchanged by this cell.
train_df.interpolate().head() # could have used dropna().median()

In [None]:
# Save our clean dataframe to csv so we don't need to do this again.
train_df.to_csv(os.path.join(win_data_dir, "clean_train.csv"))

In [None]:
# Work out a test_train_split to see how we do
# OutcomeSubtype is dropped together with the target: it describes the outcome
# itself (e.g. 'Suffering', 'Foster'), so keeping it as a feature would leak the
# answer to the classifier.
X = train_df.drop(['AnimalID', 'DateTime', 'OutcomeType', 'OutcomeSubtype', 'Breed', 'Color'], axis=1)
X.dtypes

In [None]:
# NOTE(review): the replaced Series is displayed but not assigned, so X is left
# as it was. AgeuponOutcome was already converted to day counts by
# convert_age_to_days, so a literal 'NA' string is presumably no longer present -- confirm.
X.AgeuponOutcome.replace('NA',0)

In [None]:
train_features = X.values
train_features[:10]

In [None]:
train_target = train_df['OutcomeType'].values
train_target

In [None]:
# Hold out 40% of the rows for evaluation; the fixed seed keeps the split repeatable.

X_train, x_test, y_train, y_test = train_test_split(
    train_features, train_target, test_size=0.4, random_state=42
)

In [None]:
# Now we finally get to the classifier!
# A fixed random_state makes the fitted forest, and every score derived from it,
# repeatable from one run to the next.

clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
clf = clf.fit(X_train, y_train)
# This is accuracy on the same rows the forest was fitted on, so it is an
# optimistic figure; the label says so explicitly. Accuracy on the held-out
# rows (x_test / y_test) is the number that reflects generalisation.
score = clf.score(X_train, y_train)
"Mean training accuracy of Random Forest: {0}".format(score)

In [None]:
# predict() expects a 2-D (n_samples, n_features) array; slicing with [:1] keeps
# the first test row as a one-row matrix instead of a 1-D vector.
clf.predict(x_test[:1])

In [None]:
def reverse_dict_search(d, target):
    """Return the first key of ``d`` whose value equals ``target``, or None if no value matches."""
    matching_keys = (key for key, value in d.items() if value == target)
    return next(matching_keys, None)

In [None]:
def predict_outcome(animal):
    """Print one encoded feature row in readable form, followed by the model's prediction.

    ``animal`` is unpacked into exactly six values, in this order: name length,
    animal type code, sex code, age in days, breed code, colour code.

    NOTE(review): this relies on the notebook-level names ``sex_outcome``,
    ``breed``, ``color``, ``encoder`` and ``clf``. In the visible notebook the
    cells that build ``breed`` and ``color`` are wrapped in string literals, so
    those two mappings must be defined before this is called -- confirm.
    """
    animaltype = {'Dog': 0,'Cat': 1}

    name, animal_class, sex, age, animal_breed, animal_color = animal
    print("Name length:{}".format(name))
    print("Animal Type:{}".format(reverse_dict_search(animaltype, animal_class)))
    print("Sex:{}".format(reverse_dict_search(sex_outcome, sex)))
    print("Age:{} days".format(age))
    print("Breed:{}".format(reverse_dict_search(breed, animal_breed)))
    print("Color:{}".format(reverse_dict_search(color, animal_color)))
    #print("Outcome Subtype: {}".format(reverse_dict_search(outcome_sub, outcome_subtype)))
    print("************")
    # predict() expects a 2-D (n_samples, n_features) array, so the single row
    # is reshaped into a one-row matrix before it is handed to the classifier.
    sample = np.asarray(animal).reshape(1, -1)
    print("Prediction: {}\n".format(encoder.inverse_transform(clf.predict(sample))))

In [None]:
predict_outcome(x_test[4])

In [None]:
# Evaluate the model
print (X_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

In [None]:
clf = clf.fit(X_train, y_train)
y_predict = clf.predict(x_test)

from sklearn.metrics import accuracy_score
from sklearn import metrics
print ("Accuracy = %.2f" % (accuracy_score(y_test, y_predict)))

In [None]:
model_score = clf.score(x_test, y_test)
print ("Model Score %.2f \n" % (model_score))

confusion_matrix = metrics.confusion_matrix(y_test, y_predict)
print ("Confusion Matrix \n", confusion_matrix)

# Draw the whole matrix as a text table: one row and one column per class.
# A fixed 2x2 layout would show only the top-left corner of a multi-class matrix.
n_classes = confusion_matrix.shape[0]
rule = "|" + "-----|" * n_classes
print ("          Predicted")
print ("         |" + "".join("%3d  |" % j for j in range(n_classes)))
print ("         " + rule)
for i, row in enumerate(confusion_matrix):
    print ("%8d |" % i + "".join(" %3d |" % count for count in row))
    # The "Actual" margin label sits on the rule directly under the first row.
    margin = "Actual   " if i == 0 else "         "
    print (margin + rule)

In [None]:
# Per-class precision / recall / F1 on the held-out rows.

# target_names must follow the order of the integer labels. LabelEncoder numbers
# the classes in sorted order (Adoption=0, Died=1, Euthanasia=2,
# Return_to_owner=3, Transfer=4), which is not the order in which the outcomes
# first appear in the data, so the names are taken from the fitted encoder.
categories = list(encoder.classes_)

from sklearn.metrics import classification_report
print(classification_report(y_test, 
                            y_predict, 
                            target_names=categories))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw ``cm`` as a colour-mapped image on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        The matrix to display; passed straight to ``plt.imshow``.
    target_names : sequence of str
        Tick labels for both axes, one per row/column of ``cm``.
    title : str, optional
        Figure title.
    cmap : matplotlib colormap, optional
        Colour map used for the image.
    """
    # Simple plot based on the Iris sample CM
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout() only makes room for artists that already exist, so it is
    # called last, after the axis labels have been added.
    plt.tight_layout()

In [None]:
plot_confusion_matrix(confusion_matrix, categories, title="Animal Sanctuary Confusion Matrix")

In [None]:
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
            axis=0)
indices = np.argsort(importances)[::-1]


In [None]:
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()

In [None]:
# List the feature names in decreasing order of importance, using the ranking
# computed from the fitted forest (`indices`) rather than a hand-copied list
# that goes stale whenever the feature columns change.
for i in indices:
    print("Feature {}: {}".format(i, X.columns[i]))

In [None]:
# Now we do the same thing with the real test data... but - it's in a different format, so we pass for now.

In [None]:
# TEST DATA
# NOTE(review): this reads from data_dir, whereas the training file is read from
# (and the cleaned copy written to) win_data_dir -- confirm which directory is intended.
test_df = pd.read_csv(os.path.join(data_dir, 'shelter_test.csv'), header=0)        # Load the test file into a dataframe

In [None]:
test_df.head()

In [None]:
# Clean test_df

test_df['Name'] = test_df.Name.apply(lambda x: len(str(x)))

In [None]:
test_df['AnimalType'] = test_df.AnimalType.map( {'Dog': 0, 'Cat': 1} ).astype(int)

In [None]:
test_df['AgeuponOutcome'] = test_df.AgeuponOutcome.apply(convert_age_to_days)

In [None]:
test_df['SexuponOutcome'] = test_df.SexuponOutcome.map( sex_outcome ).astype(int)

In [None]:
# NOTE(review): in the visible notebook `breed` is only built inside a
# string-quoted (disabled) cell, so this line needs it defined elsewhere -- confirm.
# Any breed missing from the mapping becomes NaN, which astype(int) cannot convert.
test_df['Breed'] = test_df.Breed.map( breed ).astype(int)

In [None]:
# NOTE(review): in the visible notebook `color` is only built inside a
# string-quoted (disabled) cell, so this line needs it defined elsewhere -- confirm.
# Any colour missing from the mapping becomes NaN, which astype(int) cannot convert.
test_df['Color'] = test_df.Color.map( color ).astype(int)

In [None]:
test_df.head()

In [None]:
# The data is now ready to go. So lets fit to the train, then predict to the test!
# Convert back to a numpy array
train_data = train_df.values
test_data = test_df.values

In [None]:
print('Training...')
forest = RandomForestClassifier(n_estimators=100)
# Fit on the numeric feature matrix and encoded target prepared earlier
# (train_features / train_target). Slicing train_df.values would use column 0
# (AnimalID) as the target and feed the remaining raw columns, strings
# included, to the forest.
forest = forest.fit(train_features, train_target)

In [None]:
print('Predicting...')
# Select from test_df the columns of the training feature frame X, in the same
# order, instead of passing every column of test_df to the model.
output = forest.predict(test_df[X.columns].values).astype(int)

In [None]:
# NOTE(review): `ids` is not defined anywhere in the visible notebook, and the
# header names below look carried over from a different dataset -- confirm both
# before relying on this file.
# On Python 3 the csv module needs a text-mode file opened with newline="";
# the with-block closes the file even if one of the writes fails.
with open("myfirstforest.csv", "w", newline="") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId","Survived"])
    open_file_object.writerows(zip(ids, output))
print('Done.')