In [24]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

from datetime import date, datetime

pd.set_option('display.max_columns', None)

## Data import

In [2]:
train_data = pd.read_csv('train.csv')
train_data.head(5)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [3]:
test_data = pd.read_csv('test.csv')
test_data.head(5)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


#### Extracting and cleaning labels

In [4]:
labels = train_data.loc[:, ['OutcomeType']]

labels.head(5)

Unnamed: 0,OutcomeType
0,Return_to_owner
1,Euthanasia
2,Adoption
3,Transfer
4,Transfer


## Extracting and cleaning features
#### Going to use the trick where the train and test features are concatenated for cleaning purposes

In [10]:
feature_columns = ['AnimalID', 'Name', 'DateTime', 'AnimalType', 'SexuponOutcome', 
                    'AgeuponOutcome', 'Breed', 'Color']

train_features = train_data.loc[:, feature_columns]
n_train_features = len(train_features.index)
print 'Train data count: {}'.format(n_train_features)

test_features = test_data.loc[:, feature_columns]
n_test_features = len(test_features.index)
print 'Test data count: {}'.format(n_test_features)

features = pd.concat((train_features, test_features), ignore_index=True)
print 'Total data count: {}'.format(len(features.index))

Train data count: 26729
Test data count: 11456
Total data count: 38185


#### Whether or not the animal has a name

In [11]:
features['HasName'] = features.Name.notnull().astype(float)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName
0,A671945,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1
1,A656520,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1
2,A686464,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,1
3,A683430,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0
4,A667013,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0


#### Whether the animal is a dog or a cat. (Default is cat)

In [12]:
features['IsDog'] = (features.AnimalType == 'Dog').astype(float)
features.drop('AnimalType', axis=1, inplace=True)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,IsDog
0,A671945,Hambone,2014-02-12 18:22:00,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1,1
1,A656520,Emily,2013-10-13 12:44:00,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,0
2,A686464,Pearce,2015-01-31 12:28:00,Neutered Male,2 years,Pit Bull Mix,Blue/White,1,1
3,A683430,,2014-07-11 19:09:00,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,0
4,A667013,,2013-11-15 12:52:00,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,1


#### Sex and spay/neuter status

In [13]:
# Cleaning sex
# When sex is unknown, code "None" for both Sex and Fixed values. 
# This will get handled later when dummy-coding variables. The default
# will be the "unknown" case and there will be separate features for
# both male/female and fixed/intact (for a total of 4 dummy variables).
features.loc[(features.SexuponOutcome == 'Unknown'), 'SexuponOutcome'] = None
features['Sex'] = features.SexuponOutcome.str.split(' ').str[1]
features['Fixed'] = features.SexuponOutcome.str.split(' ').str[0]
features.loc[(features.Fixed.str.contains('Neutered|Spayed',na=False)), 'Fixed'] = 'Fixed'
features.drop('SexuponOutcome', axis=1, inplace=True)
features.head(10)

Unnamed: 0,AnimalID,Name,DateTime,AgeuponOutcome,Breed,Color,HasName,IsDog,Sex,Fixed
0,A671945,Hambone,2014-02-12 18:22:00,1 year,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed
1,A656520,Emily,2013-10-13 12:44:00,1 year,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed
2,A686464,Pearce,2015-01-31 12:28:00,2 years,Pit Bull Mix,Blue/White,1,1,Male,Fixed
3,A683430,,2014-07-11 19:09:00,3 weeks,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact
4,A667013,,2013-11-15 12:52:00,2 years,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed
5,A677334,Elsa,2014-04-25 13:04:00,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact
6,A699218,Jimmy,2015-03-28 13:11:00,3 weeks,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact
7,A701489,,2015-04-30 17:02:00,3 weeks,Domestic Shorthair Mix,Brown Tabby,0,0,,
8,A671784,Lucy,2014-02-04 17:17:00,5 months,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed
9,A677747,,2014-05-03 07:48:00,1 year,Cairn Terrier,White,0,1,Female,Fixed


#### Age - convert to continuous # of years

In [14]:
# Cleaning age
# Unknown values left as NaN
# Otherwise, age is converted to years.  (Note precision loss as unit reported grows)
features['AgeNum'] = pd.to_numeric(features['AgeuponOutcome'].str.split(' ').str[0])
features['AgeUnit'] = features['AgeuponOutcome'].str.split(' ').str[1]
features.loc[(features.AgeUnit.str.contains('year', na=False)), 'Age'] = features.AgeNum / 1.0
features.loc[(features.AgeUnit.str.contains('month', na=False)), 'Age'] = features.AgeNum / 12.0
features.loc[(features.AgeUnit.str.contains('week', na=False)), 'Age'] = features.AgeNum / (365.25 / 7)
features.loc[(features.AgeUnit.str.contains('day', na=False)), 'Age'] = features.AgeNum / 365.25
features.drop(['AgeuponOutcome', 'AgeNum', 'AgeUnit'], axis=1, inplace=True)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,Breed,Color,HasName,IsDog,Sex,Fixed,Age
0,A671945,Hambone,2014-02-12 18:22:00,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0
1,A656520,Emily,2013-10-13 12:44:00,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0
2,A686464,Pearce,2015-01-31 12:28:00,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0
3,A683430,,2014-07-11 19:09:00,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495
4,A667013,,2013-11-15 12:52:00,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0


#### Breed - for now just flagging mixes.

In [15]:
# Cleaning breed
# First just extracting whether it's a mix or not
features['IsMix'] = features.Breed.str.contains('Mix|/').astype(float)
features.head(10)

Unnamed: 0,AnimalID,Name,DateTime,Breed,Color,HasName,IsDog,Sex,Fixed,Age,IsMix
0,A671945,Hambone,2014-02-12 18:22:00,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0,1
1,A656520,Emily,2013-10-13 12:44:00,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0,1
2,A686464,Pearce,2015-01-31 12:28:00,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0,1
3,A683430,,2014-07-11 19:09:00,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495,1
4,A667013,,2013-11-15 12:52:00,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0,1
5,A677334,Elsa,2014-04-25 13:04:00,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact,0.083333,1
6,A699218,Jimmy,2015-03-28 13:11:00,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact,0.057495,1
7,A701489,,2015-04-30 17:02:00,Domestic Shorthair Mix,Brown Tabby,0,0,,,0.057495,1
8,A671784,Lucy,2014-02-04 17:17:00,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed,0.416667,1
9,A677747,,2014-05-03 07:48:00,Cairn Terrier,White,0,1,Female,Fixed,1.0,0


#### Date and time

In [16]:
# Leaving month and day as strings because I think they make more sense as categorical variables
# than ordinal ones. For example, is January really "less than" December? It comes after it.
# This will make the feature count larger since we have to do so much dummy coding, but it makes
# it possible to have the model pick up on spikes that might not be related to the day's number
# in any linear way - e.g., if there's a spike of activity at the beginning and end of each month.
# 
# One issue there, though, is that it becomes a poor proxy for time in relation to the end of the
# month, since months can be anywhere from 28-31 days long. I don't know that that will really 
# matter much, though, unless there is a hard spike on certain times. If I get bored I'll do
# some plotting to look into that.
# 
# Leaving year out of it because it'll create an extrapolation issue - we can't really use fixed
# effects from past years to apply to future years.
#
# For time, I'm just getting hour of the day. I'm not sure there's much sense to 
# having greater resolution than that

# First, parsing the date into a datetime object for easier manipulation
features['DateTime'] = features.DateTime.apply(
                            lambda s: datetime.strptime(s, '%Y-%m-%d %H:%M:%S'))

features['Month'] = features.DateTime.apply(lambda d: d.strftime('%B'))   # 'January', 'Feburary', . . .
features['Day'] = features.DateTime.apply(lambda d: d.strftime('Day%d'))  # 'Day01', 'Day02', . . .
features['WeekDay'] = features.DateTime.apply(lambda d: d.strftime('%A')) # 'Sunday', 'Monday', . . .  
features['Hour'] = features.DateTime.apply(lambda d: d.strftime('Hour%H')) # 'Hour00', 'Hour01', . . . 

features.drop(['DateTime'], axis=1, inplace=True)

features.head(10)

Unnamed: 0,AnimalID,Name,Breed,Color,HasName,IsDog,Sex,Fixed,Age,IsMix,Month,Day,WeekDay,Hour
0,A671945,Hambone,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0,1,February,Day12,Wednesday,Hour18
1,A656520,Emily,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0,1,October,Day13,Sunday,Hour12
2,A686464,Pearce,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0,1,January,Day31,Saturday,Hour12
3,A683430,,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495,1,July,Day11,Friday,Hour19
4,A667013,,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0,1,November,Day15,Friday,Hour12
5,A677334,Elsa,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact,0.083333,1,April,Day25,Friday,Hour13
6,A699218,Jimmy,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact,0.057495,1,March,Day28,Saturday,Hour13
7,A701489,,Domestic Shorthair Mix,Brown Tabby,0,0,,,0.057495,1,April,Day30,Thursday,Hour17
8,A671784,Lucy,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed,0.416667,1,February,Day04,Tuesday,Hour17
9,A677747,,Cairn Terrier,White,0,1,Female,Fixed,1.0,0,May,Day03,Saturday,Hour07


## Creating interaction variables
##### I'm going to go overboard here

#### Species and has name

#### Species and sex

#### Species and spay/neuter

#### Species and age

#### Species and is mixed

#### Species and month

## Dummy coding features

In [17]:
columns_to_code = ['Sex', 'Fixed', 'Month', 'Day', 'WeekDay', 'Hour']

for column in columns_to_code:
    dummies = pd.get_dummies(features[column])
    features = pd.concat((features, dummies), axis=1)
    
features.drop(columns_to_code, axis=1, inplace=True)
features.head(5)

Unnamed: 0,AnimalID,Name,Breed,Color,HasName,IsDog,Age,IsMix,Female,Male,Intact,April,August,December,February,January,July,June,March,May,November,October,September,Day01,Day02,Day03,Day04,Day05,Day06,Day07,Day08,Day09,Day10,Day11,Day12,Day13,Day14,Day15,Day16,Day17,Day18,Day19,Day20,Day21,Day22,Day23,Day24,Day25,Day26,Day27,Day28,Day29,Day30,Day31,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Hour00,Hour03,Hour05,Hour06,Hour07,Hour08,Hour09,Hour10,Hour11,Hour12,Hour13,Hour14,Hour15,Hour16,Hour17,Hour18,Hour19,Hour20,Hour21,Hour22,Hour23
0,A671945,Hambone,Shetland Sheepdog Mix,Brown/White,1,1,1.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,A656520,Emily,Domestic Shorthair Mix,Cream Tabby,1,0,1.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,A686464,Pearce,Pit Bull Mix,Blue/White,1,1,2.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,A683430,,Domestic Shorthair Mix,Blue Cream,0,0,0.057495,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,A667013,,Lhasa Apso/Miniature Poodle,Tan,0,1,2.0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


## Dealing with NAs

In [18]:
# Filling NA ages with mean, which should hopefully at least get us an unbiased fill-in value
mean_age = features.loc[(features.Age.notnull()), 'Age'].mean()
print 'Filling NA ages with mean age: {}'.format(mean_age)
features.loc[(features.Age.isnull()), 'Age'] = mean_age

Filling NA ages with mean age: 2.16205678054


## Getting numeric features into a single df

In [25]:
# I've tried to keep it so that the dtype of all the columns we want to use
# for training is float64, so we can just grab those
numeric_features = features.loc[:, features.dtypes == np.float64]

print 'Feature count: {}'.format(len(numeric_features.dtypes))
numeric_features.head(4)

Feature count: 78


Unnamed: 0,HasName,IsDog,Age,IsMix,Female,Male,Intact,April,August,December,February,January,July,June,March,May,November,October,September,Day01,Day02,Day03,Day04,Day05,Day06,Day07,Day08,Day09,Day10,Day11,Day12,Day13,Day14,Day15,Day16,Day17,Day18,Day19,Day20,Day21,Day22,Day23,Day24,Day25,Day26,Day27,Day28,Day29,Day30,Day31,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Hour00,Hour03,Hour05,Hour06,Hour07,Hour08,Hour09,Hour10,Hour11,Hour12,Hour13,Hour14,Hour15,Hour16,Hour17,Hour18,Hour19,Hour20,Hour21,Hour22,Hour23
0,1,1,1.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,0,1.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,2.0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0.057495,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


## Splitting train data and test data back out again

In [21]:
train_data = numeric_features[:n_train_features]
n_train_data = len(train_data.index)

test_data = numeric_features[:n_test_features]
n_test_data = len(test_data.index)

print '# train rows is {} (should be {})'.format(n_train_data, n_train_features)
print '# test rows is {} (should be {})'.format(n_test_data, n_test_features)

# train rows is 26729 (should be 26729)
# test rows is 11456 (should be 11456)


# Learning!!!

## With logistic regression

In [23]:
# From previous explorations, it seemed like L1 loss always works best, so I'm just sticking with that
params = { 'C': [.01, .1, 1, 5, 10, 50, 100] }

clf = GridSearchCV(LogisticRegression(penalty='l1'), params)
lr_adoption = clf.fit(train_data, labels.OutcomeType)

print "Best adoption parameters: {}".format(lr_adoption.best_params_)
print "\nAll parameters:"
for element in lr_adoption.grid_scores_:
    print"\t{}".format(element)

Best adoption parameters: {'C': 1}

All parameters:
	mean: 0.62206, std: 0.00479, params: {'C': 0.01}
	mean: 0.64787, std: 0.00175, params: {'C': 0.1}
	mean: 0.64967, std: 0.00218, params: {'C': 1}
	mean: 0.64948, std: 0.00209, params: {'C': 5}
	mean: 0.64967, std: 0.00213, params: {'C': 10}
	mean: 0.64967, std: 0.00213, params: {'C': 50}
	mean: 0.64959, std: 0.00224, params: {'C': 100}


## With SVM
#### SVC edition

## With SVM
#### LinearSVC edition