In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, datetime

# Never truncate columns when a wide DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

## Data import

In [2]:
# Load the raw training data. 'train.csv' is resolved relative to the notebook's
# working directory. The cells below read the outcome columns (labels) and the
# animal attribute columns (features) from this frame.
data = pd.read_csv('train.csv')

#### Extracting and cleaning labels

In [3]:
# Build the label matrix: one indicator column per OutcomeType followed by one
# indicator column per (cleaned) OutcomeSubtype.

# Dummy coding OutcomeType
outcome_dummies = pd.get_dummies(data['OutcomeType'])

# Cleaning OutcomeSubtype
# Shorten the multi-word / punctuated subtype names to single tokens so they make
# tidy column names. Subtypes not listed here (and missing values) pass through
# unchanged.
subtype_renames = {
    'In Foster': 'Foster',
    'In Kennel': 'Kennel',
    'At Vet': 'Vet',
    'In Surgery': 'Surgery',
    'Rabies Risk': 'RabiesRisk',
    'Court/Investigation': 'Court',
}
cleaned_subtype = data['OutcomeSubtype'].replace(subtype_renames)

# Dummy coding OutcomeSubtype
# Rows without a subtype simply get 0 in every subtype column.
subtype_dummies = pd.get_dummies(cleaned_subtype)

# Outcome-type indicators first, subtype indicators second.
labels = pd.concat([outcome_dummies, subtype_dummies], axis=1)

labels.head(10)

Unnamed: 0,Adoption,Died,Euthanasia,Return_to_owner,Transfer,Aggressive,Barn,Behavior,Court,Enroute,Foster,Kennel,Medical,Offsite,Partner,RabiesRisk,SCRP,Suffering,Surgery,Vet
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


#### Extracting and cleaning features

In [26]:
# Start from the raw descriptive columns and derive two boolean flags:
#   HasName - whether or not an animal has a (non-null) name
#   IsDog   - dummy coding of animal type: True when AnimalType is 'Dog'
# AnimalType itself is discarded once IsDog has been derived from it.
features = (
    data[['AnimalID', 'Name', 'DateTime', 'AnimalType', 'SexuponOutcome',
          'AgeuponOutcome', 'Breed', 'Color']]
    .assign(HasName=lambda frame: frame['Name'].notnull(),
            IsDog=lambda frame: frame['AnimalType'] == 'Dog')
    .drop('AnimalType', axis=1)
)

# Cleaning sex
# 'Unknown' is recoded to a missing value, which is the baseline: a row with no
# usable sex information ends up False in all four flags below.
# Sex (Male / Female) and reproductive status (Intact / Fixed) each get their own
# dummy variable, found by substring match on the original text.
features.loc[features['SexuponOutcome'] == 'Unknown', 'SexuponOutcome'] = None

for flag_name, pattern in (('Male', 'Male'),
                           ('Female', 'Female'),
                           ('Intact', 'Intact'),
                           ('Fixed', 'Neutered|Spayed')):
    # na=False: missing text counts as "no match" rather than propagating NaN.
    features[flag_name] = features['SexuponOutcome'].str.contains(pattern, na=False)

features = features.drop('SexuponOutcome', axis=1)

# Cleaning age
# AgeuponOutcome is text of the form '<number> <unit>'. It is converted to a
# numeric age in years. Unknown values (and units not recognised below) are left
# as NaN. Note the precision loss as the reported unit grows.
age_tokens = features['AgeuponOutcome'].str.split(' ')
age_count = pd.to_numeric(age_tokens.str[0])
age_unit = age_tokens.str[1]

# (unit stem, how many of that unit make up one year); matched by substring so
# both singular and plural spellings are caught.
units_per_year = (
    ('year', 1.0),
    ('month', 12.0),
    ('week', 365.25 / 7),
    ('day', 365.25),
)

age_years = pd.Series(np.nan, index=features.index)
for unit_stem, per_year in units_per_year:
    uses_unit = age_unit.str.contains(unit_stem, na=False)
    age_years.loc[uses_unit] = age_count[uses_unit] / per_year

features['Age'] = age_years
features = features.drop('AgeuponOutcome', axis=1)

# Cleaning breed
# First just extracting whether it's a mix or not: a breed counts as mixed when its
# text contains 'Mix' or a '/' (two breeds joined, e.g. 'Lhasa Apso/Miniature Poodle').
# na=False keeps IsMix a strictly boolean column - a missing breed is treated as
# "not a mix" instead of leaking NaN into the flag - matching how the other
# str.contains-based flags in this cell handle missing text.
features['IsMix'] = features['Breed'].str.contains('Mix|/', na=False)

# Splitting date and time
# Leaving month and day as strings because I think they make more sense as categorical variables
# than ordinal ones. For example, is January really "less than" December? It comes after it.
# This will make the feature count larger since we have to do so much dummy coding, but it makes
# it possible to have the model pick up on spikes that might not be related to the day's number
# in any linear way - e.g., if there's a spike of activity at the beginning and end of each month.
# 
# One issue there, though, is that it becomes a poor proxy for time in relation to the end of the
# month, since months can be anywhere from 28-31 days long. I don't know that that will really 
# matter much, though, unless there is a hard spike on certain times. If I get bored I'll do
# some plotting to look into that.
# 
# Leaving year out of it because it'll create an extrapolation issue - we can't really use fixed
# effects from past years to apply to future years.
#
# For time, I'm just getting hour of the day. I'm not sure there's much sense to 
# having greater resolution than that

# First, parsing the date column into real timestamps for easier manipulation.
# pd.to_datetime parses the whole column in one vectorised call; the explicit
# format keeps the parse strict about the expected 'YYYY-MM-DD HH:MM:SS' layout.
timestamps = pd.to_datetime(features['DateTime'], format='%Y-%m-%d %H:%M:%S')

# Finally, dummy coding all these time components. Whew, this'll create a mess.
# Each component is rendered as a label string so it is treated as categorical.
month_dummies = pd.get_dummies(timestamps.dt.strftime('%B'))        # 'January', 'February', . . .
day_dummies = pd.get_dummies(timestamps.dt.strftime('Day%d'))       # 'Day01', 'Day02', . . .
weekday_dummies = pd.get_dummies(timestamps.dt.strftime('%A'))      # 'Sunday', 'Monday', . . .
hour_dummies = pd.get_dummies(timestamps.dt.strftime('Hour%H'))     # 'Hour00', 'Hour01', . . .

# NOTE(review): get_dummies only creates a column for labels that actually occur in
# the data (the output below has no Hour01-Hour04, for example), so any other data
# set pushed through this code may need reindexing to the same columns.

# Swap the raw DateTime column for the indicator columns.
features = pd.concat(
    (features.drop('DateTime', axis=1),
     month_dummies, day_dummies, weekday_dummies, hour_dummies),
    axis=1)

features.head(10)

Unnamed: 0,AnimalID,Name,Breed,Color,HasName,IsDog,Male,Female,Intact,Fixed,Age,IsMix,April,August,December,February,January,July,June,March,May,November,October,September,Day01,Day02,Day03,Day04,Day05,Day06,Day07,Day08,Day09,Day10,Day11,Day12,Day13,Day14,Day15,Day16,Day17,Day18,Day19,Day20,Day21,Day22,Day23,Day24,Day25,Day26,Day27,Day28,Day29,Day30,Day31,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Hour00,Hour05,Hour06,Hour07,Hour08,Hour09,Hour10,Hour11,Hour12,Hour13,Hour14,Hour15,Hour16,Hour17,Hour18,Hour19,Hour20,Hour21,Hour22,Hour23
0,A671945,Hambone,Shetland Sheepdog Mix,Brown/White,True,True,True,False,False,True,1.0,True,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,A656520,Emily,Domestic Shorthair Mix,Cream Tabby,True,False,False,True,False,True,1.0,True,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,A686464,Pearce,Pit Bull Mix,Blue/White,True,True,True,False,False,True,2.0,True,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,A683430,,Domestic Shorthair Mix,Blue Cream,False,False,True,False,True,False,0.057495,True,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,A667013,,Lhasa Apso/Miniature Poodle,Tan,False,True,True,False,False,True,2.0,True,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,A677334,Elsa,Cairn Terrier/Chihuahua Shorthair,Black/Tan,True,True,False,True,True,False,0.083333,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,A699218,Jimmy,Domestic Shorthair Mix,Blue Tabby,True,False,True,False,True,False,0.057495,True,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
7,A701489,,Domestic Shorthair Mix,Brown Tabby,False,False,False,False,False,False,0.057495,True,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8,A671784,Lucy,American Pit Bull Terrier Mix,Red/White,True,True,False,True,False,True,0.416667,True,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
9,A677747,,Cairn Terrier,White,False,True,False,True,False,True,1.0,False,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# List every distinct Breed string still present in the feature table.
# NOTE(review): presumably exploration ahead of further breed feature engineering -
# the output can be long, so consider clearing it before sharing the notebook.
features['Breed'].unique()