In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from datetime import date, datetime

# Lift the column limit on DataFrame display (None = no limit) so wide frames
# are shown in full instead of being truncated.
pd.set_option('display.max_columns', None)

## Data import

In [2]:
# Read the raw training set; `labels` and `features` below are both sliced
# out of this frame.
train_csv_path = 'train.csv'
data = pd.read_csv(train_csv_path)

#### Extracting and cleaning labels

In [3]:
# Pull the two outcome columns out of the raw data.
label_columns = ['OutcomeType', 'OutcomeSubtype']
labels = data.loc[:, label_columns]

# One indicator column per OutcomeType value; the original string column is
# replaced by its dummies.
outcome_dummies = pd.get_dummies(labels['OutcomeType'])
labels = pd.concat([labels.drop('OutcomeType', axis=1), outcome_dummies], axis=1)

# OutcomeSubtype does not appear to be part of the scoring, and using it as a
# predictor might amount to cheating (e.g. a subtype of 'foster' strongly
# implies a transfer). The transform is kept here for reference only, commented
# out, and the column is simply discarded below.
# labels.loc[(labels.OutcomeSubtype == 'In Foster'), 'OutcomeSubtype'] = 'Foster'
# labels.loc[(labels.OutcomeSubtype == 'In Kennel'), 'OutcomeSubtype'] = 'Kennel'
# labels.loc[(labels.OutcomeSubtype == 'At Vet'), 'OutcomeSubtype'] = 'Vet'
# labels.loc[(labels.OutcomeSubtype == 'In Surgery'), 'OutcomeSubtype'] = 'Surgery'
# labels.loc[(labels.OutcomeSubtype == 'Rabies Risk'), 'OutcomeSubtype'] = 'RabiesRisk'
# labels.loc[(labels.OutcomeSubtype == 'Court/Investigation'), 'OutcomeSubtype'] = 'Court'
# subtype_dummies = pd.get_dummies(labels.OutcomeSubtype)
# labels = pd.concat([labels, subtype_dummies], axis=1)
labels = labels.drop('OutcomeSubtype', axis=1)

labels.head(10)

Unnamed: 0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,0,0,0,1,0
1,0,0,1,0,0
2,1,0,0,0,0
3,0,0,0,0,1
4,0,0,0,0,1
5,0,0,0,0,1
6,0,0,0,0,1
7,0,0,0,0,1
8,1,0,0,0,0
9,1,0,0,0,0


## Extracting and cleaning features

In [4]:
# Raw predictor columns carried forward into feature engineering.
feature_columns = [
    'AnimalID',
    'Name',
    'DateTime',
    'AnimalType',
    'SexuponOutcome',
    'AgeuponOutcome',
    'Breed',
    'Color',
]
features = data.loc[:, feature_columns]

#### Whether or not the animal has a name

In [5]:
# 1 when a name was recorded for the animal, 0 when the Name field is missing.
name_missing = features['Name'].isnull()
features['HasName'] = (~name_missing).astype(int)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName
0,A671945,Hambone,2014-02-12 18:22:00,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1
1,A656520,Emily,2013-10-13 12:44:00,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1
2,A686464,Pearce,2015-01-31 12:28:00,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,1
3,A683430,,2014-07-11 19:09:00,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0
4,A667013,,2013-11-15 12:52:00,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0


#### Whether the animal is a dog or a cat. (Default is cat)

In [6]:
# Binary species flag: 1 for 'Dog', 0 for anything else. The original string
# column is no longer needed once the flag exists.
features['IsDog'] = features['AnimalType'].eq('Dog').astype(int)
features = features.drop('AnimalType', axis=1)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,SexuponOutcome,AgeuponOutcome,Breed,Color,HasName,IsDog
0,A671945,Hambone,2014-02-12 18:22:00,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1,1
1,A656520,Emily,2013-10-13 12:44:00,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,0
2,A686464,Pearce,2015-01-31 12:28:00,Neutered Male,2 years,Pit Bull Mix,Blue/White,1,1
3,A683430,,2014-07-11 19:09:00,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,0
4,A667013,,2013-11-15 12:52:00,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,1


#### Sex and spay/neuter status

In [7]:
# Sex and spay/neuter status
# 'Unknown' is blanked out first so that both derived columns (Sex and Fixed)
# end up missing for those animals. The plan is to deal with that at
# dummy-coding time: "unknown" becomes the default level, with separate
# indicators for male/female and fixed/intact (4 dummy variables in total).
unknown_sex = features.SexuponOutcome == 'Unknown'
features.loc[unknown_sex, 'SexuponOutcome'] = None

# Split on the space once and reuse the pieces: first token is the status,
# second token is the sex (e.g. 'Neutered Male').
sex_tokens = features.SexuponOutcome.str.split(' ')
features['Sex'] = sex_tokens.str[1]
features['Fixed'] = sex_tokens.str[0]

# 'Neutered' and 'Spayed' are collapsed into a single 'Fixed' level.
was_sterilised = features.Fixed.str.contains('Neutered|Spayed', na=False)
features.loc[was_sterilised, 'Fixed'] = 'Fixed'

features = features.drop('SexuponOutcome', axis=1)
features.head(10)

Unnamed: 0,AnimalID,Name,DateTime,AgeuponOutcome,Breed,Color,HasName,IsDog,Sex,Fixed
0,A671945,Hambone,2014-02-12 18:22:00,1 year,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed
1,A656520,Emily,2013-10-13 12:44:00,1 year,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed
2,A686464,Pearce,2015-01-31 12:28:00,2 years,Pit Bull Mix,Blue/White,1,1,Male,Fixed
3,A683430,,2014-07-11 19:09:00,3 weeks,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact
4,A667013,,2013-11-15 12:52:00,2 years,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed
5,A677334,Elsa,2014-04-25 13:04:00,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact
6,A699218,Jimmy,2015-03-28 13:11:00,3 weeks,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact
7,A701489,,2015-04-30 17:02:00,3 weeks,Domestic Shorthair Mix,Brown Tabby,0,0,,
8,A671784,Lucy,2014-02-04 17:17:00,5 months,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed
9,A677747,,2014-05-03 07:48:00,1 year,Cairn Terrier,White,0,1,Female,Fixed


#### Age - convert to continuous # of years

In [8]:
# Age in (fractional) years
# AgeuponOutcome is '<number> <unit>'. Rows with no recognised unit keep NaN.
# Precision gets coarser as the reported unit grows (a "2 years" animal could
# be anywhere within that year).
age_tokens = features['AgeuponOutcome'].str.split(' ')
age_count = pd.to_numeric(age_tokens.str[0])
age_unit = age_tokens.str[1]

# (unit substring, how many of that unit make up one year), applied in order
units_per_year = (
    ('year', 1.0),
    ('month', 12.0),
    ('week', (365.25 / 7)),
    ('day', 365.25),
)
for unit_name, per_year in units_per_year:
    reported_in_unit = age_unit.str.contains(unit_name, na=False)
    features.loc[reported_in_unit, 'Age'] = age_count / per_year

features = features.drop('AgeuponOutcome', axis=1)
features.head(5)

Unnamed: 0,AnimalID,Name,DateTime,Breed,Color,HasName,IsDog,Sex,Fixed,Age
0,A671945,Hambone,2014-02-12 18:22:00,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0
1,A656520,Emily,2013-10-13 12:44:00,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0
2,A686464,Pearce,2015-01-31 12:28:00,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0
3,A683430,,2014-07-11 19:09:00,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495
4,A667013,,2013-11-15 12:52:00,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0


#### Breed - for now just flagging mixes.

In [9]:
# Cleaning breed
# For now only flag whether the animal is a mix: either the breed string says
# 'Mix' or it lists two breeds separated by '/'.
# na=False treats a missing Breed as "not a mix" (0). Without it str.contains
# returns NaN for missing values and the cast to int raises. This matches how
# the sex and age cells call str.contains.
features['IsMix'] = features.Breed.str.contains('Mix|/', na=False).astype(int)
features.head(10)

Unnamed: 0,AnimalID,Name,DateTime,Breed,Color,HasName,IsDog,Sex,Fixed,Age,IsMix
0,A671945,Hambone,2014-02-12 18:22:00,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0,1
1,A656520,Emily,2013-10-13 12:44:00,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0,1
2,A686464,Pearce,2015-01-31 12:28:00,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0,1
3,A683430,,2014-07-11 19:09:00,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495,1
4,A667013,,2013-11-15 12:52:00,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0,1
5,A677334,Elsa,2014-04-25 13:04:00,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact,0.083333,1
6,A699218,Jimmy,2015-03-28 13:11:00,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact,0.057495,1
7,A701489,,2015-04-30 17:02:00,Domestic Shorthair Mix,Brown Tabby,0,0,,,0.057495,1
8,A671784,Lucy,2014-02-04 17:17:00,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed,0.416667,1
9,A677747,,2014-05-03 07:48:00,Cairn Terrier,White,0,1,Female,Fixed,1.0,0


#### Date and time

In [10]:
# Leaving month and day as strings because I think they make more sense as categorical variables
# than ordinal ones. For example, is January really "less than" December? It comes after it.
# This will make the feature count larger since we have to do so much dummy coding, but it makes
# it possible to have the model pick up on spikes that might not be related to the day's number
# in any linear way - e.g., if there's a spike of activity at the beginning and end of each month.
#
# One issue there, though, is that day-of-month becomes a poor proxy for time in relation to the
# end of the month, since months can be anywhere from 28-31 days long. I don't know that that will
# really matter much, though, unless there is a hard spike at certain times. If I get bored I'll do
# some plotting to look into that.
#
# Leaving year out of it because it'll create an extrapolation issue - we can't really use fixed
# effects from past years to apply to future years.
#
# For time, I'm just getting hour of the day. I'm not sure there's much sense to
# having greater resolution than that.

# First, parse the timestamp strings into datetimes. pd.to_datetime does this for the
# whole column at once; the explicit format keeps the parsing as strict as strptime was.
# A missing timestamp becomes NaT (and NaN in the derived columns) instead of raising.
features['DateTime'] = pd.to_datetime(features['DateTime'], format='%Y-%m-%d %H:%M:%S')

# The .dt accessor formats the whole column without a per-row Python lambda.
features['Month'] = features['DateTime'].dt.strftime('%B')       # 'January', 'February', . . .
features['Day'] = features['DateTime'].dt.strftime('Day%d')      # 'Day01', 'Day02', . . .
features['WeekDay'] = features['DateTime'].dt.strftime('%A')     # 'Sunday', 'Monday', . . .
features['Hour'] = features['DateTime'].dt.strftime('Hour%H')    # 'Hour00', 'Hour01', . . .

features.drop(['DateTime'], axis=1, inplace=True)

features.head(10)

Unnamed: 0,AnimalID,Name,Breed,Color,HasName,IsDog,Sex,Fixed,Age,IsMix,Month,Day,WeekDay,Hour
0,A671945,Hambone,Shetland Sheepdog Mix,Brown/White,1,1,Male,Fixed,1.0,1,February,Day12,Wednesday,Hour18
1,A656520,Emily,Domestic Shorthair Mix,Cream Tabby,1,0,Female,Fixed,1.0,1,October,Day13,Sunday,Hour12
2,A686464,Pearce,Pit Bull Mix,Blue/White,1,1,Male,Fixed,2.0,1,January,Day31,Saturday,Hour12
3,A683430,,Domestic Shorthair Mix,Blue Cream,0,0,Male,Intact,0.057495,1,July,Day11,Friday,Hour19
4,A667013,,Lhasa Apso/Miniature Poodle,Tan,0,1,Male,Fixed,2.0,1,November,Day15,Friday,Hour12
5,A677334,Elsa,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,1,Female,Intact,0.083333,1,April,Day25,Friday,Hour13
6,A699218,Jimmy,Domestic Shorthair Mix,Blue Tabby,1,0,Male,Intact,0.057495,1,March,Day28,Saturday,Hour13
7,A701489,,Domestic Shorthair Mix,Brown Tabby,0,0,,,0.057495,1,April,Day30,Thursday,Hour17
8,A671784,Lucy,American Pit Bull Terrier Mix,Red/White,1,1,Female,Fixed,0.416667,1,February,Day04,Tuesday,Hour17
9,A677747,,Cairn Terrier,White,0,1,Female,Fixed,1.0,0,May,Day03,Saturday,Hour07


## Creating interaction variables

In [None]:
# I'm going to go overboard on this.
