In [35]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import date, datetime

# Render every column of wide DataFrames (None removes the column cap).
pd.options.display.max_columns = None

## Data import

In [None]:
# Raw training table; the label and feature cells below both select columns from it.
TRAIN_CSV = 'train.csv'
data = pd.read_csv(TRAIN_CSV)

#### Extracting and cleaning labels

In [47]:
# Build the label matrix: one 0/1 indicator column per OutcomeType value followed by
# one per (cleaned) OutcomeSubtype value. Everything is derived from `data` without
# mutating it or any slice of it, so this cell can be re-run safely.

# Dummy coding OutcomeType
# dtype=int pins 0/1 integer columns regardless of the pandas version
# (the default indicator dtype is bool from pandas 2.0 onwards).
outcome_dummies = pd.get_dummies(data['OutcomeType'], dtype=int)

# Cleaning OutcomeSubtype: shorten the multi-word / punctuated subtype names so the
# resulting dummy column names are single tokens. Unlisted values pass through unchanged.
subtype_renames = {
    'In Foster': 'Foster',
    'In Kennel': 'Kennel',
    'At Vet': 'Vet',
    'In Surgery': 'Surgery',
    'Rabies Risk': 'RabiesRisk',
    'Court/Investigation': 'Court',
}
subtype_clean = data['OutcomeSubtype'].replace(subtype_renames)

# Dummy coding OutcomeSubtype
# Missing subtypes get no column of their own (dummy_na defaults to False), so rows
# without a subtype are all-zero across the subtype indicators.
subtype_dummies = pd.get_dummies(subtype_clean, dtype=int)

# Column order: OutcomeType indicators first, then OutcomeSubtype indicators.
labels = pd.concat([outcome_dummies, subtype_dummies], axis=1)

# Preview only; avoid rendering the whole frame.
labels.head(10)

Unnamed: 0,Adoption,Died,Euthanasia,Return_to_owner,Transfer,Aggressive,Barn,Behavior,Court,Enroute,Foster,Kennel,Medical,Offsite,Partner,RabiesRisk,SCRP,Suffering,Surgery,Vet
0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


#### Extracting and cleaning features

In [88]:
feature_columns = ['AnimalID', 'Name', 'DateTime', 'AnimalType', 'SexuponOutcome',
                   'AgeuponOutcome', 'Breed', 'Color']
# Explicit copy: the column assignments below must never write through to `data`
# or trigger SettingWithCopyWarning.
features = data.loc[:, feature_columns].copy()

# Dummy coding animal type
features['IsDog'] = features['AnimalType'] == 'Dog'

# Cleaning sex
# 'Unknown' is recoded to missing. Missing is the baseline (all four flags False);
# sex (Male/Female) and reproductive status (Intact/Fixed) each get a dummy variable.
# Matching is case-sensitive, so the 'Male' pattern does not fire on 'Female'.
features.loc[features['SexuponOutcome'] == 'Unknown', 'SexuponOutcome'] = None
sex = features['SexuponOutcome']
features['Male'] = sex.str.contains('Male', na=False)
features['Female'] = sex.str.contains('Female', na=False)
features['Intact'] = sex.str.contains('Intact', na=False)
features['Fixed'] = sex.str.contains('Neutered|Spayed', na=False)

# Cleaning age
# AgeuponOutcome looks like '<number> <unit>' (e.g. '3 weeks'); it is converted to years.
# Unknown values are left as NaN. (Note precision loss as the reported unit grows.)
age_parts = features['AgeuponOutcome'].str.split(' ')  # split once, reuse for both pieces
age_num = pd.to_numeric(age_parts.str[0])
age_unit = age_parts.str[1]
# How many of each unit make up one year; substring match covers singular and plural.
units_per_year = {'year': 1.0, 'month': 12.0, 'week': 365.25 / 7, 'day': 365.25}
features['Age'] = np.nan
for unit, per_year in units_per_year.items():
    is_unit = age_unit.str.contains(unit, na=False)
    features.loc[is_unit, 'Age'] = age_num / per_year

# Cleaning breed
# First just extracting whether it's a mix or not ('Mix' suffix or 'A/B' cross).
# na=False keeps the column strictly boolean even if a breed is missing,
# consistent with the sex flags above.
features['IsMix'] = features['Breed'].str.contains('Mix|/', na=False)

# Drop the raw columns that have been fully re-encoded above.
features = features.drop(columns=['AnimalType', 'SexuponOutcome', 'AgeuponOutcome'])

# Preview only; avoid rendering the whole frame.
features.head(10)

Unnamed: 0,AnimalID,Name,DateTime,Breed,Color,IsDog,Male,Female,Intact,Fixed,Age,IsMix
0,A671945,Hambone,2014-02-12 18:22:00,Shetland Sheepdog Mix,Brown/White,True,True,False,False,True,1.000000,True
1,A656520,Emily,2013-10-13 12:44:00,Domestic Shorthair Mix,Cream Tabby,False,False,True,False,True,1.000000,True
2,A686464,Pearce,2015-01-31 12:28:00,Pit Bull Mix,Blue/White,True,True,False,False,True,2.000000,True
3,A683430,,2014-07-11 19:09:00,Domestic Shorthair Mix,Blue Cream,False,True,False,True,False,0.057495,True
4,A667013,,2013-11-15 12:52:00,Lhasa Apso/Miniature Poodle,Tan,True,True,False,False,True,2.000000,True
5,A677334,Elsa,2014-04-25 13:04:00,Cairn Terrier/Chihuahua Shorthair,Black/Tan,True,False,True,True,False,0.083333,True
6,A699218,Jimmy,2015-03-28 13:11:00,Domestic Shorthair Mix,Blue Tabby,False,True,False,True,False,0.057495,True
7,A701489,,2015-04-30 17:02:00,Domestic Shorthair Mix,Brown Tabby,False,False,False,False,False,0.057495,True
8,A671784,Lucy,2014-02-04 17:17:00,American Pit Bull Terrier Mix,Red/White,True,False,True,False,True,0.416667,True
9,A677747,,2014-05-03 07:48:00,Cairn Terrier,White,True,False,True,False,True,1.000000,False


In [87]:
# Distinct breed strings, in order of first appearance.
features.Breed.unique()

array(['Shetland Sheepdog Mix', 'Domestic Shorthair Mix', 'Pit Bull Mix',
       ..., 'Vizsla/Boxer', 'German Shepherd/Australian Kelpie',
       'Boxer/German Shepherd'], dtype=object)