In [1]:
import pandas as pd, numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime, dateutil

%matplotlib inline

# DEAD ANIMALS

In [2]:
# Load the training data, then show its column dtypes, first rows and summary statistics.
train = pd.read_csv('train.csv')
display(train.dtypes, train.head(), train.describe())

AnimalID          object
Name              object
DateTime          object
OutcomeType       object
OutcomeSubtype    object
AnimalType        object
SexuponOutcome    object
AgeuponOutcome    object
Breed             object
Color             object
dtype: object

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
count,26729,19038,26729,26729,13117,26729,26728,26711,26729,26729
unique,26729,6374,22918,5,16,2,5,44,1380,366
top,A690806,Max,2015-08-11 00:00:00,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,1,136,19,10769,7816,15595,9779,3969,8810,2824


In [5]:
import re
# Case-insensitive pattern that matches any label mentioning "neutered" or "spayed".
sterilized_pat = re.compile('.*(neutered|spayed).*', re.IGNORECASE)

def purify_age(df):
    """Convert the textual ``AgeuponOutcome`` column to an age in days, in place.

    String values are read as ``"<count> <unit>"`` (for example ``"3 weeks"``),
    with a month counted as 30 days and a year as 365 days. Values that are
    not strings (such as NaN for a missing age) become NaN. Returns None.
    """
    unit_lengths = dict(
        day=1, days=1,
        week=7, weeks=7,
        month=30, months=30,
        year=365, years=365,
    )

    def to_days(text):
        # Only strings can be parsed; everything else is treated as missing.
        if isinstance(text, str):
            parts = text.split(' ')
            return int(parts[0]) * unit_lengths[parts[1]]
        return float('nan')

    df['AgeuponOutcome'] = df['AgeuponOutcome'].map(to_days)

def purify_sex(df):
    """Encode sterilization status and sex as integer columns, in place.

    Adds ``Sterilized``: 1 where the sex label matches ``sterilized_pat``,
    0 where it does not, and 2 where the match result is null (e.g. a missing
    label). Replaces ``SexuponOutcome`` with 0 for labels containing "Male",
    1 for labels containing "Female" and 2 for everything else, including
    non-string values. Returns None.
    """
    def encode_sex(label):
        if not isinstance(label, str):
            return 2  # the Unknown
        # "Male" is tested first; "Female" has no capital-M "Male" inside it.
        if 'Male' in label:
            return 0
        if 'Female' in label:
            return 1
        return 2  # the Unknown

    # Sterilized must be derived before SexuponOutcome is overwritten with codes.
    df['Sterilized'] = df.SexuponOutcome.str.match(sterilized_pat)
    unmatched = df['Sterilized'].isnull()
    df.loc[unmatched, 'Sterilized'] = 2
    df['Sterilized'] = df['Sterilized'].astype('int')

    sex_codes = df['SexuponOutcome'].map(encode_sex)
    df['SexuponOutcome'] = sex_codes.astype('int')

import datetime

def datetime_purify(df):
    """Replace the ``DateTime`` column with timestamps truncated to midnight, in place.

    The column is parsed with ``pd.to_datetime`` and the time-of-day part is
    discarded, so only year, month and day remain. Parsing the whole column
    at once removes the per-row Python call and the dependence on the
    ``dateutil.parser`` submodule having been imported by some other package.
    Values that are already datetimes are accepted, so calling the function
    twice on the same frame is harmless. Returns None.
    """
    df['DateTime'] = pd.to_datetime(df['DateTime']).dt.normalize()
    
def purify_1(df):
    """First cleaning pass over a raw outcomes frame; returns a new frame.

    The identifier columns ``ID`` and ``AnimalID`` are dropped when present
    (``drop`` returns a new frame, so the frame passed in is left as it was).
    On the new frame:

    * ``Name`` becomes a boolean "has a name" flag.
    * ``Mix`` is 1 when the breed mentions "Mix" or lists breeds with "/", else 0.
    * ``Breed`` loses a trailing " Mix".
    * ``Multicolor`` is True when the colour contains "/" or one of the words
      multi/poly/parti/point/calico (case-insensitive).
    * age, sex and date columns are normalised by ``purify_age``,
      ``purify_sex`` and ``datetime_purify``.
    """
    df = df.drop(['ID', 'AnimalID'], axis=1, errors='ignore')
    df['Name'] = pd.notnull(df['Name'])

    df['Mix'] = df.Breed.str.contains('Mix|/').astype('int')

    # Remove only a literal trailing " Mix". str.rstrip(' Mix') strips any run of
    # the characters ' ', 'M', 'i', 'x' instead, which turns "Manx Mix" into "Man"
    # and "Pembroke Welsh Corgi" into "Pembroke Welsh Corg".
    mix_suffix = ' Mix'
    has_suffix = df.Breed.str.endswith(mix_suffix, na=False)
    df['Breed'] = df.Breed.where(~has_suffix, df.Breed.str[:-len(mix_suffix)])

    # Build the flag as one boolean expression so the column stays boolean
    # rather than having the integer 1 written into a True/False column.
    has_slash = df.Color.str.contains('/', na=False)
    has_multi_word = df.Color.str.contains(
        'multi|poly|parti|point|calico', case=False, na=False)
    df['Multicolor'] = has_slash | has_multi_word

    purify_age(df)
    purify_sex(df)
    datetime_purify(df)
    return df

# Ordered record of the preprocessing steps applied to the training frame. Later cells
# append to it, and every step after the first is replayed on the test frame.
applied_transforms = [purify_1]

In [6]:
# Run the first cleaning pass on the training data and preview the result.
X = purify_1(train)
X.head()

Unnamed: 0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Mix,Multicolor,Sterilized
0,True,2014-02-12,Return_to_owner,,Dog,0,365.0,Shetland Sheepdog,Brown/White,1,True,1
1,True,2013-10-13,Euthanasia,Suffering,Cat,1,365.0,Domestic Shorthair,Cream Tabby,1,False,1
2,True,2015-01-31,Adoption,Foster,Dog,0,730.0,Pit Bull,Blue/White,1,True,1
3,False,2014-07-11,Transfer,Partner,Cat,0,21.0,Domestic Shorthair,Blue Cream,1,False,0
4,False,2013-11-15,Transfer,Partner,Dog,0,730.0,Lhasa Apso/Miniature Poodle,Tan,1,False,1


In [None]:
# Age summary statistics per breed, showing only breeds with more than 100 recorded ages.
df = X[['Breed', 'AgeuponOutcome']].groupby('Breed').describe()
df[df['AgeuponOutcome', 'count'] > 100]

In [None]:
# Number of missing values in each column of the cleaned training frame.
X.isnull().sum()

In [None]:
# Age summary statistics for each combination of animal type and mixed-breed flag.
X[['AnimalType', 'Mix', 'AgeuponOutcome']].groupby(['AnimalType', 'Mix']).describe()

In [None]:
# Inspect the rows that carry a sentinel or missing value: Sterilized == 2, Mix == 2,
# SexuponOutcome == 2 (first rows only), and missing ages (the rows, then their summary).
# NOTE(review): purify_1 builds Mix by casting a boolean match to int, so the
# X['Mix'] == 2 selection looks like it can never match - confirm it is still wanted.
display(
    X.loc[X['Sterilized'] == 2],
    X.loc[X['Mix'] == 2],
    X.loc[X['SexuponOutcome'] == 2].head(),
    X.loc[X['AgeuponOutcome'].isnull()],
    X.loc[X['AgeuponOutcome'].isnull()].describe())

In [None]:
def impute_age(src, dst):
    """Fill missing ``AgeuponOutcome`` values in ``dst`` from ages observed in ``src``.

    Rows of ``dst`` with a missing age are grouped by (Breed, Mix, SexuponOutcome).
    Each group receives one estimate computed from the ``src`` rows with the same
    three values: the median when at least 30 ages are available, otherwise the
    mean. When ``src`` has no age at all for that combination the estimate falls
    back to all ``src`` rows of the same breed, and then to every row of ``src``,
    so a group stays missing only if ``src`` contains no ages whatsoever.

    Parameters
    ----------
    src : DataFrame providing the reference ages.
    dst : DataFrame to fill; modified in place. May be the same object as ``src``.

    Returns
    -------
    ``dst`` itself. One summary line is printed per imputed group, preceded by an
    extra line whenever a fallback sample had to be used.
    """
    # Work from a snapshot of the reference columns: when src is dst, values
    # written for one group must not leak into a later group's fallback sample.
    known = src[['Breed', 'Mix', 'SexuponOutcome', 'AgeuponOutcome']].copy()
    missing = dst['AgeuponOutcome'].isnull()

    for f, g in dst.loc[missing].groupby(['Breed', 'Mix', 'SexuponOutcome']):
        same_breed = known['Breed'] == f[0]
        exact = same_breed & (known['Mix'] == f[1]) & (known['SexuponOutcome'] == f[2])
        gtrain = known.loc[exact, 'AgeuponOutcome']

        if gtrain.count() == 0:
            # No usable age for this exact combination: widen the sample instead
            # of writing NaN back and reporting it as an imputation.
            for scope, candidate in (
                    ('same breed', known.loc[same_breed, 'AgeuponOutcome']),
                    ('all animals', known['AgeuponOutcome'])):
                if candidate.count() > 0:
                    gtrain = candidate
                    print('No ages for this group in src; falling back to %s' % scope)
                    break

        med, cnt = gtrain.median(), gtrain.count()
        if cnt < 30:
            med = gtrain.mean()

        dst.loc[missing &
                (dst['Breed'] == f[0]) &
                (dst['Mix'] == f[1]) &
                (dst['SexuponOutcome'] == f[2]),
                'AgeuponOutcome'] = med
        print('Imputed age %s from %s samples of %s %s %s' % (
              med, cnt,
              f[0],
              'Mix' if f[1] else '',
              'Male' if f[2] == 0 else ('Female' if f[2] == 1 else 'Bisexual')))
    return dst

In [None]:
# Fill the training frame's missing ages using its own recorded ages as the reference.
X = impute_age(X, X)

In [None]:
# Freeze the training columns used for age imputation now. The recorded transform only
# runs later on the test frame, after other cells have rewritten X['Breed'] and dropped
# columns from X; reading the global X at call time would then compare the test frame's
# original breed labels against already-simplified training labels.
age_reference = X[['Breed', 'Mix', 'SexuponOutcome', 'AgeuponOutcome']].copy()
applied_transforms.append(lambda X_te, src=age_reference: impute_age(src, X_te))

In [None]:
def extract_month(X):
    """Add a ``Month`` column (1-12) taken from ``DateTime`` and return the frame.

    The frame is modified in place and the same object is returned, so the
    function works both when called for its side effect and as a step in a
    ``frame = step(frame)`` chain (returning None there would lose the frame).
    """
    X['Month'] = pd.to_datetime(X['DateTime']).dt.month
    return X
# Record the step for replay on the test frame, then run it on the training frame for
# its in-place effect; the trailing semicolon keeps the cell from echoing any result.
applied_transforms.append(extract_month)
extract_month(X);

In [None]:
from ggplot import *
import ggplot as gg

In [None]:
# [ggplot(aes(x='Month'), g) +
#             geom_histogram() +
#             ggtitle(['Male', 'Female', 'Agamous'][f[1]] + ' ' + f[0])
# for f, g in X.groupby(['AnimalType', 'SexuponOutcome'])]

In [None]:
# Tag every animal's (Month, OutcomeType) pair with a "<sex> <animal type>" label, then
# draw a histogram of months for each label (rows) and outcome type (columns).
labelled_parts = []
for (animal_type, sex_code), part in X.groupby(['AnimalType', 'SexuponOutcome']):
    group_label = '%s %s' % (['Male', 'Female', 'Agamous'][sex_code], animal_type)
    labelled_parts.append(part[['Month', 'OutcomeType']].assign(Group=group_label))
mon_avg_outcomes = pd.concat(labelled_parts, axis=0)

display(mon_avg_outcomes.head())
month_histogram = ggplot(aes(x='Month'), mon_avg_outcomes) + geom_histogram()
display(month_histogram + facet_grid('Group', 'OutcomeType', scales='free_y'))

In [None]:
# NOTE(review): mon_grouper is defined but not used below - the groupby keys on the raw
# 'DateTime' values rather than on this monthly grouper. Confirm which was intended.
mon_grouper = pd.Grouper(key='DateTime', freq='M')

# Tag every animal's (DateTime, OutcomeType) pair with a "<sex> <animal type>" label.
dated_parts = []
for (animal_type, sex_code, _date), part in X.groupby(['AnimalType', 'SexuponOutcome', 'DateTime']):
    group_label = '%s %s' % (['Male', 'Female', 'Agamous'][sex_code], animal_type)
    dated_parts.append(part[['DateTime', 'OutcomeType']].assign(Group=group_label))
mon_outcomes = pd.concat(dated_parts, axis=0)

display(mon_outcomes.head())

In [None]:
# Normalised histogram of outcome dates for each group (rows) and outcome type (columns),
# with one bin per distinct date. Uses the current keyword names: seaborn renamed the
# FacetGrid `size` argument to `height`, and matplotlib replaced hist's `normed` with
# `density`; the old spellings are rejected by recent releases.
mon_outcomes_fg = sns.FacetGrid(mon_outcomes,
                                row='Group',
                                col='OutcomeType',
                                sharey=False,
                                height=4,
                                aspect=2)
mon_outcomes_fg.map(plt.hist, 'DateTime', density=True,
                    bins=mon_outcomes['DateTime'].unique().size);

In [None]:
def drop_datetime(df):
    """Remove the ``DateTime`` column from ``df`` in place and return ``df``.

    Returning the frame (instead of the None that ``drop(..., inplace=True)``
    yields) lets the step be used in a ``frame = step(frame)`` chain.
    A frame that no longer has the column is returned unchanged, so the cell
    can be re-run.
    """
    df.drop('DateTime', axis=1, inplace=True, errors='ignore')
    return df

applied_transforms.append(drop_datetime)
drop_datetime(X);

In [None]:
def simplify_mixed_breeds(df):
    """Reduce multi-breed labels such as ``"A/B"`` to the first listed breed.

    ``df['Breed']`` is rewritten in place and ``df`` is returned. Labels
    without a "/" are left unchanged.
    """
    # Keep the text before the first "/". Splitting avoids handing a compiled
    # pattern to str.replace, which pandas rejects unless regex matching is
    # explicitly enabled (it is off by default in recent releases).
    df['Breed'] = df['Breed'].str.split('/').str[0]
    return df
# Record the step for replay on the test frame, then apply it to the training frame.
applied_transforms.append(simplify_mixed_breeds)
X = simplify_mixed_breeds(X)

In [None]:
# Breed frequencies with mixed animals collapsed into one bucket: repeating a string
# 1 time keeps it and 0 times gives '', so (assuming Mix holds only 0/1, as purify_1
# produces) every mixed row is counted under the empty string.
breed_cnts = (X.Breed * (1-X.Mix)).value_counts()
# Show the counts next to the number of rows with Mix == 0.
display(breed_cnts, (1-X.Mix).sum())
# Distribution of the counts for labels that occur fewer than 15 times.
display(breed_cnts[breed_cnts < 15].hist(bins=25))
# Number of distinct labels counted (the '' bucket included when present).
display(breed_cnts.size)

In [None]:
# Colour frequencies over the training and test sets combined. The test colours are
# read straight from the CSV because X_te is only created in a later cell, so
# referring to it here fails when the notebook is run top to bottom. None of the
# preprocessing steps visible in this notebook rewrite Color, so the raw column
# carries the same values.
test_colors = pd.read_csv('test.csv', usecols=['Color'])['Color']
clrs_cnts = pd.concat((X.Color, test_colors)).value_counts()
# Cumulative histogram of the counts for colours seen fewer than 25 times.
clrs_cnts[clrs_cnts < 25].hist(cumulative=True, bins=25)
clrs_cnts

In [None]:
# Load the test data, run the same first cleaning pass on it and preview the result.
test = pd.read_csv('test.csv')
X_te = purify_1(test)
X_te.head()

In [None]:
# Number of missing values in each column of the raw test data.
test.isnull().sum()

In [None]:
# Test rows whose age is still missing after the first cleaning pass.
X_te.loc[X_te['AgeuponOutcome'].isnull()]

In [None]:
# Age statistics for rows labelled 'Domestic Shorthair', split by the mixed-breed flag.
X.loc[X['Breed'] == 'Domestic Shorthair', ['AgeuponOutcome', 'Mix']].groupby(['Mix']).describe()

In [None]:
# Age statistics for rows labelled 'Domestic Longhair', split by sex code and mixed-breed flag.
X.loc[X['Breed'] == 'Domestic Longhair', ['AgeuponOutcome', 'SexuponOutcome', 'Mix']].groupby(['SexuponOutcome', 'Mix']).describe()

In [None]:
# Replay on the test frame every step recorded after purify_1 (which was already
# applied when X_te was created).
for transform in applied_transforms[1:]:
    transformed = transform(X_te)
    # Some recorded steps modify the frame in place and return None; rebinding X_te
    # to that None would make every following step fail, so keep the current frame.
    if transformed is not None:
        X_te = transformed

In [None]:
# Preview the processed training and test frames one after the other.
display(X.head(), X_te.head())

In [None]:
# Columns of the training frame at this point in the notebook.
X.columns

In [None]:

# Re-attach the day-resolution outcome date to X as a column. The dates are parsed
# directly from the raw training frame because transform_date is a helper local to
# datetime_purify and cannot be called from here. axis=1 places the Series beside X
# (aligned on the index) instead of stacking its values as extra rows underneath, and
# any DateTime column still present in X is removed first so re-running the cell
# does not create a duplicate column.
# NOTE(review): assumes the intent was to restore the DateTime column - confirm.
outcome_dates = pd.to_datetime(train.DateTime).dt.normalize()
X = pd.concat((outcome_dates, X.drop('DateTime', axis=1, errors='ignore')), axis=1)