In [6]:
import pandas as pd
import numpy as np

from datetime import date, datetime

In [2]:
# Read the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [127]:
# Preview the first two training rows to check the column layout.
train.head(2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [183]:
# Preview the first two test rows; compare the columns with the training preview.
test.head(2)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


### Some utility functions for assisting in feature creation.
#### In the following helpers, unknown values are recoded to -999

In [308]:
# From: http://stackoverflow.com/a/28688724
# Season boundaries live in a dummy leap year so that X-02-29 (leap day) is a valid input.
Y = 2000
seasons = [
    ('winter', (date(Y, 1, 1), date(Y, 3, 20))),
    ('spring', (date(Y, 3, 21), date(Y, 6, 20))),
    ('summer', (date(Y, 6, 21), date(Y, 9, 22))),
    ('autumn', (date(Y, 9, 23), date(Y, 12, 20))),
    ('winter', (date(Y, 12, 21), date(Y, 12, 31))),
]


def get_season(now):
    """Return the season name for a ``date`` or ``datetime``.

    Only the month and day matter: the year is replaced with the dummy year
    ``Y`` before the value is looked up in ``seasons``. The result is one of
    'winter', 'spring', 'summer' or 'autumn'.
    """
    day = now.date() if isinstance(now, datetime) else now
    day = day.replace(year=Y)
    matches = (name for name, (first, last) in seasons if first <= day <= last)
    return next(matches)

def get_breed(line):
    """Split a raw ``Breed`` string into component breeds and a mix flag.

    Parameters
    ----------
    line : str
        Breed text such as ``'Shetland Sheepdog Mix'`` or
        ``'German Shepherd/Siberian Husky'``.

    Returns
    -------
    pandas.Series
        ``breed_1``, ``breed_2`` (the string ``'NaN'`` when there is no second
        breed) and ``is_mix`` (1 or 0). A Series is returned so that
        ``Series.apply`` expands the result into several columns.
    """
    words = line.strip().split(' ')

    # Only a trailing "Mix" word marks a mix. A substring test would also
    # fire on names that merely contain "Mix" and then strip a real word.
    mix = 1 if words[-1] == 'Mix' else 0
    if mix:
        # remove the trailing "Mix" marker before splitting into breeds
        words = words[:-1]

    parts = ' '.join(words).split('/')

    # NOTE(review): anything other than exactly two '/'-separated parts keeps
    # only the first part, so a three-way cross loses its other breeds.
    breed_2 = parts[1] if len(parts) == 2 else 'NaN'
    return pd.Series({'breed_1': parts[0], 'breed_2': breed_2, 'is_mix': mix})

def age_years(x):
    """Convert an ``AgeuponOutcome`` string such as ``'10 months'`` to years.

    Parameters
    ----------
    x : str
        A count followed by a unit ('year(s)', 'month(s)', 'week(s)' or
        'day(s)'). Any other unit is treated as years.

    Returns
    -------
    float
        The age in years, or ``-999.0`` when the value is missing or does not
        start with an integer, matching the unknown-value code used by the
        other helpers in this notebook.
    """
    try:
        age = int(x.split(' ')[0])
    except (AttributeError, ValueError):
        # AttributeError: missing values arrive as float NaN (no .split).
        # ValueError: empty or otherwise non-numeric leading token.
        # NOTE(review): a -999 age lands in the lowest bin if it is later
        # quantile-binned; confirm that is acceptable.
        return -999.0

    if 'month' in x:
        return age / 12.0
    if 'week' in x:
        return age / 52.0
    if 'day' in x:
        return age / 365.0
    return float(age)

def gender(string):
    """Encode the sex found in a ``SexuponOutcome`` value.

    Returns 1 when the text contains 'Female', 0 when it contains 'Male',
    and -999 for anything else, including missing (non-string) values.
    """
    try:
        # 'Female' must be tested first because it contains no 'Male'
        # (capital M), while a plain 'Male' test would miss nothing here.
        if 'Female' in string:
            return 1

        if 'Male' in string:
            return 0

        return -999

    except TypeError:
        # Missing values arrive as float NaN, which does not support `in`.
        return -999
    
def fixed(string):
    """Encode whether the animal is spayed/neutered from ``SexuponOutcome``.

    Returns 1 when the text contains 'Spayed' or 'Neutered', -999 when it
    contains 'Unknown' or the value is missing (non-string), and 0 otherwise.
    """
    try:
        if 'Spayed' in string or 'Neutered' in string:
            return 1

        if 'Unknown' in string:
            return -999

        return 0
    except TypeError:
        # Missing values arrive as float NaN, which does not support `in`.
        return -999

### For simplicity's sake, we generate the new features on the combined test + train data and then split the result back into the two sets.

In [275]:
# Number of training rows; used later to split the combined frame back apart.
train_num = train.shape[0]

# Train names its key column 'AnimalID' while test names it 'ID'. Rebind to a
# renamed copy instead of mutating in place, so the original frame object is
# never modified and re-running this cell is harmless.
train = train.rename(columns={'AnimalID': 'ID'})

# Stack train on top of test; rows [0, train_num) are the training rows.
all_data = pd.concat((train, test), axis=0, ignore_index=True)

# create a new data frame to store our new features.
new_data = pd.DataFrame()

# We need an index to work with
new_data['ID'] = all_data['ID']

# Add the easy stuff to our new dataframe
# is it a cat?
new_data['is_cat'] = all_data['AnimalType'].map(lambda kind: 1 if 'Cat' in kind else 0)


def _split_color(color):
    """Split 'A/B' into color_1='A', color_2='B'; otherwise keep the whole
    string as color_1 and use the string 'NaN' for color_2."""
    parts = color.split('/')
    if len(parts) == 2:
        return pd.Series({'color_1': parts[0], 'color_2': parts[1]})
    return pd.Series({'color_1': color, 'color_2': 'NaN'})


# color stuff
new_data = new_data.join(all_data['Color'].apply(_split_color))

# lets convert the date into seasons (one dummy column per season name)
season_names = all_data['DateTime'].map(
    lambda stamp: get_season(datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')))
new_data = new_data.join(pd.get_dummies(season_names))

In [276]:
# Expand every Breed string into the columns returned by get_breed
# (breed_1, breed_2, is_mix) and attach them to the feature frame.
new_data = new_data.join(all_data['Breed'].apply(get_breed))

In [277]:
# Convert the age strings to a numeric age in years. The helper defined in
# this notebook is `age_years`; the previous name `age_in_years` is not
# defined anywhere, so the cell failed on a fresh kernel.
new_data['age_years'] = all_data['AgeuponOutcome'].map(age_years)

In [278]:
# Sex indicator produced by gender(): 1 = female, 0 = male, -999 = unknown.
new_data['is_female'] = all_data['SexuponOutcome'].apply(gender)

In [279]:
# Spayed/neutered indicator produced by fixed(): 1 = fixed, 0 = not, -999 = unknown.
new_data['is_fixed'] = all_data['SexuponOutcome'].apply(fixed)

## We are going to recode breed and color so we have a sparse array.
### I am also recoding age, but it's probably not the best approach because it is using the quartiles of the two data sets.

In [282]:

# One-hot encode the primary breed, the primary colour and the age quartile,
# appending the three groups of dummy columns in that order.
# Binning age by quartiles is probably an inappropriate way to dummy code it,
# but it gets something together for testing.
new_data = (
    new_data
    .join(pd.get_dummies(new_data['breed_1'], prefix='breed'))
    .join(pd.get_dummies(new_data['color_1'], prefix='color'))
    .join(pd.get_dummies(
        pd.qcut(new_data['age_years'], 4,
                labels=["age_1", "age_2", "age_3", "age_4"])))
)

In [283]:
# Inspect the feature frame while the raw (non-dummy) columns are still present.
new_data.head()

Unnamed: 0,ID,is_cat,color_1,color_2,autumn,spring,summer,winter,breed_1,breed_2,...,color_Tortie,color_Tortie Point,color_Tricolor,color_White,color_Yellow,color_Yellow Brindle,age_1,age_2,age_3,age_4
0,A671945,0,Brown,White,0,0,0,1,Shetland Sheepdog,,...,0,0,0,0,0,0,0,1,0,0
1,A656520,1,Cream Tabby,,1,0,0,0,Domestic Shorthair,,...,0,0,0,0,0,0,0,1,0,0
2,A686464,0,Blue,White,0,0,0,1,Pit Bull,,...,0,0,0,0,0,0,0,0,1,0
3,A683430,1,Blue Cream,,0,0,1,0,Domestic Shorthair,,...,0,0,0,0,0,0,1,0,0,0
4,A667013,0,Tan,,1,0,0,0,Lhasa Apso,Miniature Poodle,...,0,0,0,0,0,0,0,0,1,0


In [287]:
# The text/continuous source columns have been dummy coded (and ID is only an
# identifier), so remove them and keep just the encoded features.
cols_to_drop = ['color_1', 'color_2', 'breed_1', 'breed_2', 'age_years', 'ID']
new_data = new_data.drop(labels=cols_to_drop, axis=1)

### Now we have a binary feature matrix.

In [310]:
# Inspect the feature frame after the raw columns were dropped.
new_data.head()

Unnamed: 0,is_cat,autumn,spring,summer,winter,is_mix,is_female,is_fixed,breed_Abyssinian,breed_Affenpinscher,...,color_Tortie,color_Tortie Point,color_Tricolor,color_White,color_Yellow,color_Yellow Brindle,age_1,age_2,age_3,age_4
0,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0


In [303]:
# Lets break things back up into our test and train data sets.
X_train_all = new_data.iloc[:train_num]
y_train_all = all_data['OutcomeType'][:train_num]

X_test_all = new_data.iloc[train_num:]
ids_test = all_data['ID'][train_num:].values

In [307]:
# Sanity check: features and labels/ids must have matching row counts per set.
X_train_all.shape, y_train_all.shape, X_test_all.shape, ids_test.shape

((26729, 300), (26729,), (11456, 300), (11456,))

### Just to get things going...

In [386]:
try:
    # Current location of train_test_split (scikit-learn 0.18 and later).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the since-removed module.
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

In [312]:
# Hold out 30% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_all, y_train_all, test_size=0.30, random_state=23)

In [412]:
# Baseline classifier: logistic regression with a fixed seed.
lr = LogisticRegression(random_state=23, max_iter=100)

In [413]:
# Fit on the training split and predict class labels for the hold-out split.
y_pred = lr.fit(X_train, y_train).predict(X_test)

In [414]:
# Hold-out accuracy: fraction of predictions that equal the true label.
# The call form of print works on both Python 2 and Python 3.
print(np.mean(y_pred == y_test.values))

0.632373113855


### Not particularly good, but whatever. To create a submission file I think we need the probability for each class.

In [415]:
# Not particularly good, but whatever. To create a submission file I think we need the
# probability for each class. First we retrain on the entire dataset, then classify the test data.
y_pred_sub = lr.fit(X_train_all, y_train_all).predict_proba(X_test_all)

### Prepare the submission file...

In [408]:
# Prepare the submission file.
# predict_proba returns one column per entry of lr.classes_, in that order, so
# take the headers from the fitted model rather than a hand-typed list that
# could silently fall out of step with the probabilities.
sub = pd.DataFrame(y_pred_sub, columns=lr.classes_)
# Put the test-set ids first, cast to integers.
sub.insert(0, 'ID', ids_test.astype(int))

In [416]:
# Check the submission layout: ID followed by one probability column per class.
sub.head()

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.022876,0.004877,0.056165,0.194388,0.721694
1,2,0.461903,0.004059,0.052411,0.262515,0.219113
2,3,0.675,0.002925,0.018321,0.051124,0.25263
3,4,0.065765,0.001519,0.031001,0.319095,0.58262
4,5,0.476858,0.001032,0.019763,0.369461,0.132887


In [417]:
# Write the submission to disk without the DataFrame index column.
sub.to_csv("submission.csv", index=False)