## Using Regression for Predicting Shelter Outcome ##

Inspired by Megan Risdal's notebook:
https://www.kaggle.com/mrisdal/shelter-animal-outcomes/quick-dirty-randomforest

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show the contents of the input directory (raw `ls` output decoded as UTF-8).
listing_bytes = check_output(["ls", "../input"])
print(listing_bytes.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import linear_model

In [None]:
# Read the train and test CSV files, then report each frame's (rows, columns).
animals1 = pd.read_csv('../input/train.csv')
animals2 = pd.read_csv('../input/test.csv')
for frame in (animals1, animals2):
    print(frame.shape)

In [None]:
# Preview the first rows of the training frame.
print(animals1.head())

In [None]:
# Preview the first rows of the test frame.
print(animals2.head())

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would add a stray "None" line after
# each report.
animals1.info()
animals2.info()

## Merge the dataframes animals1 & animals2 ##

**Let's rename the `AnimalID` column of the animals1 dataframe to `ID`, so that both dataframes share the same column names when they are combined.**

In [None]:
# Rename AnimalID to ID so both frames use the same identifier column name.
# Rebinding (rather than inplace=True) keeps the statement a plain expression
# and is safe to re-run: renaming a column that no longer exists is a no-op.
animals1 = animals1.rename(columns={'AnimalID': 'ID'})

# Stack the rows of both frames into one. Columns present in only one frame
# are filled with NaN for the rows of the other frame.
# concat is used instead of an outer merge on every shared column: a merge
# joins on key equality (it needs compatible key dtypes and re-sorts the rows
# by key), whereas the intent here is simply to append one frame to the other
# in their original row order with a fresh 0..n-1 index.
animals = pd.concat([animals1, animals2], ignore_index=True, sort=False)

## Let's check the columns with missing values ##

In [None]:
# DataFrame.info() prints its own report and returns None; calling it bare
# avoids printing a stray "None" afterwards.
animals.info()

In [None]:
# The Name column has missing values.
# Replace only the missing names with the placeholder 'Noname'.
# (A chained assignment of the form `a = b = value` assigns the value to every
# target, which would overwrite the whole column rather than just the gaps.)
animals['Name'] = animals['Name'].fillna('Noname')

In [None]:
# List every distinct value found in the AgeuponOutcome column.
print(animals['AgeuponOutcome'].unique())

In [None]:
# Lets convert the AgeuponOutcome into days and create a new column 
def agetodays(x):
    """Convert an age description such as '2 years' into a number of days.

    Parameters
    ----------
    x : str or any
        Text of the form '<number> <unit>', where the unit contains 'year',
        'month', 'week' or 'day' (so plural forms match too). Values without
        a ``split`` method (for example a float NaN or None) are treated as
        missing.

    Returns
    -------
    float or None
        The age in days, or None when the value is missing, has fewer than
        two whitespace-separated parts, has a non-numeric count, or uses an
        unrecognised unit.
    """
    try:
        parts = x.split()
    except AttributeError:
        # Not a string (e.g. NaN standing in for a missing value).
        return None

    # Guard against empty or single-token strings, which have no unit part.
    if len(parts) < 2:
        return None

    try:
        count = float(parts[0])
    except ValueError:
        return None

    unit = parts[1]
    # Substring matching lets 'year' also match 'years', and so on.
    days_per_unit = (
        ('year', 365),
        ('month', 365 / 12),
        ('week', 7),
        ('day', 1),
    )
    for unit_name, days in days_per_unit:
        if unit_name in unit:
            return count * days

    # Unrecognised unit.
    return None
        
# Derive the numeric age column and list its distinct values as a quick check.
animals['AgeInDays'] = animals.AgeuponOutcome.apply(agetodays)
print(animals['AgeInDays'].unique())

In [None]:
# Fill the gaps in AgeInDays with the column median.
median_age = animals['AgeInDays'].median()
animals['AgeInDays'] = animals['AgeInDays'].fillna(median_age)

# The original text column is no longer needed once AgeInDays exists.
animals.drop(columns=['AgeuponOutcome'], inplace=True)

In [None]:
# Fill missing SexuponOutcome entries with the most frequent value
# (value_counts() lists values by descending count, so index[0] is the top one).
most_common_sex = animals['SexuponOutcome'].value_counts().index[0]
animals['SexuponOutcome'] = animals['SexuponOutcome'].fillna(most_common_sex)

In [None]:
# OutcomeSubtype is not used for the prediction, so remove it.
animals.drop(columns=['OutcomeSubtype'], inplace=True)

## Let's convert the DateTime column into time-of-day and hour columns ##

In [None]:
def timetoday(x):
    """Bucket a '<date> HH:MM:SS' timestamp string into a part-of-day label.

    The hour is the integer between the first space and the first colon that
    follows it.

    Returns 'morning' for hours 6-10, 'afternoon' for 11-15, 'night' for
    16-19, and 'latenight' for every other hour.
    """
    time_part = x.split(' ')[1]
    hour = int(time_part.split(':')[0])
    if 6 <= hour <= 10:
        return 'morning'
    if 11 <= hour <= 15:
        return 'afternoon'
    if 16 <= hour <= 19:
        return 'night'
    return 'latenight'
    
# Part-of-day label derived from each timestamp string.
animals['Timeofday'] = animals['DateTime'].apply(timetoday)

# Hour as an integer, taken from character positions 11-12 of the timestamp text.
animals['hours'] = animals['DateTime'].str.slice(11, 13).astype('int')

In [None]:
# Remove the identifier and raw timestamp columns, which are not used as
# features, then review what remains.
animals.drop(columns=['ID', 'DateTime'], inplace=True)
animals.info()

In [None]:
# Convert the categorical (object-dtype) columns to integer codes for prediction.
le = LabelEncoder()
col_num = animals.select_dtypes(include=['O']).columns.values
col_num_list = list(col_num)
# Leave the target column as text here; only the feature columns are encoded in this cell.
col_num_list.remove('OutcomeType')

# NOTE: the same encoder is refit on every column, so after the loop `le`
# only holds the mapping of the last column processed.
for col in col_num_list:
    animals[col] = le.fit_transform(animals[col])
print(animals.head())

## Set Training and Testing data ##

In [None]:
# Split the combined frame back apart: rows with a known OutcomeType form the
# training data, rows without one form the testing data.
# .copy() makes each result an independent frame, so assigning to a column of
# `train` afterwards does not trigger pandas' SettingWithCopyWarning.
has_outcome = animals['OutcomeType'].notnull()
train = animals[has_outcome].copy()
test = animals[~has_outcome].copy()
print(train.shape)
print(test.shape)

## Predict OutcomeType ##

In [None]:
# Encode the target labels as integers. `le` is refit here, so from this point
# on it holds the OutcomeType mapping.
outcome_codes = le.fit_transform(train['OutcomeType'])
train['OutcomeType'] = outcome_codes

In [None]:
# Initialize the target and attribute features
target_train = ['OutcomeType']
features_train = ['Name', 'AnimalType', 'SexuponOutcome', 'Breed', 'Color', 'AgeInDays', 'Timeofday', 'hours']

# Initialize logistic regression model
log_model = linear_model.LogisticRegression()

# Train the model.
# scikit-learn expects a 1-D target array; train[target_train] is a
# single-column DataFrame (shape (n, 1)) and would raise a
# DataConversionWarning, so it is flattened before fitting.
log_model.fit(X=train[features_train],
              y=train[target_train].values.ravel())

# Check trained model intercept
print(log_model.intercept_)

# Check trained model coefficients
print(log_model.coef_)

In [None]:
# Predict an encoded outcome for every test row and show the raw codes.
encoded_preds = log_model.predict(X=test[features_train])
print(encoded_preds)

# Translate the integer codes back into the original outcome labels.
preds = le.inverse_transform(encoded_preds)
print(preds)

In [None]:
# Map the integer codes of AnimalType and SexuponOutcome back to text labels.
# Each column is converted in a single Series.replace call: values not listed
# in the mapping are left untouched, re-running the cell is a no-op, and it
# avoids writing strings element-wise into an integer column (an implicit
# dtype upcast that recent pandas releases deprecate).
animal_type_labels = {0: 'Cat', 1: 'Dog'}
sex_labels = {
    0: 'Intact Female',
    1: 'Intact Male',
    2: 'Neutered Male',
    3: 'Spayed Female',
    4: 'Unknown',
}

# Retransform the AnimalType
animals['AnimalType'] = animals['AnimalType'].replace(animal_type_labels)

# Retransform the SexuponOutcome
animals['SexuponOutcome'] = animals['SexuponOutcome'].replace(sex_labels)

In [None]:
# Fill the rows that have no OutcomeType with the predicted labels.
missing_outcome = animals['OutcomeType'].isnull()
animals.loc[missing_outcome, 'OutcomeType'] = preds

In [None]:
# Review column dtypes and non-null counts after imputing the predictions.
animals.info()

## Both cats and dogs are commonly adopted or transferred ##

In [None]:
# Count of each outcome type, grouped by animal type.
sns.countplot(x='AnimalType', hue='OutcomeType', data=animals)
plt.show()

## Animals are much more likely to be adopted if they’ve been neutered ##

In [None]:
from statsmodels.graphics.mosaicplot import mosaic

# Use a small font for the plot text.
plt.rcParams.update({'font.size': 8.0})

# Mosaic of animal type x sex/neuter status x outcome.
mosaic_columns = ['AnimalType', 'SexuponOutcome', 'OutcomeType']
mosaic(animals, mosaic_columns)
plt.xticks(rotation=90)
plt.show()