# Importing Data 

In [311]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

In [162]:
train_df =  pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [163]:
train_df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [164]:
test_df.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [165]:
train_df.shape

(26729, 10)

In [166]:
#Null values name
print(sum(train_df["Name"].isna()))
print(np.mean(train_df["Name"].isna()))

7691
0.2877399079651315


# Cleaning Data

In [167]:
train_new = train_df.copy()
test_new = test_df.copy()

In [168]:
train_new.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

## Dropping ID

In [169]:
train_new.drop("AnimalID", axis = 1, inplace = True)
test_new.drop("ID", axis = 1, inplace = True)

## Creating NameFlag

In [170]:
train_new["NameFlag"] = [0 if str(x) == 'nan' else 1 for x in train_new['Name']]
test_new["NameFlag"] = [0 if str(x) == 'nan' else 1 for x in test_new['Name']]

In [171]:
train_new.drop("Name", axis = 1, inplace = True)
test_new.drop("Name", axis = 1, inplace = True)

## Dropping DataTime

In [172]:
train_new.drop("DateTime", axis =1, inplace = True)
test_new.drop("DateTime", axis =1, inplace = True)

## Check Outcome's distribution

In [173]:
#Count of each outcome 
print(sum(train_new["OutcomeType"].isna()))
train_new["OutcomeType"].value_counts()

0


Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

## Drop OutcomeSubtype

In [174]:
#Count of each subtype of outcome
print(np.mean(train_new["OutcomeSubtype"].isna()))
train_new["OutcomeSubtype"].value_counts()

0.5092596056717423


Partner                7816
Foster                 1800
SCRP                   1599
Suffering              1002
Aggressive              320
Offsite                 165
In Kennel               114
Behavior                 86
Rabies Risk              74
Medical                  66
In Foster                52
Enroute                   8
Court/Investigation       6
At Vet                    4
In Surgery                3
Barn                      2
Name: OutcomeSubtype, dtype: int64

We drop outcome subtypes because:<bR>
- 50% of the outcome subtypes are missing hence we drop this columns <br>
- It's a subtype of outcome which we do not have for making predictions in the test data

In [175]:
train_new.drop("OutcomeSubtype", axis = 1, inplace = True)

## Check AnimalType's distribution

In [176]:
train_new["AnimalType"].value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

In [257]:
fit_at = OneHotEncoder(drop = "first", sparse = True).fit(np.array(train_new["AnimalType"]).reshape(-1,1))

## Check SexuponOutcome

In [178]:
train_new["SexuponOutcome"].value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64

In [179]:
train_new = train_new.dropna(subset = ["SexuponOutcome"])

In [256]:
fit_sex = OneHotEncoder(drop = "first" , sparse = True).fit(np.array(train_new["SexuponOutcome"]).reshape(-1,1))

## Normalize age

In [181]:
def clean_age(x):
    x = str(x)
    age_new = x
    if x == 'nan':
        return np.nan
    elif (x.split(' ')[1]) in ['year','years']:
        age_new = int(x.split(' ')[0])*365
    elif (x.split(' ')[1]) in ['month','months']:
        age_new = int(x.split(' ')[0])*30
    elif (x.split(' ')[1]) in ['weeks']:
        age_new = int(x.split(' ')[0])*7
    else:
        age_new =  int(x.split(' ')[0])
    return round(age_new/365,1)

In [182]:
train_new['AgeuponOutcome'] = train_new['AgeuponOutcome'].apply(clean_age)
median = np.median(train_new.loc[~(train_new['AgeuponOutcome'].isna()),'AgeuponOutcome'])
train_new['AgeuponOutcome'] =  train_new['AgeuponOutcome'].fillna(median)

In [183]:
test_new['AgeuponOutcome'] = test_new['AgeuponOutcome'].apply(clean_age)
test_new['AgeuponOutcome'] =  test_new['AgeuponOutcome'].fillna(median)

In [184]:
train_new.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,NameFlag
0,Return_to_owner,Dog,Neutered Male,1.0,Shetland Sheepdog Mix,Brown/White,1
1,Euthanasia,Cat,Spayed Female,1.0,Domestic Shorthair Mix,Cream Tabby,1
2,Adoption,Dog,Neutered Male,2.0,Pit Bull Mix,Blue/White,1
3,Transfer,Cat,Intact Male,0.1,Domestic Shorthair Mix,Blue Cream,0
4,Transfer,Dog,Neutered Male,2.0,Lhasa Apso/Miniature Poodle,Tan,0


## Handle Breed  - Create a sparse matrix for counts

In [185]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [186]:
train_new["Breed"].nunique()

1380

There are 1,380 unique values. We create a sparse matrix out of these

In [187]:
def identify_mix(x):
    x = x.lower()
    if (("mix" in x)):
        return 1
    else: 
        return 0

In [188]:
breed_mix = train_new["Breed"].apply(identify_mix)
np.mean(breed_mix)

0.8342562107153547

In [247]:
breed_mix_test = test_new["Breed"].apply(identify_mix)

In [189]:
def clean_breed(x):
    x = x.lower()
    x = x.replace("mix","")
    x = x.replace("/"," ")
    return x.strip()

In [322]:
breed = train_new["Breed"].apply(clean_breed)
breed_test = test_new["Breed"].apply(clean_breed)

In [192]:
fit_breed = CountVectorizer(min_df = 10, max_df = 0.8).fit(breed)

## Clean color and Vectorize

In [193]:
def clean_color(x):
    return x.replace("/"," ")

In [194]:
train_new["Color"] = train_new["Color"].apply(clean_color)
test_new["Color"] = test_new["Color"].apply(clean_color)

In [195]:
colors = train_new["Color"]

In [196]:
fit_c = CountVectorizer(min_df = 10, max_df = 0.8).fit(colors)

## Create sparse matrix with all the features

In [261]:
from scipy.sparse import csr_matrix, hstack
def add_feature(X, feature_to_add):
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [211]:
y = train_new["OutcomeType"]

In [262]:
train_s = fit_breed.transform(breed)
train_s = hstack((train_s , fit_c.transform(train_new["Color"])) )
train_s = hstack((train_s, fit_at.transform(np.array(train_new["AnimalType"]).reshape(-1,1))))
train_s = hstack((train_s, fit_sex.transform(np.array(train_new["SexuponOutcome"]).reshape(-1,1))))
train_s = add_feature(train_s, breed_mix)
train_s = add_feature(train_s, train_new["AgeuponOutcome"])
train_s = add_feature(train_s, train_new["NameFlag"])

In [263]:
X_train, X_test, y_train, y_test = train_test_split(train_s, y)

# Testing Basic Models

In [350]:
from sklearn.metrics import log_loss

## Logistic Regression

In [269]:
lr = LogisticRegression(max_iter = 1000)

In [351]:
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [356]:
y_pred = lr.predict_proba(X_test)

In [361]:
log_loss(y_test, y_pred)

0.8935881132505155

## Naive Bayes

In [362]:
from sklearn.naive_bayes import MultinomialNB

In [363]:
nb = MultinomialNB(alpha = 0.001)

In [364]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=0.001, class_prior=None, fit_prior=True)

In [365]:
y_pred = nb.predict_proba(X_test)

In [366]:
log_loss(y_test, y_pred)

1.2394398926309393

## SVM

In [367]:
from sklearn.svm import SVC

In [368]:
svm = SVC(C=0.1, gamma = "scale", cache_size = 600, probability = True)

In [369]:
svm.fit(X_train, y_train)

SVC(C=0.1, break_ties=False, cache_size=600, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [370]:
y_pred = svm.predict_proba(X_test)

In [371]:
log_loss(y_test, y_pred)

0.9088386296652229

## Tree based models - Random Forest and AdaBoost

In [372]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [397]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 10, max_features = 100)

In [398]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features=100,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [399]:
y_pred = rf.predict_proba(X_test)
log_loss(y_test, y_pred)

0.8367678487202428

In [400]:
ada = AdaBoostClassifier(n_estimators = 100, learning_rate = 1)

In [376]:
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
                   n_estimators=100, random_state=None)

In [377]:
y_pred = ada.predict_proba(X_test)
log_loss(y_test, y_pred)

1.5710098119669407

In [312]:
xgb = XGBClassifier()

In [378]:
xgb.fit(X_train, y_train)
y_pred = xgb.predict_proba(X_test)
log_loss(y_test, y_pred)

0.8418362367752158

## Trying Ensemble

In [314]:
from sklearn.ensemble import VotingClassifier

In [403]:
model = VotingClassifier(estimators=[('xgb', xgb),('rf', rf)], voting='soft')
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)

In [404]:
log_loss(y_test, y_pred)

0.8352142761246277

In [405]:
model.fit(train_s, y)

VotingClassifier(estimators=[('xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            learning_rate=0.1, max_delta_step=0,
                                            max_depth=3, min_child_weight=1,
                                            missing=None, n_estimators=100,
                                            n_jobs=1, nthread=None,
                                            objective='multi:softprob',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=...
                                                     criterion='gini',
                                                     max_depth=10,
                   

# Testing on Test Set

## Feature engineering

In [406]:
test_new

Unnamed: 0,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,NameFlag
0,Dog,Intact Female,0.8,Labrador Retriever Mix,Red White,1
1,Dog,Spayed Female,2.0,German Shepherd/Siberian Husky,Black Tan,1
2,Cat,Neutered Male,1.0,Domestic Shorthair Mix,Brown Tabby,1
3,Dog,Intact Male,0.3,Collie Smooth Mix,Tricolor,1
4,Dog,Neutered Male,2.0,Miniature Poodle Mix,White,1
...,...,...,...,...,...,...
11451,Cat,Neutered Male,0.2,Domestic Shorthair Mix,Black,0
11452,Cat,Intact Female,0.0,Domestic Shorthair Mix,Blue,0
11453,Cat,Intact Female,1.0,Domestic Shorthair Mix,Calico,0
11454,Dog,Neutered Male,6.0,German Shepherd Mix,Black Tan,1


In [407]:
test_s = fit_breed.transform(breed_test)
test_s = hstack((test_s , fit_c.transform(test_new["Color"])) )
test_s = hstack((test_s, fit_at.transform(np.array(test_new["AnimalType"]).reshape(-1,1))))
test_s = hstack((test_s, fit_sex.transform(np.array(test_new["SexuponOutcome"]).reshape(-1,1))))
test_s = add_feature(test_s, breed_mix_test)
test_s = add_feature(test_s, test_new["AgeuponOutcome"])
test_s = add_feature(test_s, test_new["NameFlag"])

## Running the best model

In [408]:
preds = model.predict_proba(test_s)

In [412]:
preds_df = pd.DataFrame(preds, columns = list(model.classes_) )

In [416]:
preds_df["ID"] = test_df["ID"]

In [417]:
preds_df = preds_df[["ID", "Adoption", "Died", "Euthanasia", "Return_to_owner", "Transfer"]]

In [418]:
preds_df.to_csv("Animal_predictions.csv", index = False)