# Importing Data 

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [5]:
train_df =  pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [6]:
train_df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [7]:
test_df.head()

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan
2,3,Gus,2016-01-13 12:20:00,Cat,Neutered Male,1 year,Domestic Shorthair Mix,Brown Tabby
3,4,Pongo,2013-12-28 18:12:00,Dog,Intact Male,4 months,Collie Smooth Mix,Tricolor
4,5,Skooter,2015-09-24 17:59:00,Dog,Neutered Male,2 years,Miniature Poodle Mix,White


In [8]:
train_df.shape

(26729, 10)

In [9]:
#Null values name
print(sum(train_df["Name"].isna()))
print(np.mean(train_df["Name"].isna()))

7691
0.2877399079651315


# Cleaning Data

In [10]:
train_new = train_df.copy()

In [11]:
train_new.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

## Dropping ID

In [12]:
train_new.drop("AnimalID", axis = 1, inplace = True)

## Creating NameFlag

In [13]:
train_new["NameFlag"] = [0 if str(x) == 'nan' else 1 for x in train_df['Name']]

In [14]:
train_new.drop("Name", axis = 1, inplace = True)

## Dropping DataTime

In [15]:
train_new.drop("DateTime", axis =1, inplace = True)

## Check Outcome's distribution

In [16]:
#Count of each outcome 
print(sum(train_new["OutcomeType"].isna()))
train_new["OutcomeType"].value_counts()

0


Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

## Drop OutcomeSubtype

In [17]:
#Count of each subtype of outcome
print(np.mean(train_new["OutcomeSubtype"].isna()))
train_new["OutcomeSubtype"].value_counts()

0.5092596056717423


Partner                7816
Foster                 1800
SCRP                   1599
Suffering              1002
Aggressive              320
Offsite                 165
In Kennel               114
Behavior                 86
Rabies Risk              74
Medical                  66
In Foster                52
Enroute                   8
Court/Investigation       6
At Vet                    4
In Surgery                3
Barn                      2
Name: OutcomeSubtype, dtype: int64

We drop outcome subtypes because:<bR>
- 50% of the outcome subtypes are missing hence we drop this columns <br>
- It's a subtype of outcome which we do not have for making predictions in the test data

In [18]:
train_new.drop("OutcomeSubtype", axis = 1, inplace = True)

## Check AnimalType's distribution

In [19]:
train_new["AnimalType"].value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

## Check SexuponOutcome

In [20]:
train_new["SexuponOutcome"].value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64

In [21]:
print(sum(train_new["Breed"].isna()))
print(train_new["Breed"].nunique())

0
1380


## Normalize age

In [22]:
def clean_age(x):
    x = str(x)
    age_new = x
    if x == 'nan':
        return np.nan
    elif (x.split(' ')[1]) in ['year','years']:
        age_new = int(x.split(' ')[0])*365
    elif (x.split(' ')[1]) in ['month','months']:
        age_new = int(x.split(' ')[0])*30
    elif (x.split(' ')[1]) in ['weeks']:
        age_new = int(x.split(' ')[0])*7
    else:
        age_new =  int(x.split(' ')[0])
    return round(age_new/365,1)

In [23]:
train_new['AgeuponOutcome'] = train_new['AgeuponOutcome'].apply(clean_age)
median = np.median(train_new.loc[~(train_new['AgeuponOutcome'].isna()),'AgeuponOutcome'])
train_new['AgeuponOutcome'] =  train_new['AgeuponOutcome'].fillna(median)

In [24]:
train_new.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,NameFlag
0,Return_to_owner,Dog,Neutered Male,1.0,Shetland Sheepdog Mix,Brown/White,1
1,Euthanasia,Cat,Spayed Female,1.0,Domestic Shorthair Mix,Cream Tabby,1
2,Adoption,Dog,Neutered Male,2.0,Pit Bull Mix,Blue/White,1
3,Transfer,Cat,Intact Male,0.1,Domestic Shorthair Mix,Blue Cream,0
4,Transfer,Dog,Neutered Male,2.0,Lhasa Apso/Miniature Poodle,Tan,0


## Handle Breed  - Create a sparse matrix for counts and check for top features

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [29]:
X_train, X_test, y_train, y_test = train_test_split(train_new["Breed"], train_new["OutcomeType"])

In [30]:
fit_v = CountVectorizer(min_df = 10, max_df = 0.8, ngram_range = (1,3)).fit(X_train)

In [31]:
X_train_v = fit_v.transform(X_train)
X_test_v = fit_v.transform(X_test)

In [32]:
lr= LogisticRegression(C= 100,solver= "liblinear", multi_class = "ovr")
lr.fit(X_train_v, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
lr.score(X_test_v, y_test)

0.45204249588508155

**Top features**

In [34]:
args = lr.coef_[0].argsort()
features = np.array(fit_v.get_feature_names())

**Trying a different approach by cleaning breed**

In [35]:
breed = train_new["Breed"]
outcome = train_new["OutcomeType"]

In [36]:
def identify_mix(x):
    x = x.lower()
    if (("mix" in x)):
        return 1
    else: 
        return 0

In [37]:
breed_mix = breed.apply(identify_mix)
np.mean(breed_mix)

0.8342249990646863

In [38]:
def clean_breed(x):
    x = x.lower()
    x = x.replace("mix","")
    x = x.replace("/"," ")
    return x.strip()

In [39]:
breed = breed.apply(clean_breed)

In [40]:
from scipy.sparse import csr_matrix, hstack
def add_feature(X, feature_to_add):
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

In [41]:
breeds = pd.DataFrame({"breed": breed, "mix": breed_mix})

In [42]:
X_train, X_test, y_train, y_test = train_test_split(breeds, outcome)

In [43]:
fit_v = CountVectorizer(min_df = 10, max_df = 0.8 , ngram_range = (1,2)).fit(X_train["breed"])
X_train_v = fit_v.transform(X_train["breed"])
X_test_v = fit_v.transform(X_test["breed"])

In [44]:
X_train_new = add_feature(X_train_v, X_train["mix"])
X_test_new = add_feature(X_test_v, X_test["mix"])

In [45]:
lr= LogisticRegression(C=1,solver= "liblinear", multi_class = "ovr")

In [46]:
lr.fit(X_train_new, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [47]:
lr.score(X_test_new, y_test)

0.460721232979201

In [48]:
args = lr.coef_[0].argsort()
features = np.array(fit_v.get_feature_names())
features[args[0:20]]

array(['retriever plott', 'retriever pointer', 'catahoula labrador',
       'weimaraner', 'pekingese', 'spaniel', 'german', 'dog labrador',
       'apso', 'lhasa apso', 'lhasa', 'maltese miniature', 'shih tzu',
       'shih', 'tzu', 'ragdoll', 'boston', 'boston terrier',
       'shorthair jack', 'shorthair miniature'], dtype='<U22')

This approach adds a little benefit. We can revisit this later

## Clean color and Vectorize

In [49]:
def clean_color(x):
    return x.replace("/"," ")

In [50]:
train_new["Color"] = train_new["Color"].apply(clean_color)

In [51]:
colors = train_new["Color"]

In [52]:
X_train, X_test, y_train, y_test = train_test_split(colors, outcome)

In [53]:
fit_c = CountVectorizer(min_df = 10, max_df = 0.8).fit(colors)
X_train_v = fit_c.transform(X_train)
X_test_v = fit_c.transform(X_test)

In [54]:
lr= LogisticRegression(C=1,solver= "liblinear", multi_class = "ovr")
lr.fit(X_train_v, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
X_train_v.shape

(20046, 33)

In [56]:
lr.score(X_test_v, y_test)

0.44381265898548555

In [57]:
train_new.head()

Unnamed: 0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,NameFlag
0,Return_to_owner,Dog,Neutered Male,1.0,Shetland Sheepdog Mix,Brown White,1
1,Euthanasia,Cat,Spayed Female,1.0,Domestic Shorthair Mix,Cream Tabby,1
2,Adoption,Dog,Neutered Male,2.0,Pit Bull Mix,Blue White,1
3,Transfer,Cat,Intact Male,0.1,Domestic Shorthair Mix,Blue Cream,0
4,Transfer,Dog,Neutered Male,2.0,Lhasa Apso/Miniature Poodle,Tan,0


## Create sparse matrix with all the features

In [58]:
y = train_new["OutcomeType"]

In [59]:
dog = list(pd.get_dummies(train_new["AnimalType"], drop_first = True))

In [60]:
dog = pd.get_dummies(train_new["AnimalType"], drop_first = True).values
dog = dog[:,0]

In [61]:
#We do not use the unknown sex
X =pd.get_dummies(train_new["SexuponOutcome"]).values

In [62]:
in_f, in_m, n_m, s_f, u_f = X[:,0], X[:,1], X[:,2], X[:,3], X[:,4]

In [63]:
age = train_new["AgeuponOutcome"]

In [64]:
name = train_new["NameFlag"]

In [65]:
breed = train_new["Breed"]
breed_mix = breed.apply(identify_mix)
breed = breed.apply(clean_breed)
breeds = pd.DataFrame({"breed": breed, "mix": breed_mix})

In [66]:
dict = {"breed": breed, "mix": breed_mix, "dog_f":dog, "intact_male":in_m, "intact_female":in_f, "neutered_m":n_m, "spayed_f":s_f, "age":age, "name_f":name}

In [67]:
train_final = pd.DataFrame(dict)

In [68]:
X_train, X_test, y_train, y_test = train_test_split(train_final, outcome)

In [69]:
fit_v = CountVectorizer(min_df = 20, max_df = 0.8).fit(X_train["breed"])
X_train_v = fit_v.transform(X_train["breed"])
X_test_v = fit_v.transform(X_test["breed"])

In [70]:
X_train_v.shape

(20046, 114)

In [71]:
outcome.value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

In [72]:
X_train_v  = add_feature(X_train_v, X_train["mix"])
X_train_v  = add_feature(X_train_v, X_train["dog_f"])
X_train_v  = add_feature(X_train_v, X_train["intact_male"])
X_train_v  = add_feature(X_train_v, X_train["intact_female"])
X_train_v  = add_feature(X_train_v, X_train["neutered_m"])
X_train_v  = add_feature(X_train_v, X_train["spayed_f"])
X_train_v  = add_feature(X_train_v, X_train["age"])
X_train_v  = add_feature(X_train_v, X_train["name_f"])

In [73]:
X_train_v.shape

(20046, 122)

In [74]:
X_test_v  = add_feature(X_test_v, X_test["mix"])
X_test_v  = add_feature(X_test_v, X_test["dog_f"])
X_test_v  = add_feature(X_test_v, X_test["intact_male"])
X_test_v  = add_feature(X_test_v, X_test["intact_female"])
X_test_v  = add_feature(X_test_v, X_test["neutered_m"])
X_test_v  = add_feature(X_test_v, X_test["spayed_f"])
X_test_v  = add_feature(X_test_v, X_test["age"])
X_test_v  = add_feature(X_test_v, X_test["name_f"])

# Testing Basic Models

## Logistic Regression

In [75]:
lr.fit(X_train_v, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [76]:
lr.score(X_test_v, y_test)

0.6329492742780188

In [77]:
lr.fit(X_train.iloc[:,1:],y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
lr.score(X_test.iloc[:,1:], y_test)

0.6307047733054018

## Naive Bayes

In [79]:
from sklearn.naive_bayes import MultinomialNB

In [80]:
nb = MultinomialNB(alpha = 0.01)

In [81]:
nb.fit(X_train_v, y_train)

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [82]:
nb.score(X_test_v, y_test)

0.6058656292084393

## SVM

In [83]:
from sklearn.svm import SVC

In [84]:
svm = SVC(C=0.1, gamma = "scale", cache_size = 600)

In [85]:
svm.fit(X_train_v, y_train)

SVC(C=0.1, break_ties=False, cache_size=600, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Tree based models - Random Forest and AdaBoost

In [86]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [87]:
rf = RandomForestClassifier(n_estimators = 100)

In [88]:
rf.fit(X_train_v, y_train)
rf.score(X_test_v, y_test)

0.6105042645518479

In [89]:
ada = AdaBoostClassifier(n_estimators = 50)

In [90]:
ada.fit(X_train_v, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [91]:
ada.score(X_test_v, y_test)

0.5733951818045788