In [51]:
# All the imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


from numpy import random, delete
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC

In [34]:

# Pre-processing: load the data, discard incomplete rows, split by animal type.
train = pd.read_csv('train.csv')
print(train.shape)
# NOTE(review): dropna() without `subset` removes a row when ANY column is
# missing a value, not only the columns used later -- confirm this is intended.
train = train.dropna()
print(train.shape)

# Type == 1 rows become dogs_df and Type == 2 rows become cats_df.
dogs_df = train[train['Type'].eq(1)]
cats_df = train[train['Type'].eq(2)]
print(dogs_df.shape)
print(cats_df.shape)

# Split the dogs on Age: 3 and under versus over 3.
yung_dawgs = dogs_df[dogs_df['Age'].le(3)]
old_dawgs = dogs_df[dogs_df['Age'].gt(3)]

print("yung_dawgs: ")
print(yung_dawgs.shape)
print("old_dawgs: ")
print(old_dawgs.shape)

# Exploration snippets kept for reference (not executed):
#   age_df   = dogs_df.groupby('Age').count()['Type'].rename('Count')
#   speed_df = dogs_df.groupby('AdoptionSpeed').count()['Type'].rename('Count')

(14993, 24)
(13724, 24)
(7480, 24)
(6244, 24)
yung_dawgs: 
(3861, 24)
old_dawgs: 
(3619, 24)


In [35]:
# Feature columns used for the dog model.
# The original wish-list also mentioned Color2, Color3 and Fee as open
# questions; they are not selected here.
dog_feature_columns = [
    'Age', 'Breed1', 'Gender', 'Color1', 'MaturitySize', 'FurLength',
    'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'PhotoAmt',
]
dogs_sub_df = dogs_df.loc[:, dog_feature_columns].reset_index(drop=True)

print(dogs_sub_df.shape)

# Target kept as a one-column DataFrame, re-indexed from 0 like the features.
dogs_labels_df = dogs_df.loc[:, ['AdoptionSpeed']].reset_index(drop=True)



(7480, 12)


In [36]:
def adoptSpeed(x):
    """Collapse an AdoptionSpeed value into a binary class.

    Returns 0 when ``x == 4`` and 1 for any other value. Per the original
    author's note, 1 stands for "adopted within 100 days" and 0 for "not".
    """
    # Open question left by the original author: class 0 might be too rare a
    # benchmark ("if it's 2/7 ?").
    return 0 if x == 4 else 1
    
# Build the binary labels from the raw AdoptionSpeed column instead of from
# dogs_labels_df itself. Re-applying adoptSpeed to labels that are already
# 0/1 (e.g. when this cell is run a second time) would turn every label into
# 1, because neither 0 nor 1 equals 4. Series.map is used in place of
# DataFrame.applymap, which newer pandas releases deprecate.
dogs_labels_df = (
    dogs_df['AdoptionSpeed']
    .map(adoptSpeed)
    .to_frame()               # keep the one-column DataFrame shape
    .reset_index(drop=True)   # re-index from 0, like the feature frame
)
dogs_labels_df.head(20)

Unnamed: 0,AdoptionSpeed
0,1
1,1
2,1
3,1
4,1
5,1
6,1
7,0
8,0
9,1


In [37]:
# PROCESSING INTO TRAIN AND TEST SETS

# Hold out 40% of the rows for testing. random_state pins the shuffle so the
# split -- and every accuracy computed from it -- is the same on each run.
dogs_train, dogs_test, dogs_train_labels, dogs_test_labels = train_test_split(
    dogs_sub_df.values,
    dogs_labels_df.values,
    test_size=0.40,
    random_state=0,
)

# Fit the scaler on the training split only, then apply the same transform to
# both splits, so no statistics from the test rows influence training.
# NOTE(review): some of these columns (e.g. Breed1, State, Color1) look like
# categorical codes; standardising them treats the codes as numeric -- confirm.
scaler = StandardScaler()
dogs_train = scaler.fit_transform(dogs_train)
dogs_test = scaler.transform(dogs_test)


print(dogs_train.shape)
print(dogs_test.shape)
print(dogs_test_labels.shape)

(4488, 12)
(2992, 12)
(2992, 1)


In [59]:
# TRAINING VIA LOGISTIC REGRESSION

# Work with 1-D label vectors throughout; ravel() flattens the (n, 1) column
# arrays and leaves an already 1-D array unchanged.
dogs_test_labels = dogs_test_labels.ravel()

# multi_class='ovr' is no longer passed: the labels are the two classes
# produced by adoptSpeed, so the option does not change the fit, and recent
# scikit-learn releases deprecate the parameter.
logistic = linear_model.LogisticRegression(solver='lbfgs')
logistic.fit(dogs_train, dogs_train_labels.ravel())

train_pred = logistic.predict(dogs_train)
adopt_pred = logistic.predict(dogs_test)

train_accuracy = metrics.accuracy_score(dogs_train_labels.ravel(), train_pred)
print('train accuracy: ', train_accuracy)

test_accuracy = metrics.accuracy_score(dogs_test_labels, adopt_pred)
print('test accuracy: ', test_accuracy)

# Cell output: the fitted coefficients, one per input feature column.
logistic.coef_

train accuracy:  0.7261586452762924
test accuracy:  0.7132352941176471


array([[-0.44074153, -0.58989625, -0.10515804,  0.02609505, -0.04541464,
         0.18059579,  0.20927958, -0.11847265,  0.11892125, -0.07890545,
        -0.15702195,  0.24904273]])

In [58]:
# TRAINING VIA Ridge Regression Classifier
clf = RidgeClassifier()
clf.fit(dogs_train, dogs_train_labels.ravel())

# Reshape the test labels to a 1-D vector with one entry per test row.
dogs_test_labels = dogs_test_labels.reshape(len(dogs_test_labels))

# Predict on both splits, then score each against its true labels.
train_pred = clf.predict(dogs_train)
adopt_pred = clf.predict(dogs_test)

train_accuracy = metrics.accuracy_score(dogs_train_labels, train_pred)
test_accuracy = metrics.accuracy_score(dogs_test_labels, adopt_pred)

print('train accuracy: ', train_accuracy)
print('test accuracy: ', test_accuracy)

train accuracy:  0.7248217468805704
test accuracy:  0.7085561497326203


In [57]:
# TRAINING VIA NEAREST NEIGHBORS
# Classifier configured to use the 3 nearest training points.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(dogs_train, dogs_train_labels.ravel())

# Reshape the test labels to a 1-D vector with one entry per test row.
dogs_test_labels = dogs_test_labels.reshape(len(dogs_test_labels))

# Predict on both splits, then score each against its true labels.
train_pred = neigh.predict(dogs_train)
adopt_pred = neigh.predict(dogs_test)

train_accuracy = metrics.accuracy_score(dogs_train_labels, train_pred)
test_accuracy = metrics.accuracy_score(dogs_test_labels, adopt_pred)

print('train accuracy: ', train_accuracy)
print('test accuracy: ', test_accuracy)

train accuracy:  0.8302139037433155
test accuracy:  0.7042112299465241


In [41]:
# TRAINING VIA DECISION TREES

# Work with 1-D label vectors, as the other model cells do; ravel() flattens
# the (n, 1) column arrays and leaves an already 1-D array unchanged.
dogs_test_labels = dogs_test_labels.ravel()

# random_state fixes the tree's internal randomness so the fit is repeatable.
dec_tree = DecisionTreeClassifier(random_state=0)
dec_tree.fit(dogs_train, dogs_train_labels.ravel())

train_pred = dec_tree.predict(dogs_train)
adopt_pred = dec_tree.predict(dogs_test)

train_accuracy = metrics.accuracy_score(dogs_train_labels.ravel(), train_pred)
print('train accuracy: ', train_accuracy)

test_accuracy = metrics.accuracy_score(dogs_test_labels, adopt_pred)
print('test accuracy: ', test_accuracy)

train accuracy:  0.9763814616755794
test accuracy:  0.6764705882352942


In [60]:
# TRAINING VIA LINEAR SVM
clf = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)
clf.fit(dogs_train, dogs_train_labels.ravel())

train_pred = clf.predict(dogs_train)
adopt_pred = clf.predict(dogs_test)

# Flatten the labels right here instead of relying on another cell having
# already reshaped dogs_test_labels; ravel() leaves a 1-D array unchanged, so
# this cell gives the same result whichever cells ran before it.
train_accuracy = metrics.accuracy_score(dogs_train_labels.ravel(), train_pred)
print('train accuracy: ', train_accuracy)

test_accuracy = metrics.accuracy_score(dogs_test_labels.ravel(), adopt_pred)
print('test accuracy: ', test_accuracy)

train accuracy:  0.7243761140819964
test accuracy:  0.7105614973262032
