In [1]:
# All the imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os


from numpy import random, delete
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import mord

In [2]:

# pre-processing, separate by animal type
train = pd.read_csv('train.csv')
print(train.shape)
train = train.dropna()
print(train.shape)

dogs_df = train.loc[train['Type'] == 1]
cats_df = train.loc[train['Type'] == 2]
print(dogs_df.shape)
print(cats_df.shape)


(14993, 24)
(13724, 24)
(7480, 24)
(6244, 24)


In [3]:
# Select certain features of interest in dogs and cats
    
# want Age, Breed1, Gender, Color1, Color2?, Color3?, MaturitySize, FurLength, Vaccinated, Dewormed, Sterilized, Health, Fee?, State, PhotoAmt
dogs_sub_df = dogs_df[['Age', 'Breed1', 'Gender', 'Color1', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'PhotoAmt']].reset_index(drop=True)
cats_sub_df = cats_df[['Age', 'Breed1', 'Gender', 'Color1', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'PhotoAmt']].reset_index(drop=True)


print(dogs_sub_df.shape)
print(cats_sub_df.shape)

dogs_labels_df = dogs_df[['AdoptionSpeed']].reset_index(drop=True)
cats_labels_df = cats_df[['AdoptionSpeed']].reset_index(drop=True)

dogs_labels_df.head()


(7480, 12)
(6244, 12)


Unnamed: 0,AdoptionSpeed
0,3
1,2
2,2
3,3
4,1


In [4]:
# function to change to binary class (adopted w/in 100 days 1, not 0)
def adoptSpeed(x):
    if x == 4: # might be too rare of a bench mark if it's 2/7 ?
        return 0
    else:
        return 1

def adoptSpeedBins(x):
    if x == 0 or x == 1: 
        return 0
    elif x == 2:
        return 1
    else:
        return 2
    
dogs_labels_2_df = dogs_labels_df.applymap(adoptSpeed)
dogs_labels_bins_df = dogs_labels_df.applymap(adoptSpeedBins)
cats_labels_2_df = cats_labels_df.applymap(adoptSpeed)
cats_labels_bins_df = cats_labels_df.applymap(adoptSpeedBins)


In [5]:
# PROCESSING INTO TRAIN AND TEST SETS 

dogs_train_orig = dogs_sub_df.values
dogs_train_labels = dogs_labels_df.values
dogs_train_2_labels = dogs_labels_2_df.values
dogs_train_bins_labels = dogs_labels_bins_df.values

cats_train_orig = cats_sub_df.values
cats_train_labels = cats_labels_df.values
cats_train_2_labels = cats_labels_2_df.values
cats_train_bins_labels = cats_labels_bins_df.values

dogs_train, dogs_test, dogs_train_labels, dogs_test_labels = train_test_split(dogs_train_orig, dogs_train_labels, test_size=0.40)
print(dogs_train.shape)
print(dogs_train_2_labels.shape)
dogs_train_2, dogs_test_2, dogs_train_2_labels, dogs_test_2_labels = train_test_split(dogs_train_orig, dogs_train_2_labels, test_size=0.40)
dogs_train_bins, dogs_test_bins, dogs_train_bins_labels, dogs_test_bins_labels = train_test_split(dogs_train_orig, dogs_train_bins_labels, test_size=0.40)


cats_train, cats_test, cats_train_labels, cats_test_labels = train_test_split(cats_train_orig, cats_train_labels, test_size=0.40)
cats_train_2, cats_test_2, cats_train_2_labels, cats_test_2_labels = train_test_split(cats_train_orig, cats_train_2_labels, test_size=0.40)
cats_train_bins, cats_test_bins, cats_train_bins_labels, cats_test_bins_labels = train_test_split(cats_train_orig, cats_train_bins_labels, test_size=0.40)

scaler_d = StandardScaler()
# Fit on training set only.
scaler_d.fit(dogs_train)
# Apply transform to both the training set and the test set.
dogs_train = scaler_d.transform(dogs_train)
dogs_test = scaler_d.transform(dogs_test)

scaler_d2 = StandardScaler()
# Fit on training set only.
scaler_d2.fit(dogs_train_2)
# Apply transform to both the training set and the test set.
dogs_train_2 = scaler_d2.transform(dogs_train_2)
dogs_test_2 = scaler_d2.transform(dogs_test_2)

scaler_dbins = StandardScaler()
# Fit on training set only.
scaler_dbins.fit(dogs_train_bins)
# Apply transform to both the training set and the test set.
dogs_train_bins = scaler_dbins.transform(dogs_train_bins)
dogs_test_bins = scaler_dbins.transform(dogs_test_bins)

scaler_c = StandardScaler()
# Fit on training set only.
scaler_c.fit(cats_train)
# Apply transform to both the training set and the test set.
cats_train = scaler_c.transform(cats_train)
cats_test = scaler_c.transform(cats_test)

scaler_c2 = StandardScaler()
# Fit on training set only.
scaler_c2.fit(cats_train_2)
# Apply transform to both the training set and the test set.
cats_train_2 = scaler_c2.transform(cats_train_2)
cats_test_2 = scaler_c2.transform(cats_test_2)

scaler_cbins = StandardScaler()
# Fit on training set only.
scaler_cbins.fit(cats_train_bins)
# Apply transform to both the training set and the test set.
cats_train_bins = scaler_cbins.transform(cats_train_bins)
cats_test_bins = scaler_cbins.transform(cats_test_bins)


print(dogs_train.shape)
print(dogs_test.shape)
print(dogs_test_labels.shape)

print(cats_train.shape)
print(cats_test.shape)
print(cats_test_labels.shape)

(4488, 12)
(7480, 1)
(4488, 12)
(2992, 12)
(2992, 1)
(3746, 12)
(2498, 12)
(2498, 1)




In [6]:
# computed on full label set DOGS

# plot result vectors...

clf = mord.LogisticAT(alpha=1) # absolute loss - 
clf.fit(dogs_train, dogs_train_labels.ravel())
print('Mean Absolute Error of LogisticAT on Training Data %s' % metrics.mean_absolute_error(clf.predict(dogs_train), dogs_train_labels))
print('Mean Absolute Error of LogisticAT on Test Data %s' % metrics.mean_absolute_error(clf.predict(dogs_test), dogs_test_labels))

clf2 = mord.LogisticIT(alpha=1) # 0-1 loss
clf2.fit(dogs_train, dogs_train_labels.ravel())
print('Mean Absolute Error of LogisticIT on Training Data %s' % metrics.mean_absolute_error(clf2.predict(dogs_train), dogs_train_labels))
print('Mean Absolute Error of LogisticIT on Test Data %s' % metrics.mean_absolute_error(clf2.predict(dogs_test), dogs_test_labels))

clf3 = mord.LogisticSE(alpha=1) # squared error
clf3.fit(dogs_train, dogs_train_labels.ravel())
print('Mean Absolute Error of LogisticSE on Training Data %s' % metrics.mean_absolute_error(clf3.predict(dogs_train), dogs_train_labels))
print('Mean Absolute Error of LogisticSE on Test Data %s' % metrics.mean_absolute_error(clf3.predict(dogs_test), dogs_test_labels))




Mean Absolute Error of LogisticAT on Training Data 0.9133244206773619
Mean Absolute Error of LogisticAT on Test Data 0.9087566844919787
Mean Absolute Error of LogisticIT on Training Data 1.1045008912655971
Mean Absolute Error of LogisticIT on Test Data 1.1620989304812834
Mean Absolute Error of LogisticSE on Training Data 0.9126559714795008
Mean Absolute Error of LogisticSE on Test Data 0.9207887700534759


In [7]:
# CATS

clf = mord.LogisticAT(alpha=1)
clf.fit(cats_train, cats_train_labels.ravel())
print('Mean Absolute Error of LogisticAT on Training Data %s' % metrics.mean_absolute_error(clf.predict(cats_train), cats_train_labels))
print('Mean Absolute Error of LogisticAT on Test Data %s' % metrics.mean_absolute_error(clf.predict(cats_test), cats_test_labels))


clf2 = mord.LogisticIT(alpha=1)
clf2.fit(cats_train, cats_train_labels.ravel())
print('Mean Absolute Error of LogisticIT on Training Data %s' % metrics.mean_absolute_error(clf2.predict(cats_train), cats_train_labels))
print('Mean Absolute Error of LogisticIT on Test Data %s' % metrics.mean_absolute_error(clf2.predict(cats_test), cats_test_labels))


clf3 = mord.LogisticSE(alpha=1)
clf3.fit(cats_train, cats_train_labels.ravel())
print('Mean Absolute Error of LogisticSE on Trianing Data %s' % metrics.mean_absolute_error(clf3.predict(cats_train), cats_train_labels))
print('Mean Absolute Error of LogisticSE on Test Data %s' % metrics.mean_absolute_error(clf3.predict(cats_test), cats_test_labels))



Mean Absolute Error of LogisticAT on Training Data 0.9684997330485852
Mean Absolute Error of LogisticAT on Test Data 0.9719775820656525
Mean Absolute Error of LogisticIT on Training Data 1.120928990923652
Mean Absolute Error of LogisticIT on Test Data 1.1120896717373898
Mean Absolute Error of LogisticSE on Trianing Data 0.9698344901227977
Mean Absolute Error of LogisticSE on Test Data 0.9711769415532426


In [8]:
temp_df = pd.DataFrame(dogs_train_labels)
temp_sub = temp_df.loc[temp_df[0]==1] #3212 rows out of 4487 hmmm
temp_sub.head() 

Unnamed: 0,0
7,1
8,1
10,1
12,1
16,1


In [9]:
# DOGS
# output for binary classification
# before cv
logistic = linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr')
logistic.fit(dogs_train_2,dogs_train_2_labels.ravel())

dogs_test_labels = dogs_test_2_labels.reshape(dogs_test_2_labels.shape[0],)
train_pred = logistic.predict(dogs_train_2)

adopt_pred = logistic.predict(dogs_test_2)

train_accuracy = metrics.accuracy_score(dogs_train_2_labels, train_pred)
print('train accuracy (before cv): ', train_accuracy)

test_accuracy = metrics.accuracy_score(dogs_test_2_labels, adopt_pred)
print('test accuracy (before cv): ', test_accuracy)

f1 = f1_score(dogs_test_2_labels, adopt_pred, average='macro')
print(f1)


# after cv
log = linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr')
parameters = {'C':[0.1, 1, 5, 10]}
clf = GridSearchCV(log, parameters, cv=5)
clf.fit(dogs_train_2,dogs_train_2_labels.ravel())

train_pred_cv = clf.predict(dogs_train_2)
adopt_pred_cv = clf.predict(dogs_test_2)
lr_train_accuracy_dogs = clf.score(dogs_train_2, dogs_train_2_labels)
lr_test_accuracy_dogs = clf.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', lr_train_accuracy_dogs)
print('test accuracy (after cv): ', lr_test_accuracy_dogs)
lr_f1_cv_dogs = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(lr_f1_cv_dogs)
print(clf.best_params_)


train accuracy (before cv):  0.7219251336898396
test accuracy (before cv):  0.7212566844919787
0.48693596075319
train accuracy (after cv):  0.7208110516934046
test accuracy (after cv):  0.7219251336898396
0.4864566236496851
{'C': 0.1}


In [10]:
# CATS
# output for binary classification
# before cv
logistic = linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr')
logistic.fit(cats_train_2,cats_train_2_labels.ravel())

cats_test_labels = cats_test_2_labels.reshape(cats_test_2_labels.shape[0],)
train_pred = logistic.predict(cats_train_2)

adopt_pred = logistic.predict(cats_test_2)

train_accuracy = metrics.accuracy_score(cats_train_2_labels, train_pred)
print('train accuracy (before cv): ', train_accuracy)

test_accuracy = metrics.accuracy_score(cats_test_2_labels, adopt_pred)
print('test accuracy (before cv): ', test_accuracy)

f1 = f1_score(cats_test_2_labels, adopt_pred, average='macro')
print(f1)


# after cv
log = linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr')
parameters = {'C':[0.1, 1, 5, 10]}
clf = GridSearchCV(log, parameters, cv=5)
clf.fit(cats_train_2,cats_train_2_labels.ravel())

train_pred_cv = clf.predict(cats_train_2)
adopt_pred_cv = clf.predict(cats_test_2)
lr_train_accuracy_cats = clf.score(cats_train_2, cats_train_2_labels)
lr_test_accuracy_cats = clf.score(cats_test_2, cats_test_2_labels)

print('train accuracy (after cv): ', lr_train_accuracy_cats)
print('test accuracy (after cv): ', lr_test_accuracy_cats)
lr_f1_cats = f1_score(cats_test_2_labels, adopt_pred_cv, average='macro')
print(lr_f1_cats)
print(clf.best_params_)


train accuracy (before cv):  0.7330485851575014
test accuracy (before cv):  0.7526020816653323
0.4713716305062459
train accuracy (after cv):  0.7335824879871863
test accuracy (after cv):  0.7526020816653323
0.4713716305062459
{'C': 0.1}


In [11]:
# TRAINING VIA Ridge Regression Classifier - DOGS
rc = RidgeClassifier()
parameters = {'alpha':[0.01, 0.1, 1, 10], 'solver' : ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga')}
clf = GridSearchCV(rc, parameters, cv=5)

clf.fit(dogs_train_2,dogs_train_2_labels.ravel())

train_pred_cv = clf.predict(dogs_train_2)
adopt_pred_cv = clf.predict(dogs_test_2)
rc_train_accuracy_dogs = clf.score(dogs_train_2, dogs_train_2_labels)
rc_test_accuracy_dogs = clf.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', rc_train_accuracy_dogs)
print('test accuracy (after cv): ', rc_test_accuracy_dogs)
rc_f1_dogs = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(rc_f1_dogs)
print(clf.best_params_)


train accuracy (after cv):  0.7176916221033868
test accuracy (after cv):  0.7185828877005348
0.46306880183326055
{'alpha': 1, 'solver': 'sparse_cg'}


In [13]:
# TRAINING VIA Ridge Regression Classifier - CATS
rc = RidgeClassifier()
parameters = {'alpha':[0.01, 0.1, 1, 10], 'solver' : ('auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga')}
clf = GridSearchCV(rc, parameters, cv=5)

clf.fit(cats_train_2,cats_train_2_labels.ravel())

train_pred_cv = clf.predict(cats_train_2)
adopt_pred_cv = clf.predict(cats_test_2)
rc_train_accuracy_cats = clf.score(cats_train_2, cats_train_2_labels)
rc_test_accuracy_cats = clf.score(cats_test_2, cats_test_2_labels)

print('train accuracy (after cv): ', rc_train_accuracy_cats)
print('test accuracy (after cv): ', rc_test_accuracy_cats)
rc_f1_cats = f1_score(cats_test_2_labels, adopt_pred_cv, average='macro')
print(rc_f1_cats)
print(clf.best_params_)

train accuracy (after cv):  0.7325146823278164
test accuracy (after cv):  0.7522017614091273
0.45758953079622255
{'alpha': 0.01, 'solver': 'auto'}


In [15]:
# TRAINING VIA NEAREST NEIGHBORS - DOGS

neigh = KNeighborsClassifier()
parameters = {'n_neighbors':(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
clf = GridSearchCV(neigh, parameters, cv=5)

clf.fit(dogs_train_2,dogs_train_2_labels.ravel())

train_pred_cv = clf.predict(dogs_train_2)
adopt_pred_cv = clf.predict(dogs_test_2)
nn_train_accuracy_dogs = clf.score(dogs_train_2, dogs_train_2_labels)
nn_test_accuracy_dogs = clf.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', nn_train_accuracy_dogs)
print('test accuracy (after cv): ', nn_test_accuracy_dogs)
nn_f1_dogs = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(nn_f1_dogs)
print(clf.best_params_)


train accuracy (after cv):  0.7818627450980392
test accuracy (after cv):  0.7195855614973262
0.6086489570755867
{'n_neighbors': 7}


In [16]:
# TRAINING VIA NEAREST NEIGHBORS - CATS

neigh = KNeighborsClassifier()
parameters = {'n_neighbors':(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
clf = GridSearchCV(neigh, parameters, cv=5)

clf.fit(cats_train_2,cats_train_2_labels.ravel())

train_pred_cv = clf.predict(cats_train_2)
adopt_pred_cv = clf.predict(cats_test_2)
nn_train_accuracy_cats = clf.score(cats_train_2, cats_train_2_labels)
nn_test_accuracy_cats = clf.score(cats_test_2, cats_test_2_labels)

print('train accuracy (after cv): ', nn_train_accuracy_cats)
print('test accuracy (after cv): ', nn_test_accuracy_cats)
nn_f1_cats = f1_score(cats_test_2_labels, adopt_pred_cv, average='macro')
print(nn_f1_cats)
print(clf.best_params_)

train accuracy (after cv):  0.7688200747463961
test accuracy (after cv):  0.7413931144915933
0.5711107000399731
{'n_neighbors': 9}


In [17]:
# TRAINING VIA DECISION TREES - DOGS
dec_tree = DecisionTreeClassifier(random_state=0)

parameters = {'criterion':('gini', 'entropy')}
clf = GridSearchCV(dec_tree, parameters, cv=5)

clf.fit(dogs_train_2,dogs_train_2_labels.ravel())

train_pred_cv = clf.predict(dogs_train_2)
adopt_pred_cv = clf.predict(dogs_test_2)
dt_train_accuracy_dogs = clf.score(dogs_train_2, dogs_train_2_labels)
dt_test_accuracy_dogs = clf.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', dt_train_accuracy_dogs)
print('test accuracy (after cv): ', dt_test_accuracy_dogs)
dt_f1_dogs = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(dt_f1_dogs)
print(clf.best_params_)


train accuracy (after cv):  0.9750445632798574
test accuracy (after cv):  0.6908422459893048
0.6219815102027157
{'criterion': 'entropy'}


In [18]:
# TRAINING VIA DECISION TREES - CATS
dec_tree = DecisionTreeClassifier(random_state=0)

parameters = {'criterion':('gini', 'entropy')}
clf = GridSearchCV(dec_tree, parameters, cv=5)

clf.fit(cats_train_2,cats_train_2_labels.ravel())

train_pred_cv = clf.predict(cats_train_2)
adopt_pred_cv = clf.predict(cats_test_2)
dt_train_accuracy_cats = clf.score(cats_train_2, cats_train_2_labels)
dt_test_accuracy_cats = clf.score(cats_test_2, cats_test_2_labels)

print('train accuracy (after cv): ', dt_train_accuracy_cats)
print('test accuracy (after cv): ', dt_test_accuracy_cats)
dt_f1_cats = f1_score(cats_test_2_labels, adopt_pred_cv, average='macro')
print(dt_f1_cats)
print(clf.best_params_)

train accuracy (after cv):  0.9919914575547251
test accuracy (after cv):  0.677742193755004
0.578309096837984
{'criterion': 'gini'}


In [19]:
# TRAINING VIA LINEAR SVM - DOGS
svm = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)

parameters = {'loss':('hinge', 'squared_hinge'), 'C':[0.1, 0.5, 1, 5, 10]}
clf = GridSearchCV(svm, parameters, cv=5)

clf.fit(dogs_train_2,dogs_train_2_labels.ravel())

train_pred_cv = clf.predict(dogs_train_2)
adopt_pred_cv = clf.predict(dogs_test_2)
svm_train_accuracy_dogs = clf.score(dogs_train_2, dogs_train_2_labels)
svm_test_accuracy_dogs = clf.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', svm_train_accuracy_dogs)
print('test accuracy (after cv): ', svm_test_accuracy_dogs)
svm_f1_dogs = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(svm_f1_dogs)
print(clf.best_params_)





train accuracy (after cv):  0.7181372549019608
test accuracy (after cv):  0.7192513368983957
0.4652846036994013
{'C': 0.1, 'loss': 'squared_hinge'}




In [20]:
# TRAINING VIA LINEAR SVM - CATS
svm = LinearSVC(random_state=0, tol=1e-5, max_iter=10000)

parameters = {'loss':('hinge', 'squared_hinge'), 'C':[0.1, 0.5, 1, 5, 10]}
clf = GridSearchCV(svm, parameters, cv=5)

clf.fit(cats_train_2,cats_train_2_labels.ravel())

train_pred_cv = clf.predict(cats_train_2)
adopt_pred_cv = clf.predict(cats_test_2)
svm_train_accuracy_cats = clf.score(cats_train_2, cats_train_2_labels)
svm_test_accuracy_cats = clf.score(cats_test_2, cats_test_2_labels)

print('train accuracy (after cv): ', svm_train_accuracy_cats)
print('test accuracy (after cv): ', svm_test_accuracy_cats)
svm_f1_cats = f1_score(cats_test_2_labels, adopt_pred_cv, average='macro')
print(svm_f1_cats)
print(clf.best_params_)



train accuracy (after cv):  0.7354511478910838
test accuracy (after cv):  0.754603682946357
0.4300707278120009
{'C': 0.1, 'loss': 'hinge'}


  'precision', 'predicted', average, warn_for)


In [None]:
# attempts at ensemble - voting classifier

clf1 = linear_model.LogisticRegression(solver='lbfgs', multi_class='ovr')
clf2 = DecisionTreeClassifier(random_state=0)
clf3 = KNeighborsClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('dtree', clf2), ('kneigh', clf3)], voting='hard')

params = {'lr__C': [1.0, 100.0], 'kneigh__n_neighbors': [2, 3, 4, 5]}

grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(dogs_train_2, dogs_train_2_labels.ravel())

train_pred_cv = grid.predict(dogs_train_2)
adopt_pred_cv = grid.predict(dogs_test_2)
train_accuracy_cv = grid.score(dogs_train_2, dogs_train_2_labels)
test_accuracy_cv = grid.score(dogs_test_2, dogs_test_2_labels)

print('train accuracy (after cv): ', train_accuracy_cv)
print('test accuracy (after cv): ', test_accuracy_cv)
f1_cv = f1_score(dogs_test_2_labels, adopt_pred_cv, average='macro')
print(f1_cv)
print(grid.best_params_)

In [25]:
# COMPARE MODELS FOR DOGS
print("logistic regression train accuracy: ", lr_train_accuracy_dogs)
print("logistic regression test accuracy: ", lr_test_accuracy_dogs)
print("logistic regression f1 score: ", lr_f1_cv_dogs)

print("ridge regression train accuracy: ", rc_train_accuracy_dogs)
print("ridge regression test accuracy: ", rc_test_accuracy_dogs)
print("ridge regression f1 score: ", rc_f1_dogs)

print("nearest neighbors train accuracy: ", nn_train_accuracy_dogs)
print("nearest neighbors test accuracy: ", nn_test_accuracy_dogs)
print("nearest neighbors f1 score: ", nn_f1_dogs)

print("decision tree train accuracy: ", dt_train_accuracy_dogs)
print("decision tree test accuracy: ", dt_test_accuracy_dogs)
print("decision tree f1 score: ", dt_f1_dogs)

print("svm train accuracy: ", svm_train_accuracy_dogs)
print("svm test accuracy: ", svm_test_accuracy_dogs)
print("svm f1 score: ", svm_f1_dogs)

print("DOGS: Choose logistic regression for highest test accuracy 0.7219 (f1 score 0.486), choose neighest neighbors for best balance w/ f1 score 0.719 test accuracy (f1 score 0.609)")

logistic regression train accuracy:  0.7208110516934046
logistic regression test accuracy:  0.7219251336898396
logistic regression f1 score:  0.4864566236496851
ridge regression train accuracy:  0.7176916221033868
ridge regression test accuracy:  0.7185828877005348
ridge regression f1 score:  0.46306880183326055
nearest neighbors train accuracy:  0.7818627450980392
nearest neighbors test accuracy:  0.7195855614973262
nearest neighbors f1 score:  0.6086489570755867
decision tree train accuracy:  0.9750445632798574
decision tree test accuracy:  0.6908422459893048
decision tree f1 score:  0.6219815102027157
svm train accuracy:  0.7181372549019608
svm test accuracy:  0.7192513368983957
svm f1 score:  0.4652846036994013
DOGS: Choose logistic regression for highest test accuracy 0.7219 (f1 score 0.486), choose neighest neighbors for best balance w/ f1 score 0.719 test accuracy (f1 score 0.609)


In [26]:
# COMPARE MODELS FOR CATS
print("logistic regression train accuracy: ", lr_train_accuracy_cats)
print("logistic regression test accuracy: ", lr_test_accuracy_cats)
print("logistic regression f1 score: ", lr_f1_cats)

print("ridge regression train accuracy: ", rc_train_accuracy_cats)
print("ridge regression test accuracy: ", rc_test_accuracy_cats)
print("ridge regression f1 score: ", rc_f1_cats)

print("nearest neighbors train accuracy: ", nn_train_accuracy_cats)
print("nearest neighbors test accuracy: ", nn_test_accuracy_cats)
print("nearest neighbors f1 score: ", nn_f1_cats)

print("decision tree train accuracy: ", dt_train_accuracy_cats)
print("decision tree test accuracy: ", dt_test_accuracy_cats)
print("decision tree f1 score: ", dt_f1_cats)

print("svm train accuracy: ", svm_train_accuracy_cats)
print("svm test accuracy: ", svm_test_accuracy_cats)
print("svm f1 score: ", svm_f1_cats)

print("CATS: Choose svm for highest test accuracy 0.7546 (f1 score 0.430), choose neighest neighbors for best balance w/ f1 score 0.741 test accuracy (f1 score 0.571)")

logistic regression train accuracy:  0.7335824879871863
logistic regression test accuracy:  0.7526020816653323
logistic regression f1 score:  0.4713716305062459
ridge regression train accuracy:  0.7325146823278164
ridge regression test accuracy:  0.7522017614091273
ridge regression f1 score:  0.45758953079622255
nearest neighbors train accuracy:  0.7688200747463961
nearest neighbors test accuracy:  0.7413931144915933
nearest neighbors f1 score:  0.5711107000399731
decision tree train accuracy:  0.9919914575547251
decision tree test accuracy:  0.677742193755004
decision tree f1 score:  0.578309096837984
svm train accuracy:  0.7354511478910838
svm test accuracy:  0.754603682946357
svm f1 score:  0.4300707278120009
CATS: Choose svm for highest test accuracy 0.7546 (f1 score 0.430), choose neighest neighbors for best balance w/ f1 score 0.741 test accuracy (f1 score 0.571)
