In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import sklearn.metrics as metrics
from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neural_network import MLPRegressor


In [2]:
#Set seed
seed = 32

# Get Subsets for Each Adoption Dataset -- All Features

In [3]:
#Load data and get data subsets
os.chdir(r"C:\Users\shbla\OneDrive\Documents\PSU\DAAN 888")
final_data = pd.read_csv("final_data.csv")
final_data = final_data.drop('Unnamed: 0', axis = 1)

#we are interested only in the Animals that are adopted
#So, filter the subset based on 'Outcome Type' == "Adoption"
filter_adoption = final_data['Outcome Type'] == "Adoption"
adoption_data = final_data[filter_adoption] 

#drop the Outcome Type
adoption_data = adoption_data.drop(['Outcome Type','Intake Date'], axis =1)
adoption_data

#prepare the list of the features to be used for modeeling i.e independent variables
#and the target feature i.e Duration
feat_cols = adoption_data.columns.to_list()
feat_cols.remove('Duration')

#Repeat for cat adoptions
cat_adoption_data = adoption_data[adoption_data["Animal Type_Cat"] == 1]
cat_adoption_data = cat_adoption_data.drop("Animal Type_Cat", axis = 1)
cat_adoption_data = cat_adoption_data.drop("Animal Type_Dog", axis = 1)

feat_cols_cats = cat_adoption_data.columns.to_list()
feat_cols_cats.remove('Duration')

#Repeat for dog adoptions
dog_adoption_data = adoption_data[adoption_data["Animal Type_Dog"] == 1]
dog_adoption_data = dog_adoption_data.drop("Animal Type_Dog", axis = 1)
dog_adoption_data = dog_adoption_data.drop("Animal Type_Cat", axis = 1)

feat_cols_dogs = dog_adoption_data.columns.to_list()
feat_cols_dogs.remove('Duration')

# Apply Results from Feature Selection to Each Adoption Subset

In [4]:
#Get dataset for selected features -- all adoptions
#remove irrelevant variables
adoption_data_select = adoption_data.drop(['Sex_Female','Intake Status_Unknown','Intake Type_Public Assist',
                                          'Sex_Unknown','Intake Type_Euthanasia Request','Intake Type_Abandoned',
                                          'Outcome Status_Unknown'], axis =1)
#Move duration to the first column
first_col_all = adoption_data_select.pop('Duration')
adoption_data_select.insert(0, 'Duration', first_col_all)

adoption_data_select

Unnamed: 0,Duration,Outcome Age,Intake Type_Owner Surrender,Intake Type_Stray,Intake Condition_0,Intake Condition_1,Animal Type_Cat,Animal Type_Dog,Intake Status_Intact,Intake Status_Sterile,Sex_Male,Outcome Status_Intact,Outcome Status_Sterile,New Breed_Mixed,New Breed_Multi,New Breed_Single
12,3.103873,14.916529,1,0,1,0,0,1,0,1,1,0,1,1,0,0
14,1.999925,15.114757,0,0,0,1,0,1,0,1,1,0,1,0,0,1
23,2.621074,14.783650,0,1,1,0,0,1,0,1,1,0,1,0,1,0
33,3.444960,14.871002,0,1,1,0,0,1,0,1,0,0,1,1,0,0
49,1.272469,14.008931,0,1,1,0,0,1,0,1,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126752,1.026284,6.838251,1,0,1,0,1,0,1,0,1,0,1,1,0,0
126753,1.026284,6.838251,1,0,1,0,1,0,1,0,0,0,1,1,0,0
126754,1.026284,6.838251,1,0,1,0,1,0,1,0,0,0,1,1,0,0
126756,1.026284,14.719901,1,0,0,1,0,1,0,1,1,0,1,1,0,0


In [5]:
#Get dataset for selected features -- cat adoptions
#use subset of relevant features determined during feature selection
cat_adoption_data_select = cat_adoption_data[["Duration","Outcome Age","Intake Type_Owner Surrender",
                                      "Intake Condition_0","Intake Condition_1","Intake Status_Intact",
                                      "Intake Status_Sterile","Outcome Status_Sterile","Intake Type_Stray"]]

cat_adoption_data_select

Unnamed: 0,Duration,Outcome Age,Intake Type_Owner Surrender,Intake Condition_0,Intake Condition_1,Intake Status_Intact,Intake Status_Sterile,Outcome Status_Sterile,Intake Type_Stray
56,2.920820,14.173198,0,1,0,0,1,1,1
57,1.604705,14.486417,1,1,0,0,1,1,0
73,3.801525,14.266655,0,1,0,0,1,1,1
81,1.920468,14.364476,0,1,0,0,1,1,1
90,2.562346,14.138201,1,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
126738,1.457400,5.359999,1,1,0,1,0,1,0
126743,1.604705,9.888372,0,0,1,0,1,1,1
126752,1.026284,6.838251,1,1,0,1,0,1,0
126753,1.026284,6.838251,1,1,0,1,0,1,0


In [6]:
#Get dataset for selected features -- dog adoptions
#use subset of relevant features determined during feature selection
dog_adoption_data_select = dog_adoption_data[["Duration","Outcome Age","Intake Type_Owner Surrender",
                                              "Intake Condition_0","Intake Condition_1","Intake Status_Intact",
                                              "Intake Status_Sterile","Outcome Status_Intact","Intake Type_Public Assist",
                                              "Intake Type_Stray","Sex_Female"]]

dog_adoption_data_select

Unnamed: 0,Duration,Outcome Age,Intake Type_Owner Surrender,Intake Condition_0,Intake Condition_1,Intake Status_Intact,Intake Status_Sterile,Outcome Status_Intact,Intake Type_Public Assist,Intake Type_Stray,Sex_Female
12,3.103873,14.916529,1,1,0,0,1,0,0,0,0
14,1.999925,15.114757,0,0,1,0,1,0,1,0,0
23,2.621074,14.783650,0,1,0,0,1,0,0,1,0
33,3.444960,14.871002,0,1,0,0,1,0,0,1,1
49,1.272469,14.008931,0,1,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
126740,1.457400,8.128098,1,1,0,1,0,0,0,0,0
126750,0.663872,5.636692,1,1,0,1,0,0,0,0,0
126751,0.663872,5.636692,1,1,0,1,0,0,0,0,0
126756,1.026284,14.719901,1,0,1,0,1,0,0,0,0


# Define X and y variables for each subset, create 70/30 train/test split

In [7]:
#Get X and y values for each subset and set of features
X = adoption_data[feat_cols]
y = adoption_data['Duration']

X_fs = adoption_data_select.drop('Duration',axis = 1)
y_fs = adoption_data_select['Duration']

X_cat = cat_adoption_data[feat_cols_cats]
y_cat = cat_adoption_data['Duration']

X_cat_fs = cat_adoption_data_select.drop('Duration',axis = 1)
y_cat_fs = cat_adoption_data_select['Duration']

X_dog = dog_adoption_data[feat_cols_dogs]
y_dog = dog_adoption_data['Duration']

X_dog_fs = dog_adoption_data_select.drop('Duration',axis = 1)
y_dog_fs = dog_adoption_data_select['Duration']

In [8]:
#All adoptions
print("Shape of original data",X.shape)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

Shape of original data (60407, 22)


In [9]:
#All adoptions -- with feature selection
print("Shape of original data",X_fs.shape)
# split into train and test sets
X_fs_train, X_fs_test, y_fs_train, y_fs_test = train_test_split(X_fs, y_fs, test_size=0.3, random_state=seed)

Shape of original data (60407, 15)


In [10]:
#All CAT adoptions
print("Shape of original data",X_cat.shape)
# split into train and test sets
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y_cat, test_size=0.3, random_state=seed)

Shape of original data (24154, 20)


In [11]:
#All CAT adoptions -- with feature selection
print("Shape of original data",X_cat_fs.shape)
# split into train and test sets
X_cat_fs_train, X_cat_fs_test, y_cat_fs_train, y_cat_fs_test = train_test_split(X_cat_fs, y_cat_fs, 
                                                                                test_size=0.3, random_state=seed)

Shape of original data (24154, 8)


In [12]:
#All DOG adoptions
print("Shape of original data",X_dog.shape)
# split into train and test sets
X_dog_train, X_dog_test, y_dog_train, y_dog_test = train_test_split(X_dog, y_dog, test_size=0.3, random_state=seed)

Shape of original data (36253, 20)


In [13]:
#All DOG adoptions -- with feature selection
print("Shape of original data",X_cat_fs.shape)
# split into train and test sets
X_dog_fs_train, X_dog_fs_test, y_dog_fs_train, y_dog_fs_test = train_test_split(X_dog_fs, y_dog_fs, 
                                                                                test_size=0.3, random_state=seed)

Shape of original data (24154, 8)


# Scaling of data

In [14]:
#Performing Feature Scaling -- All adoptions
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [15]:
#Performing Feature Scaling -- All adoptions -- with feature selection
from sklearn.preprocessing import StandardScaler
sc_fs = StandardScaler()
X_fs_train = sc_fs.fit_transform(X_fs_train)
X_fs_test = sc_fs.transform(X_fs_test)

In [16]:
#Performing Feature Scaling -- All CAT adoptions
from sklearn.preprocessing import StandardScaler
sc_cat = StandardScaler()
X_cat_train = sc_cat.fit_transform(X_cat_train)
X_cat_test = sc_cat.transform(X_cat_test)

In [17]:
#Performing Feature Scaling -- All CAT adoptions -- with feature selection
from sklearn.preprocessing import StandardScaler
sc_fs_cat = StandardScaler()
X_fs_cat_train = sc_fs_cat.fit_transform(X_cat_fs_train)
X_fs_cat_test = sc_fs_cat.transform(X_cat_fs_test)

In [18]:
#Performing Feature Scaling -- All DOG adoptions
from sklearn.preprocessing import StandardScaler
sc_dog = StandardScaler()
X_dog_train = sc_dog.fit_transform(X_dog_train)
X_dog_test = sc_dog.transform(X_dog_test)

In [19]:
#Performing Feature Scaling -- All DOG adoptions -- with feature selection
from sklearn.preprocessing import StandardScaler
sc_fs_dog = StandardScaler()
X_fs_dog_train = sc_fs_dog.fit_transform(X_dog_fs_train)
X_fs_dog_test = sc_fs_dog.transform(X_dog_fs_test)

# Define Function for Computing Evaluation Metrics

In [20]:
# evaluate predictions
def compute_metrics(model,data,y_train_set, y_train_pred, y_test_set, y_test_pred):
    print("Model:",model)
    print("Data:",data)
    mae_train = mean_absolute_error(y_train_set, y_train_pred)
    print('MAE Train: %.3f' % mae_train)
    mae_test = mean_absolute_error(y_test_set, y_test_pred)
    print('MAE Test: %.3f' % mae_test)
    mse_train = mean_squared_error(y_train_set, y_train_pred)
    print('MSE Train: %.3f' % mse_train)
    mse_test = mean_squared_error(y_test_set, y_test_pred)
    print('MSE Test: %.3f' % mse_test)
    r2_train = r2_score(y_train_set, y_train_pred)
    print('R-squared Train: %.3f' % r2_train)
    r2_test = r2_score(y_test_set, y_test_pred)
    print('R-squared Test: %.3f' % r2_test)

# ALL ADOPTIONS

# Lasso Regression

In [21]:
#Train a lasso regression model
lasso_all = Lasso(alpha=1)
lasso_all.fit(X_train, y_train)

Lasso(alpha=1)

In [22]:
#Use the lasso model to predict duration
lasso_all_train_pred = lasso_all.predict(X_train)
lasso_all_test_pred = lasso_all.predict(X_test)

In [23]:
compute_metrics("Lasso Regression","All Adoption - all features",
                y_train, lasso_all_train_pred, y_test, lasso_all_test_pred)

Model: Lasso Regression
Data: All Adoption - all features
MAE Train: 0.707
MAE Test: 0.705
MSE Train: 0.682
MSE Test: 0.677
R-squared Train: 0.000
R-squared Test: -0.000


In [24]:
#Now use selected features
lasso_all_fs = Lasso(alpha=1)
lasso_all_fs.fit(X_fs_train, y_fs_train)

#Use the lasso model to predict duration
lasso_all_fs_train_pred = lasso_all_fs.predict(X_fs_train)
lasso_all_fs_test_pred = lasso_all_fs.predict(X_fs_test)

In [25]:
compute_metrics("Lasso Regression","All Adoption - selected features",
                y_fs_train, lasso_all_fs_train_pred, y_fs_test, lasso_all_fs_test_pred)

Model: Lasso Regression
Data: All Adoption - selected features
MAE Train: 0.707
MAE Test: 0.705
MSE Train: 0.682
MSE Test: 0.677
R-squared Train: 0.000
R-squared Test: -0.000


# Ridge Regression

In [26]:
#Train a ridge regression model
ridge_all = Ridge()
ridge_all.fit(X_train, y_train)

ridge_all_train_pred = ridge_all.predict(X_train)
ridge_all_test_pred = ridge_all.predict(X_test)

In [27]:
compute_metrics("Ridge Regression","All Adoption - all features",
                y_train, ridge_all_train_pred, y_test, ridge_all_test_pred)

Model: Ridge Regression
Data: All Adoption - all features
MAE Train: 0.619
MAE Test: 0.624
MSE Train: 0.556
MSE Test: 0.560
R-squared Train: 0.185
R-squared Test: 0.173


In [28]:
#Now use selected features
ridge_all_fs = Ridge()
ridge_all_fs.fit(X_fs_train, y_fs_train)

ridge_all_fs_train_pred = ridge_all_fs.predict(X_fs_train)
ridge_all_fs_test_pred = ridge_all_fs.predict(X_fs_test)

In [29]:
compute_metrics("Ridge Regression","All Adoption - selected features",
                y_fs_train, ridge_all_fs_train_pred, y_fs_test, ridge_all_fs_test_pred)

Model: Ridge Regression
Data: All Adoption - selected features
MAE Train: 0.620
MAE Test: 0.624
MSE Train: 0.556
MSE Test: 0.560
R-squared Train: 0.184
R-squared Test: 0.173


# Regression Tree

In [30]:
#train regression tree model
rt = DecisionTreeRegressor(max_depth = 10, random_state = seed) 
  
# fit the regressor with X and Y data
rt.fit(X_train, y_train)

#use regression tree to predict duration
rt_all_train_pred = rt.predict(X_train)
rt_all_test_pred = rt.predict(X_test)

In [31]:
compute_metrics("Regression Tree","All Adoption - all features",y_train, rt_all_train_pred, y_test, rt_all_test_pred)

Model: Regression Tree
Data: All Adoption - all features
MAE Train: 0.443
MAE Test: 0.467
MSE Train: 0.373
MSE Test: 0.414
R-squared Train: 0.453
R-squared Test: 0.389


In [32]:
#repeat with feature selection
rt_fs = DecisionTreeRegressor(max_depth = 10, random_state = seed) 
  
# fit the regressor with X and Y data
rt_fs.fit(X_fs_train, y_fs_train)

#use regression tree to predict duration
rt_all_fs_train_pred = rt_fs.predict(X_fs_train)
rt_all_fs_test_pred = rt_fs.predict(X_fs_test)

In [33]:
compute_metrics("Regression Tree","All Adoption - selected features",
                y_fs_train, rt_all_fs_train_pred, y_fs_test, rt_all_fs_test_pred)

Model: Regression Tree
Data: All Adoption - selected features
MAE Train: 0.443
MAE Test: 0.467
MSE Train: 0.373
MSE Test: 0.413
R-squared Train: 0.453
R-squared Test: 0.390


# Random Forest

In [34]:
#Train random forest model
rf = RandomForestRegressor(max_depth = 10, random_state = seed)
rf.fit(X_train, y_train)

# Make prediction
rf_all_train_pred = rf.predict(X_train)
rf_all_test_pred = rf.predict(X_test)

In [35]:
compute_metrics("Random Forest","All Adoption - all features",y_train, rf_all_train_pred , y_test, rf_all_test_pred)

Model: Random Forest
Data: All Adoption - all features
MAE Train: 0.436
MAE Test: 0.457
MSE Train: 0.356
MSE Test: 0.392
R-squared Train: 0.478
R-squared Test: 0.421


In [36]:
#Repeat with selected features
rf_fs = RandomForestRegressor(max_depth = 10, random_state = seed)
rf_fs.fit(X_fs_train, y_fs_train)

# Make prediction
rf_all_fs_train_pred = rf_fs.predict(X_fs_train)
rf_all_fs_test_pred = rf_fs.predict(X_fs_test)

In [37]:
compute_metrics("Random Forest","All Adoption - selected features",y_fs_train, rf_all_fs_train_pred,
                y_fs_test, rf_all_fs_test_pred)

Model: Random Forest
Data: All Adoption - selected features
MAE Train: 0.436
MAE Test: 0.457
MSE Train: 0.356
MSE Test: 0.392
R-squared Train: 0.478
R-squared Test: 0.421


# Ada Boost

In [38]:
#build AdaBoost model
ada = AdaBoostRegressor(random_state=seed)
ada.fit(X_train, y_train)

param_gridada = {'n_estimators': (10, 50, 100),
              'learning_rate': (.1, .5, 1),
             }

gridada = GridSearchCV(ada, param_gridada, scoring = 'r2' )
gridada.fit(X_train,y_train)


# Make prediction
ada_all_test_pred = gridada.predict(X_test)
ada_all_train_pred = gridada.predict(X_train)

print("Best: %f using %s" % (gridada.best_score_, gridada.best_params_))

Best: 0.209443 using {'learning_rate': 0.1, 'n_estimators': 10}


In [39]:
compute_metrics("Adaboost","All Adoptions - all features",y_train, ada_all_train_pred, y_test, ada_all_test_pred)

Model: Adaboost
Data: All Adoptions - all features
MAE Train: 0.589
MAE Test: 0.592
MSE Train: 0.539
MSE Test: 0.540
R-squared Train: 0.209
R-squared Test: 0.202


In [40]:
#Repeat with feature selection
ada_fs = AdaBoostRegressor(random_state=seed)
ada_fs.fit(X_fs_train, y_fs_train)

gridadafs = GridSearchCV(ada_fs, param_gridada)
gridadafs.fit(X_fs_train, y_fs_train)

# Make prediction
ada_all_fs_test_pred = gridadafs.predict(X_fs_test)
ada_all_fs_train_pred = gridadafs.predict(X_fs_train)

print("Best: %f using %s" % (gridadafs.best_score_, gridadafs.best_params_))

Best: 0.209443 using {'learning_rate': 0.1, 'n_estimators': 10}


In [41]:
compute_metrics("Adaboost","All Adoption - selected features",y_fs_train,
                ada_all_fs_train_pred, y_fs_test, ada_all_fs_test_pred)

Model: Adaboost
Data: All Adoption - selected features
MAE Train: 0.589
MAE Test: 0.592
MSE Train: 0.539
MSE Test: 0.540
R-squared Train: 0.209
R-squared Test: 0.202


# Neural Network

In [42]:
#Train a neural network model
nn = MLPRegressor(activation = 'relu', max_iter = 10000, random_state = seed)
nn.fit(X_train, y_train)

param_gridnn = {'hidden_layer_sizes': [(20,20),(25,25), (30,30),(35,35), (40,40)],
                'learning_rate': ('constant', 'invscaling', 'adaptive')
             }

gridnn = GridSearchCV(nn, param_gridnn, scoring = 'r2' )
gridnn.fit(X_train,y_train)


# Make prediction
nn_all_test_pred = gridnn.predict(X_test)
nn_all_train_pred = gridnn.predict(X_train)

print("Best: %f using %s" % (gridnn.best_score_, gridnn.best_params_))


Best: 0.242276 using {'hidden_layer_sizes': (25, 25), 'learning_rate': 'constant'}


In [43]:
compute_metrics("Neural Network","All Adoption - all features",y_train, nn_all_train_pred, y_test, nn_all_test_pred)

Model: Neural Network
Data: All Adoption - all features
MAE Train: 0.580
MAE Test: 0.587
MSE Train: 0.512
MSE Test: 0.521
R-squared Train: 0.249
R-squared Test: 0.231


In [44]:
#Repeat with selected features
nn_fs = MLPRegressor(activation = 'relu', max_iter = 10000, random_state = seed)
nn_fs.fit(X_fs_train, y_fs_train)


gridnnfs = GridSearchCV(nn_fs, param_gridnn, scoring = 'r2' )
gridnnfs.fit(X_fs_train,y_fs_train)

# Make prediction
nn_all_fs_test_pred = nn_fs.predict(X_fs_test)
nn_all_fs_train_pred = nn_fs.predict(X_fs_train)

print("Best: %f using %s" % (gridnnfs.best_score_, gridnnfs.best_params_))


Best: 0.244742 using {'hidden_layer_sizes': (30, 30), 'learning_rate': 'constant'}


In [45]:
compute_metrics("Neural Network","All Adoption - all features",
                y_fs_train, nn_all_fs_train_pred, y_fs_test, nn_all_fs_test_pred)

Model: Neural Network
Data: All Adoption - all features
MAE Train: 0.579
MAE Test: 0.585
MSE Train: 0.513
MSE Test: 0.521
R-squared Train: 0.247
R-squared Test: 0.230


# CAT ADOPTIONS

# Lasso Regression

In [46]:
#Train a lasso regression model
lasso_cat = Lasso(alpha=1)
lasso_cat.fit(X_cat_train, y_cat_train)

Lasso(alpha=1)

In [47]:
#Use the lasso model to predict duration
lasso_cat_train_pred = lasso_cat.predict(X_cat_train)
lasso_cat_test_pred = lasso_cat.predict(X_cat_test)

In [48]:
compute_metrics("Lasso Regression","Cat Adoption - all features",
                y_cat_train, lasso_cat_train_pred, y_cat_test, lasso_cat_test_pred)

Model: Lasso Regression
Data: Cat Adoption - all features
MAE Train: 0.663
MAE Test: 0.660
MSE Train: 0.602
MSE Test: 0.601
R-squared Train: 0.000
R-squared Test: -0.000


In [49]:
#Now use selected features
lasso_cat_fs = Lasso(alpha=1)
lasso_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

#Use the lasso model to predict duration
lasso_cat_fs_train_pred = lasso_cat_fs.predict(X_cat_fs_train)
lasso_cat_fs_test_pred = lasso_cat_fs.predict(X_cat_fs_test)

In [50]:
compute_metrics("Lasso Regression","Cat Adoption - selected features",
                y_cat_fs_train, lasso_cat_fs_train_pred, y_cat_fs_test, lasso_cat_fs_test_pred)

Model: Lasso Regression
Data: Cat Adoption - selected features
MAE Train: 0.663
MAE Test: 0.660
MSE Train: 0.602
MSE Test: 0.601
R-squared Train: 0.000
R-squared Test: -0.000


# Ridge Regression

In [51]:
#Train a ridge regression model
ridge_cat = Ridge()
ridge_cat.fit(X_cat_train, y_cat_train)

ridge_cat_train_pred = ridge_cat.predict(X_cat_train)
ridge_cat_test_pred = ridge_cat.predict(X_cat_test)

In [52]:
compute_metrics("Ridge Regression","Cat Adoption - all features",
                y_cat_train, ridge_cat_train_pred, y_cat_test, ridge_cat_test_pred)

Model: Ridge Regression
Data: Cat Adoption - all features
MAE Train: 0.611
MAE Test: 0.608
MSE Train: 0.514
MSE Test: 0.512
R-squared Train: 0.145
R-squared Test: 0.149


In [53]:
#Now use selected features
ridge_cat_fs = Ridge()
ridge_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

ridge_cat_fs_train_pred = ridge_cat_fs.predict(X_cat_fs_train)
ridge_cat_fs_test_pred = ridge_cat_fs.predict(X_cat_fs_test)

In [54]:
compute_metrics("Ridge Regression","Cat Adoption - selected features",
                y_cat_fs_train, ridge_cat_fs_train_pred, y_cat_fs_test, ridge_cat_fs_test_pred)

Model: Ridge Regression
Data: Cat Adoption - selected features
MAE Train: 0.613
MAE Test: 0.610
MSE Train: 0.516
MSE Test: 0.514
R-squared Train: 0.142
R-squared Test: 0.145


# Regression Tree

In [55]:
#train regression tree model
rt_cat = DecisionTreeRegressor(max_depth = 10, random_state = 0) 
  
# fit the regressor with X and Y data
rt_cat.fit(X_cat_train, y_cat_train)

#use regression tree to predict duration
rt_cat_train_pred = rt_cat.predict(X_cat_train)
rt_cat_test_pred = rt_cat.predict(X_cat_test)

In [56]:
compute_metrics("Regression Tree","Cat Adoption - all features",y_cat_train, 
                rt_cat_train_pred, y_cat_test, rt_cat_test_pred)

Model: Regression Tree
Data: Cat Adoption - all features
MAE Train: 0.503
MAE Test: 0.542
MSE Train: 0.391
MSE Test: 0.458
R-squared Train: 0.351
R-squared Test: 0.238


In [57]:
#repeat with feature selection
rt_cat_fs = DecisionTreeRegressor(max_depth = 10, random_state = 0) 
  
# fit the regressor with X and Y data
rt_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

#use regression tree to predict duration
rt_cat_fs_train_pred = rt_cat_fs.predict(X_cat_fs_train)
rt_cat_fs_test_pred = rt_cat_fs.predict(X_cat_fs_test)

In [58]:
compute_metrics("Regression Tree","Cat Adoption - selected features",
                y_cat_fs_train, rt_cat_fs_train_pred, y_cat_fs_test, rt_cat_fs_test_pred)

Model: Regression Tree
Data: Cat Adoption - selected features
MAE Train: 0.504
MAE Test: 0.538
MSE Train: 0.392
MSE Test: 0.454
R-squared Train: 0.348
R-squared Test: 0.245


# Random Forest

In [59]:
#Train random forest model
rf_cat = RandomForestRegressor(max_depth = 10, random_state = 101)
rf_cat.fit(X_cat_train, y_cat_train)

# Make prediction
rf_cat_train_pred = rf_cat.predict(X_cat_train)
rf_cat_test_pred = rf_cat.predict(X_cat_test)

In [60]:
compute_metrics("Random Forest","Cat Adoption - all features",y_cat_train, 
                rf_cat_train_pred , y_cat_test, rf_cat_test_pred)

Model: Random Forest
Data: Cat Adoption - all features
MAE Train: 0.490
MAE Test: 0.521
MSE Train: 0.363
MSE Test: 0.418
R-squared Train: 0.397
R-squared Test: 0.304


In [61]:
#Repeat with selected features
rf_cat_fs = RandomForestRegressor(max_depth = 10, random_state = 101)
rf_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

# Make prediction
rf_cat_fs_train_pred = rf_cat_fs.predict(X_cat_fs_train)
rf_cat_fs_test_pred = rf_cat_fs.predict(X_cat_fs_test)

In [62]:
compute_metrics("Random Forest","Cat Adoption - selected features",y_cat_fs_train, rf_cat_fs_train_pred,
                y_cat_fs_test, rf_cat_fs_test_pred)

Model: Random Forest
Data: Cat Adoption - selected features
MAE Train: 0.492
MAE Test: 0.519
MSE Train: 0.366
MSE Test: 0.417
R-squared Train: 0.391
R-squared Test: 0.307


# Ada Boost

In [96]:
#build AdaBoost model
ada_cat = AdaBoostRegressor(random_state=seed)
ada_cat.fit(X_cat_train, y_cat_train)

param_gridadacat = {'n_estimators': (10, 50, 100),
              'learning_rate': (.1, .5, 1),
             }

gridadacat = GridSearchCV(ada_cat, param_gridadacat, scoring = 'r2' )
gridadacat.fit(X_cat_train,y_cat_train)


# Make prediction
ada_cat_test_pred = ada_cat.predict(X_cat_test)
ada_cat_train_pred = ada_cat.predict(X_cat_train)

print("Best: %f using %s" % (gridadacat.best_score_, gridadacat.best_params_))

Best: 0.162291 using {'learning_rate': 0.5, 'n_estimators': 10}


In [97]:
compute_metrics("Adaboost","Cat Adoption - all features",y_cat_train, 
                ada_cat_train_pred, y_cat_test, ada_cat_test_pred)

Model: Adaboost
Data: Cat Adoption - all features
MAE Train: 0.625
MAE Test: 0.622
MSE Train: 0.513
MSE Test: 0.515
R-squared Train: 0.147
R-squared Test: 0.143


In [98]:
#Repeat with feature selection
ada_cat_fs = AdaBoostRegressor(random_state=seed)
ada_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

gridadacatfs = GridSearchCV(ada_cat_fs, param_gridadacat, scoring = 'r2' )
gridadacatfs.fit(X_cat_fs_train,y_cat_fs_train)

# Make prediction
ada_cat_fs_test_pred = ada_cat_fs.predict(X_cat_fs_test)
ada_cat_fs_train_pred = ada_cat_fs.predict(X_cat_fs_train)

print("Best: %f using %s" % (gridadacatfs.best_score_, gridadacatfs.best_params_))

Best: 0.161500 using {'learning_rate': 0.5, 'n_estimators': 10}


In [99]:
compute_metrics("Adaboost","Cat Adoption - selected features",y_cat_fs_train,
                ada_cat_fs_train_pred, y_cat_fs_test, ada_cat_fs_test_pred)

Model: Adaboost
Data: Cat Adoption - selected features
MAE Train: 0.627
MAE Test: 0.625
MSE Train: 0.515
MSE Test: 0.517
R-squared Train: 0.144
R-squared Test: 0.141


# Neural Network

In [100]:
#Train a neural network model
nn_cat = MLPRegressor(activation = 'relu', max_iter = 10000, 
                                    random_state = seed)
nn_cat.fit(X_cat_train, y_cat_train)

param_gridnncat = {'hidden_layer_sizes': [(20,20),(25,25), (30,30),(35,35), (40,40)],
                'learning_rate': ('constant', 'invscaling', 'adaptive')
             }

gridnncat = GridSearchCV(nn_cat, param_gridnncat, scoring = 'r2' )
gridnncat.fit(X_cat_train,y_cat_train)

# Make prediction
nn_cat_test_pred = nn_cat.predict(X_cat_test)
nn_cat_train_pred = nn_cat.predict(X_cat_train)

print("Best: %f using %s" % (gridnncat.best_score_, gridnncat.best_params_))


Best: 0.188686 using {'hidden_layer_sizes': (40, 40), 'learning_rate': 'constant'}


In [101]:
compute_metrics("Neural Network","Cat Adoption - all features",y_cat_train, 
                nn_cat_train_pred, y_cat_test, nn_cat_test_pred)

Model: Neural Network
Data: Cat Adoption - all features
MAE Train: 0.582
MAE Test: 0.582
MSE Train: 0.482
MSE Test: 0.487
R-squared Train: 0.199
R-squared Test: 0.190


In [102]:
#Repeat with selected features
nn_cat_fs = MLPRegressor(activation = 'relu', max_iter = 10000, 
                                    random_state = seed)
nn_cat_fs.fit(X_cat_fs_train, y_cat_fs_train)

gridnncatfs = GridSearchCV(nn_cat_fs, param_gridnncat, scoring = 'r2' )
gridnncatfs.fit(X_cat_fs_train,y_cat_fs_train)

# Make prediction
nn_cat_fs_test_pred = nn_cat_fs.predict(X_cat_fs_test)
nn_cat_fs_train_pred = nn_cat_fs.predict(X_cat_fs_train)

print("Best: %f using %s" % (gridnncatfs.best_score_, gridnncatfs.best_params_))

Best: 0.187734 using {'hidden_layer_sizes': (35, 35), 'learning_rate': 'constant'}


In [103]:
compute_metrics("Neural Network","Cat Adoption - selected features",
                y_cat_fs_train, nn_cat_fs_train_pred, y_cat_fs_test, nn_cat_fs_test_pred)

Model: Neural Network
Data: Cat Adoption - selected features
MAE Train: 0.584
MAE Test: 0.580
MSE Train: 0.488
MSE Test: 0.486
R-squared Train: 0.188
R-squared Test: 0.192


# DOG ADOPTIONS

# Lasso Regression

In [71]:
#Train a lasso regression model
lasso_dog = Lasso(alpha=1)
lasso_dog.fit(X_dog_train, y_dog_train)

Lasso(alpha=1)

In [72]:
#Use the lasso model to predict duration
lasso_dog_train_pred = lasso_dog.predict(X_dog_train)
lasso_dog_test_pred = lasso_dog.predict(X_dog_test)

In [73]:
compute_metrics("Lasso Regression","Dog Adoption - all features",
                y_dog_train, lasso_dog_train_pred, y_dog_test, lasso_dog_test_pred)

Model: Lasso Regression
Data: Dog Adoption - all features
MAE Train: 0.662
MAE Test: 0.666
MSE Train: 0.658
MSE Test: 0.663
R-squared Train: 0.000
R-squared Test: -0.000


In [74]:
#Now use selected features
lasso_dog_fs = Lasso(alpha=1)
lasso_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

#Use the lasso model to predict duration
lasso_dog_fs_train_pred = lasso_dog_fs.predict(X_dog_fs_train)
lasso_dog_fs_test_pred = lasso_dog_fs.predict(X_dog_fs_test)

In [75]:
compute_metrics("Lasso Regression","Dog Adoption - selected features",
                y_dog_fs_train, lasso_dog_fs_train_pred, y_dog_fs_test, lasso_dog_fs_test_pred)

Model: Lasso Regression
Data: Dog Adoption - selected features
MAE Train: 0.662
MAE Test: 0.666
MSE Train: 0.658
MSE Test: 0.663
R-squared Train: 0.000
R-squared Test: -0.000


# Ridge Regression

In [76]:
#Train a ridge regression model
ridge_dog = Ridge()
ridge_dog.fit(X_dog_train, y_dog_train)

ridge_dog_train_pred = ridge_dog.predict(X_dog_train)
ridge_dog_test_pred = ridge_dog.predict(X_dog_test)

In [77]:
compute_metrics("Ridge Regression","Dog Adoption - all features",
                y_dog_train, ridge_dog_train_pred, y_dog_test, ridge_dog_test_pred)

Model: Ridge Regression
Data: Dog Adoption - all features
MAE Train: 0.601
MAE Test: 0.599
MSE Train: 0.563
MSE Test: 0.559
R-squared Train: 0.143
R-squared Test: 0.156


In [78]:
#Now use selected features
ridge_dog_fs = Ridge()
ridge_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

ridge_dog_fs_train_pred = ridge_dog_fs.predict(X_dog_fs_train)
ridge_dog_fs_test_pred = ridge_dog_fs.predict(X_dog_fs_test)

In [79]:
compute_metrics("Ridge Regression","Dog Adoption - selected features",
                y_dog_fs_train, ridge_dog_fs_train_pred, y_dog_fs_test, ridge_dog_fs_test_pred)

Model: Ridge Regression
Data: Dog Adoption - selected features
MAE Train: 0.602
MAE Test: 0.599
MSE Train: 0.564
MSE Test: 0.560
R-squared Train: 0.142
R-squared Test: 0.155


# Regression Tree

In [80]:
#train regression tree model
rt_dog = DecisionTreeRegressor(max_depth = 10, random_state = 0) 
  
# fit the regressor with X and Y data
rt_dog.fit(X_dog_train, y_dog_train)

#use regression tree to predict duration
rt_dog_train_pred = rt_dog.predict(X_dog_train)
rt_dog_test_pred = rt_dog.predict(X_dog_test)

In [81]:
compute_metrics("Regression Tree","Dog Adoption - all features",y_dog_train, 
                rt_dog_train_pred, y_dog_test, rt_dog_test_pred)

Model: Regression Tree
Data: Dog Adoption - all features
MAE Train: 0.385
MAE Test: 0.419
MSE Train: 0.341
MSE Test: 0.409
R-squared Train: 0.481
R-squared Test: 0.382


In [82]:
#repeat with feature selection
rt_dog_fs = DecisionTreeRegressor(max_depth = 10, random_state = 0) 
  
# fit the regressor with X and Y data
rt_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

#use regression tree to predict duration
rt_dog_fs_train_pred = rt_dog_fs.predict(X_dog_fs_train)
rt_dog_fs_test_pred = rt_dog_fs.predict(X_dog_fs_test)

In [83]:
compute_metrics("Regression Tree","Dog Adoption - selected features",
                y_dog_fs_train, rt_dog_fs_train_pred, y_dog_fs_test, rt_dog_fs_test_pred)

Model: Regression Tree
Data: Dog Adoption - selected features
MAE Train: 0.387
MAE Test: 0.417
MSE Train: 0.344
MSE Test: 0.405
R-squared Train: 0.477
R-squared Test: 0.388


# Random Forest

In [84]:
#Train random forest model
rf_dog = RandomForestRegressor(max_depth = 10, random_state = 101)
rf_dog.fit(X_dog_train, y_dog_train)

# Make prediction
rf_dog_train_pred = rf_dog.predict(X_dog_train)
rf_dog_test_pred = rf_dog.predict(X_dog_test)

In [85]:
compute_metrics("Random Forest","Dog Adoption - all features",y_dog_train, 
                rf_dog_train_pred , y_dog_test, rf_dog_test_pred)

Model: Random Forest
Data: Dog Adoption - all features
MAE Train: 0.377
MAE Test: 0.406
MSE Train: 0.321
MSE Test: 0.377
R-squared Train: 0.511
R-squared Test: 0.431


In [86]:
#Repeat with selected features
rf_dog_fs = RandomForestRegressor(max_depth = 10, random_state = 101)
rf_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

# Make prediction
rf_dog_fs_train_pred = rf_dog_fs.predict(X_dog_fs_train)
rf_dog_fs_test_pred = rf_dog_fs.predict(X_dog_fs_test)

In [87]:
compute_metrics("Random Forest","Dog Adoption - selected features",y_dog_fs_train, rf_dog_fs_train_pred,
                y_dog_fs_test, rf_dog_fs_test_pred)

Model: Random Forest
Data: Dog Adoption - selected features
MAE Train: 0.380
MAE Test: 0.406
MSE Train: 0.326
MSE Test: 0.377
R-squared Train: 0.505
R-squared Test: 0.431


# Ada Boost

In [104]:
#build AdaBoost model
ada_dog = AdaBoostRegressor(random_state=seed)
ada_dog.fit(X_dog_train, y_dog_train)

param_gridadadog = {'n_estimators': (10, 50, 100),
              'learning_rate': (.1, .5, 1),
             }

gridadadog = GridSearchCV(ada_dog, param_gridadadog, scoring = 'r2' )
gridadadog.fit(X_dog_train,y_dog_train)


# Make prediction
ada_dog_test_pred = ada_dog.predict(X_dog_test)
ada_dog_train_pred = ada_dog.predict(X_dog_train)

print("Best: %f using %s" % (gridadadog.best_score_, gridadadog.best_params_))

Best: 0.224041 using {'learning_rate': 0.1, 'n_estimators': 10}


In [105]:
compute_metrics("Adaboost","Dog Adoption - all features",y_dog_train, 
                ada_dog_train_pred, y_dog_test, ada_dog_test_pred)

Model: Adaboost
Data: Dog Adoption - all features
MAE Train: 0.581
MAE Test: 0.583
MSE Train: 0.512
MSE Test: 0.514
R-squared Train: 0.221
R-squared Test: 0.225


In [106]:
#Repeat with feature selection
ada_dog_fs = AdaBoostRegressor(random_state=seed)
ada_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

gridadadogfs = GridSearchCV(ada_dog_fs, param_gridadadog, scoring = 'r2' )
gridadadogfs.fit(X_dog_fs_train,y_dog_fs_train)

# Make prediction
ada_dog_fs_test_pred = ada_dog_fs.predict(X_dog_fs_test)
ada_dog_fs_train_pred = ada_dog_fs.predict(X_dog_fs_train)

print("Best: %f using %s" % (gridadadogfs.best_score_, gridadadogfs.best_params_))

Best: 0.224017 using {'learning_rate': 0.1, 'n_estimators': 10}


In [107]:
compute_metrics("Adaboost","Dog Adoption - selected features",y_dog_fs_train,
                ada_dog_fs_train_pred, y_dog_fs_test, ada_dog_fs_test_pred)

Model: Adaboost
Data: Dog Adoption - selected features
MAE Train: 0.611
MAE Test: 0.612
MSE Train: 0.540
MSE Test: 0.541
R-squared Train: 0.179
R-squared Test: 0.184


# Neural Network

In [108]:
#Train a neural network model
nn_dog = MLPRegressor(activation = 'relu', max_iter = 10000, 
                                    random_state = seed)
nn_dog.fit(X_dog_train, y_dog_train)

param_gridnndog = {'hidden_layer_sizes': [(20,20),(25,25), (30,30),(35,35), (40,40)],
                'learning_rate': ('constant', 'invscaling', 'adaptive')
             }

gridnndog = GridSearchCV(nn_dog, param_gridnndog, scoring = 'r2' )
gridnndog.fit(X_dog_train,y_dog_train)

# Make prediction
nn_dog_test_pred = nn_dog.predict(X_dog_test)
nn_dog_train_pred = nn_dog.predict(X_dog_train)

print("Best: %f using %s" % (gridnndog.best_score_, gridnndog.best_params_))


Best: 0.174282 using {'hidden_layer_sizes': (35, 35), 'learning_rate': 'constant'}


In [109]:
compute_metrics("Neural Network","Dog Adoption - all features",y_dog_train, 
                nn_dog_train_pred, y_dog_test, nn_dog_test_pred)

Model: Neural Network
Data: Dog Adoption - all features
MAE Train: 0.579
MAE Test: 0.583
MSE Train: 0.536
MSE Test: 0.544
R-squared Train: 0.184
R-squared Test: 0.178


In [110]:
#Repeat with selected features
nn_dog_fs = MLPRegressor(activation = 'relu', max_iter = 10000, 
                                    random_state = seed)
nn_dog_fs.fit(X_dog_fs_train, y_dog_fs_train)

gridnndogfs = GridSearchCV(nn_dog_fs, param_gridnndog, scoring = 'r2' )
gridnndogfs.fit(X_dog_fs_train,y_dog_fs_train)


# Make prediction
nn_dog_fs_test_pred = nn_dog_fs.predict(X_dog_fs_test)
nn_dog_fs_train_pred = nn_dog_fs.predict(X_dog_fs_train)

print("Best: %f using %s" % (gridnndogfs.best_score_, gridnndogfs.best_params_))

Best: 0.175950 using {'hidden_layer_sizes': (35, 35), 'learning_rate': 'constant'}


In [111]:
compute_metrics("Neural Network","Dog Adoption - selected features",
                y_dog_fs_train, nn_dog_fs_train_pred, y_dog_fs_test, nn_dog_fs_test_pred)

Model: Neural Network
Data: Dog Adoption - selected features
MAE Train: 0.590
MAE Test: 0.591
MSE Train: 0.541
MSE Test: 0.545
R-squared Train: 0.177
R-squared Test: 0.178
