In [1]:
# Import the required modules
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the preprocessed_doggy file into a DataFrame
doggy_df = pd.read_csv(
    Path("preprocessed_doggy.csv")
)

# Review the DataFrame
display(doggy_df.head())
display(doggy_df.tail())

Unnamed: 0.1,Unnamed: 0,Borough,dog_friendly,income_cat,grooming_frequency,shedding,energy_level,trainability,demeanor,size,life_expectancy,lifetime_cost,Breed
0,0,Bronx,Yes,middle,0.4,0.4,0.8,0.8,1.0,Small 9-35lb,0.6,Low,Boxer
1,1,Manhattan,Yes,high,0.8,0.2,0.6,0.6,1.0,Toy >9lb,0.8,Medium,Maltese
2,2,Manhattan,Yes,high,1.0,0.2,0.6,0.2,0.8,Toy >9lb,0.8,Medium,Yorkshire Terrier
3,3,Brooklyn,Yes,low,0.4,0.8,0.6,1.0,0.6,Small 9-35lb,0.4,Low,German Shepherd Dog
4,4,Brooklyn,Yes,high,0.4,0.4,0.4,1.0,0.8,Toy >9lb,0.8,Medium,Cavalier King Charles Spaniel


Unnamed: 0.1,Unnamed: 0,Borough,dog_friendly,income_cat,grooming_frequency,shedding,energy_level,trainability,demeanor,size,life_expectancy,lifetime_cost,Breed
275750,275750,Queens,Yes,middle,1.0,0.2,0.6,0.2,0.8,Toy >9lb,0.8,Medium,Yorkshire Terrier
275751,275751,Manhattan,Yes,high,0.4,0.4,0.4,1.0,0.8,Toy >9lb,0.8,Medium,Cavalier King Charles Spaniel
275752,275752,Brooklyn,No,middle,0.2,0.4,0.6,0.4,0.6,Toy >9lb,1.0,High,Chihuahua
275753,275753,Staten Island,No,middle,0.4,0.8,1.0,1.0,1.0,Small 9-35lb,0.6,Medium,Labrador Retriever
275754,275754,Queens,No,middle,0.4,0.2,0.6,1.0,1.0,Toy >9lb,0.8,Low,Boston Terrier


In [3]:
# Remove the Unnamed col
doggy_df = doggy_df[['Borough', 'dog_friendly', 'income_cat',
       'grooming_frequency', 'shedding', 'energy_level', 'trainability',
       'demeanor', 'size', 'life_expectancy', 'lifetime_cost', 'Breed']]

doggy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275755 entries, 0 to 275754
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Borough             275755 non-null  object 
 1   dog_friendly        275755 non-null  object 
 2   income_cat          275755 non-null  object 
 3   grooming_frequency  275755 non-null  float64
 4   shedding            275755 non-null  float64
 5   energy_level        275755 non-null  float64
 6   trainability        275755 non-null  float64
 7   demeanor            275755 non-null  float64
 8   size                275755 non-null  object 
 9   life_expectancy     275755 non-null  float64
 10  lifetime_cost       275755 non-null  object 
 11  Breed               275755 non-null  object 
dtypes: float64(6), object(6)
memory usage: 25.2+ MB


In [4]:
# Split the data into features(X) and targets(y)
# First y
y = doggy_df['Breed']

# Next X
X = doggy_df.drop(columns=['Breed'])

In [5]:
# Use LabelEncoder to convert targets(50) into a single column
# Use LabelEncoder on the y DataFrame to encode the Dog Breeds
label_encoder_df = LabelEncoder()

# Encode Labels
y_encoded = label_encoder_df.fit_transform(y)

# Test the results of the encoder
# Display first 50 of encoded column
display(y_encoded[0:50])

# Get the original labels back
display(label_encoder_df.inverse_transform(y_encoded[0:50]))

array([10, 30, 49, 24, 14, 14, 10, 10,  3, 28,  3, 25,  3, 49, 29,  9,  6,
       25, 26,  6, 14, 10, 15, 30, 15, 29, 15, 49,  3,  6, 44, 29, 44,  3,
       26, 16,  3, 23, 45, 37, 10, 15, 29, 12, 43, 39, 49, 28, 15, 30])

array(['Boxer', 'Maltese', 'Yorkshire Terrier', 'German Shepherd Dog',
       'Cavalier King Charles Spaniel', 'Cavalier King Charles Spaniel',
       'Boxer', 'Boxer', 'Beagle', 'Havanese', 'Beagle',
       'German Shorthaired Pointer', 'Beagle', 'Yorkshire Terrier',
       'Labrador Retriever', 'Boston Terrier', 'Bichon Frise',
       'German Shorthaired Pointer', 'Golden Retriever', 'Bichon Frise',
       'Cavalier King Charles Spaniel', 'Boxer', 'Chihuahua', 'Maltese',
       'Chihuahua', 'Labrador Retriever', 'Chihuahua',
       'Yorkshire Terrier', 'Beagle', 'Bichon Frise', 'Shih Tzu',
       'Labrador Retriever', 'Shih Tzu', 'Beagle', 'Golden Retriever',
       'Cocker Spaniel', 'Beagle', 'French Bulldog', 'Siberian Husky',
       'Poodle', 'Boxer', 'Chihuahua', 'Labrador Retriever', 'Bulldog',
       'Shiba Inu', 'Pug', 'Yorkshire Terrier', 'Havanese', 'Chihuahua',
       'Maltese'], dtype=object)

In [6]:
# Convert Features/Targets into a form suitable for Modeling
y = y_encoded

X = pd.get_dummies(X, dtype=int)

In [7]:
# Look at X, y data
display(y[0:10])
display(X.head(3))

array([10, 30, 49, 24, 14, 14, 10, 10,  3, 28])

Unnamed: 0,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,Borough_Bronx,Borough_Brooklyn,Borough_Manhattan,Borough_Queens,...,income_cat_low,income_cat_middle,size_Giant <75lb,size_Large 55-75lb,size_Medium 35-55lb,size_Small 9-35lb,size_Toy >9lb,lifetime_cost_High,lifetime_cost_Low,lifetime_cost_Medium
0,0.4,0.4,0.8,0.8,1.0,0.6,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.2,0.6,0.6,1.0,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
2,1.0,0.2,0.6,0.2,0.8,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1


In [8]:
# Features
X.shape

(275755, 24)

In [9]:
# Target
y.shape

(275755,)

In [10]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {'n_estimators': [50, 100, 200, 300]}

# Perform a grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter and corresponding accuracy
print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best Accuracy:", grid_search.best_score_)

Best n_estimators: 50
Best Accuracy: 0.9995938418924419


In [12]:
# Run the same analysis using results from previous model run to determine best 'n_estimators' param

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {'n_estimators': [20, 35, 50, 70]}

# Perform a grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter and corresponding accuracy
print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best Accuracy:", grid_search.best_score_)

Best n_estimators: 20
Best Accuracy: 0.9995938418924419


In [13]:
# Run the same analysis using results from previous model run to determine best 'n_estimators' param, 3rd attempt

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {'n_estimators': [10, 15, 20, 25]}

# Perform a grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter and corresponding accuracy
print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best Accuracy:", grid_search.best_score_)

Best n_estimators: 10
Best Accuracy: 0.9995938418924419


In [14]:
# Run the same analysis using results from previous model run to determine best 'n_estimators' param, final attempt

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {'n_estimators': [4, 8, 10, 12]}

# Perform a grid search with 5-fold cross-validation
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best parameter and corresponding accuracy
print("Best n_estimators:", grid_search.best_params_['n_estimators'])
print("Best Accuracy:", grid_search.best_score_)

Best n_estimators: 8
Best Accuracy: 0.9995938418924419


In [15]:
# Evaluate the Metrics for model performance
from sklearn.metrics import accuracy_score

# Create a RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=8, random_state=42)

# Fit the model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display additional metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9996953828747154
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       302
           1       1.00      1.00      1.00      1389
           2       1.00      1.00      1.00       152
           3       1.00      1.00      1.00      2312
           4       1.00      1.00      1.00       150
           5       1.00      1.00      1.00       373
           6       1.00      1.00      1.00      1230
           7       0.00      0.00      0.00        21
           8       1.00      1.00      1.00       494
           9       1.00      1.00      1.00      1315
          10       1.00      1.00      1.00       973
          11       1.00      1.00      1.00        36
          12       1.00      1.00      1.00       256
          13       1.00      1.00      1.00       244
          14       1.00      1.00      1.00      1758
          15       1.00      1.00      1.00      6050
          16       1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Perfect Accuracy on 48 Breeds, 90% and 0% on the remaining 2. What are those Breeds?

# !!!Work on This!!!

print("100% accuracy for 48 Breeds")

print("0% Accuracy in predicting the Breed:")
display(label_encoder_df.inverse_transform([7]))

print("90 Accuracy for predicting the Breed:")
display(label_encoder_df.inverse_transform([40]))

100% accuracy for 48 Breeds
0% Accuracy in predicting the Breed:


array(['Bloodhound'], dtype=object)

90 Accuracy for predicting the Breed:


array(['Rhodesian Ridgeback'], dtype=object)

In [17]:
print(np.min(y), np.max(y))

0 49


In [18]:
encoded = list(range(50))
labels = label_encoder_df.inverse_transform(encoded)

labels_encoded_df = pd.DataFrame()
labels_encoded_df['encoded'] = encoded
labels_encoded_df['labels'] = labels

display(labels_encoded_df.head(50))

Unnamed: 0,encoded,labels
0,0,Australian Cattle Dog
1,1,Australian Shepherd
2,2,Basset Hound
3,3,Beagle
4,4,Belgian Malinois
5,5,Bernese Mountain Dog
6,6,Bichon Frise
7,7,Bloodhound
8,8,Border Collie
9,9,Boston Terrier


In [19]:
# Export Label DataFrame as csv
labels_encoded_df.to_csv('encoded_labels.csv', index=False)

In [20]:
# Get feature importances
feature_importances = rf_classifier.feature_importances_

# Create a DataFrame to display feature importances
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Display the DataFrame
print(feature_importance_df)

                  Feature  Importance
0      grooming_frequency    0.168181
3            trainability    0.146335
5         life_expectancy    0.135072
4                demeanor    0.118704
1                shedding    0.113735
19      size_Small 9-35lb    0.068005
2            energy_level    0.065982
21     lifetime_cost_High    0.062561
22      lifetime_cost_Low    0.039194
20          size_Toy >9lb    0.036196
23   lifetime_cost_Medium    0.032763
18    size_Medium 35-55lb    0.005551
17     size_Large 55-75lb    0.002017
16       size_Giant <75lb    0.001781
13        income_cat_high    0.001005
8       Borough_Manhattan    0.000774
14         income_cat_low    0.000448
9          Borough_Queens    0.000448
15      income_cat_middle    0.000390
7        Borough_Brooklyn    0.000255
6           Borough_Bronx    0.000218
10  Borough_Staten Island    0.000175
12       dog_friendly_Yes    0.000114
11        dog_friendly_No    0.000094


In [21]:
# Model Attribution by Breed, Zipcode, and Borough info

# Sum up contributions of the Breed Level data
indices_breed = [3, 0, 1, 5, 4, 2, 23, 21, 19, 22, 20, 18, 17, 16]
breed_features_contribution = round(feature_importance_df.loc[indices_breed, 'Importance'].sum() * 100, 2)

# For ZipCode level data
indices_zip = [13, 14, 15, 12, 11]
zip_features_contribution = round(feature_importance_df.loc[indices_zip, 'Importance'].sum() * 100, 2)

# For Borough level data
indices_boro = [8, 9, 6, 7, 10]
borough_features_contribution = round(feature_importance_df.loc[indices_boro, 'Importance'].sum() * 100, 2)

# Print out contributions
print(f'breed level contribution: {breed_features_contribution}%')
print(f'zipcode level contribution: {zip_features_contribution}%')
print(f'Borough level contribution: {borough_features_contribution}%')

breed level contribution: 99.61%
zipcode level contribution: 0.21%
Borough level contribution: 0.19%


In [22]:
# Save the Random Forest model
import joblib

# Save the model to a file
joblib.dump(rf_classifier, 'breed_rf_model.pkl')

['breed_rf_model.pkl']

In [23]:
loaded_model = joblib.load('breed_rf_model.pkl')

print(loaded_model)

RandomForestClassifier(n_estimators=8, random_state=42)


In [24]:
# Display row 100 as a list
test_list = list(X.loc[100])
display(X.loc[100].index)
display(test_list)

Index(['grooming_frequency', 'shedding', 'energy_level', 'trainability',
       'demeanor', 'life_expectancy', 'Borough_Bronx', 'Borough_Brooklyn',
       'Borough_Manhattan', 'Borough_Queens', 'Borough_Staten Island',
       'dog_friendly_No', 'dog_friendly_Yes', 'income_cat_high',
       'income_cat_low', 'income_cat_middle', 'size_Giant <75lb',
       'size_Large 55-75lb', 'size_Medium 35-55lb', 'size_Small 9-35lb',
       'size_Toy >9lb', 'lifetime_cost_High', 'lifetime_cost_Low',
       'lifetime_cost_Medium'],
      dtype='object')

[0.4,
 0.8,
 1.0,
 1.0,
 1.0,
 0.6,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0]

In [25]:
# Find the possible values in each column
multi_col_list = ['grooming_frequency', 'shedding', 'energy_level', 'trainability','demeanor', 'life_expectancy']

for col in multi_col_list:
  print(sorted(doggy_df[col].unique()), len(doggy_df[col].unique()))

[0.2, 0.4, 0.6, 0.8, 1.0] 5
[0.2, 0.4, 0.6, 0.8, 1.0] 5
[0.2, 0.4, 0.6, 0.8, 1.0] 5
[0.2, 0.4, 0.6, 0.8, 1.0] 5
[0.4, 0.6, 0.8, 1.0] 4
[0.4, 0.6, 0.8, 1.0] 4


In [26]:
loaded_model.feature_names_

AttributeError: 'RandomForestClassifier' object has no attribute 'feature_names_'

In [27]:
# Test Prediction

# Choose traits = > [grooming_frequency, shedding, energy_level, trainability, demeanor, life_expectancy],
# choose [0.2(only available for the first 4), 0.4, 0.6, 0.8, 1.0] for each trait in order
traits = [0.6, 0.6, 0.6, 0.6, 0.6, 0.6]

# Choose for Borough => ['Borough_Bronx', 'Borough_Brooklyn', 'Borough_Manhattan', 'Borough_Queens', 'Borough_Staten Island'], place a 1 on boro
# and 0 on all other entries
borough = [0, 1, 0, 0, 0]

# Choose Dog Friendly Area = [No, Yes], place a 1 on choice and 0 on all other entries
dog_friendly = [0, 1]

# Choose income area => [High, Low, Middle], place a 1 on choice and 0 on all other entries
income = [0, 0, 1]

# Choose Dog Size => ['Giant <75lb', 'Large 55-75lb', 'Medium 35-55lb', 'Small 9-35lb', 'Toy >9lb'], place a 1 on choice and 0 on all other entries
dog_size = [0, 0, 1, 0, 0]

# Choose Lifetime Cost = ['High', 'Low', 'Medium'], place a 1 on choice and 0 on all other entries
lifetime_cost = [0, 0, 1]

# Combine the lists to create a single input to the model
choices = np.array(traits + borough + dog_friendly + income + dog_size + lifetime_cost)

# Change the shape of the array
choices = choices.reshape(1, -1)

# Model Prediction
predictions = loaded_model.predict(choices)

# Display the predictions
print(predictions)
print(labels_encoded_df['labels'].loc[predictions[0]])


[41]
Rottweiler




In [28]:
# from sklearn.ensemble import RandomForestClassifier
# loaded_model.n_features_in_

In [29]:
import joblib

# Load the model
model = joblib.load('breed_rf_model.pkl')

# Re-save the model with protocol version 4
joblib.dump(model, 'breed_rf_model.pkl', protocol=4)


['breed_rf_model.pkl']

In [30]:
import pandas as pd

# Load the encoded labels from the CSV file
labels_encoded_df = pd.read_csv('encoded_labels.csv')

# Assuming your CSV file has columns 'encoded' and 'labels'
# Display the DataFrame to ensure it's loaded correctly
print(labels_encoded_df.head())


   encoded                 labels
0        0  Australian Cattle Dog
1        1    Australian Shepherd
2        2           Basset Hound
3        3                 Beagle
4        4       Belgian Malinois


In [32]:
from flask import Flask

# Create Flask app instance
app = Flask(__name__)

# Set the template folder
app.template_folder = '/Users/sam/Desktop/FINAL_PROJECT/Project_4/templates'  # Replace '/path/to/templates' with the actual path to your templates folder

# Now you can print the template folder
print(app.template_folder)


/Users/sam/Desktop/FINAL_PROJECT/Project_4/templates
