In [1]:
#import dependencies
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

#sklearn dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
file_path = "Resources/NYC_dogs_clean.csv"

dogs_df = pd.read_csv(file_path)

dogs_df.head()

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,1,1,1753,SHADOW,M,01/01/2000 12:00:00 AM,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,12/29/2014,01/30/2016
1,2,2,2415,ROCCO,M,10/01/2011 12:00:00 AM,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,01/07/2015,01/30/2016
2,3,3,3328,LUIGI,M,09/01/2005 12:00:00 AM,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,01/17/2015,02/02/2016
3,4,4,7537,PETUNIA,F,08/01/2013 12:00:00 AM,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,03/01/2015,03/28/2016
4,5,5,8487,ROMEO,M,10/01/2008 12:00:00 AM,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,03/09/2015,03/09/2016


In [3]:
dogs_df.dtypes

Unnamed: 0                   int64
X                            int64
RowNumber                    int64
AnimalName                  object
AnimalGender                object
AnimalBirthMonth            object
BreedName                   object
Borough                     object
ZipCode                      int64
CommunityDistrict          float64
CensusTract2010            float64
NTA                         object
CityCouncilDistrict        float64
CongressionalDistrict      float64
StateSenatorialDistrict    float64
LicenseIssuedDate           object
LicenseExpiredDate          object
dtype: object

In [4]:
dogs_df.nunique()

Unnamed: 0                 121862
X                          121862
RowNumber                  121862
AnimalName                  20728
AnimalGender                    3
AnimalBirthMonth              217
BreedName                     299
Borough                         5
ZipCode                       348
CommunityDistrict              65
CensusTract2010              1309
NTA                           192
CityCouncilDistrict            51
CongressionalDistrict          13
StateSenatorialDistrict        26
LicenseIssuedDate             842
LicenseExpiredDate           2212
dtype: int64

In [6]:
#convert date columns to datetime
dogs_df["LicenseIssuedDate"] = pd.to_datetime(dogs_df["LicenseIssuedDate"])
dogs_df["LicenseExpiredDate"] = pd.to_datetime(dogs_df["LicenseExpiredDate"])
dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121862 entries, 0 to 121861
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Unnamed: 0               121862 non-null  int64         
 1   X                        121862 non-null  int64         
 2   RowNumber                121862 non-null  int64         
 3   AnimalName               121627 non-null  object        
 4   AnimalGender             121862 non-null  object        
 5   AnimalBirthMonth         121862 non-null  object        
 6   BreedName                121862 non-null  object        
 7   Borough                  121862 non-null  object        
 8   ZipCode                  121862 non-null  int64         
 9   CommunityDistrict        118546 non-null  float64       
 10  CensusTract2010          118546 non-null  float64       
 11  NTA                      118546 non-null  object        
 12  CityCouncilDistr

In [7]:
#convert animal brith month column to month only
dogs_df["AnimalBirthMonth"] = pd.DatetimeIndex(dogs_df['AnimalBirthMonth']).month

dogs_df.head(10)

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,1,1,1753,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30
1,2,2,2415,ROCCO,M,10,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30
2,3,3,3328,LUIGI,M,9,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02
3,4,4,7537,PETUNIA,F,8,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28
4,5,5,8487,ROMEO,M,10,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09
5,6,6,10503,BRANDY,M,1,Unknown,Brooklyn,11225,309.0,800.0,BK60,40.0,9.0,20.0,2015-03-27,2016-03-29
6,7,7,11682,SAM,M,5,Pug,Manhattan,10021,108.0,124.0,MN31,5.0,12.0,28.0,2015-04-06,2016-04-06
7,8,8,12307,MAY,F,5,Unknown,Staten Island,10305,502.0,74.0,SI14,50.0,11.0,23.0,2015-04-10,2016-03-30
8,9,9,12652,RUBY,F,4,Boxer,Brooklyn,11220,310.0,34.0,BK31,43.0,11.0,22.0,2015-04-13,2018-03-06
9,10,10,13839,LEO,M,1,Beagle,Bronx,10468,207.0,269.0,BX30,14.0,13.0,33.0,2015-04-21,2016-03-24


In [8]:
#what are the genders for dogs
dogs_df["AnimalGender"].value_counts()

M    66670
F    55185
         7
Name: AnimalGender, dtype: int64

In [12]:
# create a dictionary for gender
gender_enc = {
    "M" : 1,
    "F" : 2,
    " " : 3
}

#create a dictionary for Borough
borough_enc = {
    "Bronx" : 1,
    "Brooklyn" : 2,
    "Manhattan" : 3,
    "Queens" : 4,
    "Staten Island" : 5
}

In [16]:
#ecode the boroughs and the genders
dogs_df["gender_enc"] = dogs_df["AnimalGender"].apply(lambda x: gender_enc[x])
dogs_df["Borough_enc"] = dogs_df["Borough"].apply(lambda x: borough_enc[x])

dogs_df = dogs_df.drop(["AnimalGender", "Borough"], axis=1)

dogs_df.head()

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,1,1,1753,SHADOW,1,Beagle,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,2,2,2415,ROCCO,10,Boxer,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,3,3,3328,LUIGI,9,Maltese,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,4,4,7537,PETUNIA,8,Pug,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,5,5,8487,ROMEO,10,Maltese,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1


In [17]:
#get the list of object type columns

dogs_cat = dogs_df.dtypes[dogs_df.dtypes == "object"].index.tolist()

dogs_cat

['AnimalName', 'BreedName', 'NTA']

In [18]:
#encode the rest of the non-numerical data
le = LabelEncoder()
dogs_enc = dogs_df.copy()
dogs_enc["BreedName"] = le.fit_transform(dogs_enc["BreedName"])
dogs_enc["AnimalName"] = le.fit_transform(dogs_enc["AnimalName"])
dogs_enc["NTA"] = le.fit_transform(dogs_enc["NTA"])

dogs_enc.head(10)

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,1,1,1753,16858,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,2,2,2415,15814,10,54,11210,314.0,756.0,21,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,3,3,3328,11035,9,174,10464,210.0,516.0,58,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,4,4,7537,14433,8,226,11221,304.0,419.0,38,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,5,5,8487,15914,10,174,10451,201.0,65.0,70,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1
5,6,6,10503,2332,1,283,11225,309.0,800.0,27,40.0,9.0,20.0,2015-03-27,2016-03-29,1,2
6,7,7,11682,16383,5,226,10021,108.0,124.0,108,5.0,12.0,28.0,2015-04-06,2016-04-06,1,3
7,8,8,12307,11835,5,283,10305,502.0,74.0,180,50.0,11.0,23.0,2015-04-10,2016-03-30,2,5
8,9,9,12652,16104,4,54,11220,310.0,34.0,11,43.0,11.0,22.0,2015-04-13,2018-03-06,2,2
9,10,10,13839,10487,1,28,10468,207.0,269.0,67,14.0,13.0,33.0,2015-04-21,2016-03-24,1,1


In [21]:
#drop the last rows
dogs_enc = dogs_enc.drop(["Unnamed: 0", "X"], axis=1)
dogs_enc

Unnamed: 0,RowNumber,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,1753,16858,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,2415,15814,10,54,11210,314.0,756.0,21,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,3328,11035,9,174,10464,210.0,516.0,58,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,7537,14433,8,226,11221,304.0,419.0,38,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,8487,15914,10,174,10451,201.0,65.0,70,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121857,122225,15853,6,52,10022,106.0,98.0,99,4.0,12.0,28.0,2016-12-31,2018-12-31,1,3
121858,122226,16069,10,89,11219,311.0,250.0,8,38.0,10.0,22.0,2016-12-31,2018-09-18,2,2
121859,122227,19543,5,128,10312,503.0,17008.0,190,51.0,11.0,24.0,2016-12-31,2017-12-31,2,5
121860,122228,19620,12,297,10455,201.0,79.0,70,8.0,15.0,29.0,2016-12-31,2017-06-06,2,1


In [40]:
#define the features
#NOTE: many of the features directly relate to where the dog is from (ie ZIP code, districts, etc), I've removed these to make it harder for the model to guess
X = dogs_enc.copy()
X = X[["AnimalName", "AnimalBirthMonth", "BreedName", "gender_enc"]]
y = dogs_enc["Borough_enc"]

In [41]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [42]:
#confirm split
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(91396, 4)
(30466, 4)
(91396,)
(30466,)


In [43]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [44]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [45]:
#first model - simple decision tree
model = tree.DecisionTreeClassifier()

In [46]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [47]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

predictions

array([4, 3, 3, ..., 3, 3, 3], dtype=int64)

In [48]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

[[ 788  733  835  589  247]
 [ 815 2755 2292 1302  568]
 [ 924 2337 4935 1660  717]
 [ 683 1433 1763 1872  437]
 [ 279  660  794  469  579]]


In [49]:
# Displaying results of the first model
print("Confusion Matrix")
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


array([[ 788,  733,  835,  589,  247],
       [ 815, 2755, 2292, 1302,  568],
       [ 924, 2337, 4935, 1660,  717],
       [ 683, 1433, 1763, 1872,  437],
       [ 279,  660,  794,  469,  579]], dtype=int64)

Accuracy Score : 0.35872776209545065
Classification Report
              precision    recall  f1-score   support

           1       0.23      0.25      0.24      3192
           2       0.35      0.36      0.35      7732
           3       0.46      0.47      0.47     10573
           4       0.32      0.30      0.31      6188
           5       0.23      0.21      0.22      2781

    accuracy                           0.36     30466
   macro avg       0.32      0.32      0.32     30466
weighted avg       0.36      0.36      0.36     30466



In [50]:
#create random forest classifier for second model
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [51]:
#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [52]:
#make predictions using the data
predictions_rf = rf_model.predict(X_test_scaled)

predictions_rf

array([4, 3, 3, ..., 3, 3, 3], dtype=int64)

In [53]:
#calculate the accuracy score
acc_score = accuracy_score(y_test, predictions_rf)
#calculate the confusion matrix
cm = confusion_matrix(y_test, predictions)

#display the results
print("Confusion Matrix")
print(cm)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix
[[ 788  733  835  589  247]
 [ 815 2755 2292 1302  568]
 [ 924 2337 4935 1660  717]
 [ 683 1433 1763 1872  437]
 [ 279  660  794  469  579]]
Accuracy Score: 0.36772139434123285
Classification Report
              precision    recall  f1-score   support

           1       0.23      0.25      0.24      3192
           2       0.35      0.36      0.35      7732
           3       0.46      0.47      0.47     10573
           4       0.32      0.30      0.31      6188
           5       0.23      0.21      0.22      2781

    accuracy                           0.36     30466
   macro avg       0.32      0.32      0.32     30466
weighted avg       0.36      0.36      0.36     30466

