In [1]:
#import dependencies
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

#sklearn dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
file_path = "Resources/NYC_dogs_clean_forclass.csv"

dogs_df = pd.read_csv(file_path)

dogs_df.head()

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,0,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30
1,1,ROCCO,M,10,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30
2,2,LUIGI,M,9,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02
3,3,PETUNIA,F,8,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28
4,4,ROMEO,M,10,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09


In [3]:
dogs_df.dtypes

Unnamed: 0                   int64
AnimalName                  object
AnimalGender                object
AnimalBirthMonth             int64
BreedName                   object
Borough                     object
ZipCode                      int64
CommunityDistrict          float64
CensusTract2010            float64
NTA                         object
CityCouncilDistrict        float64
CongressionalDistrict      float64
StateSenatorialDistrict    float64
LicenseIssuedDate           object
LicenseExpiredDate          object
dtype: object

In [4]:
dogs_df.nunique()

Unnamed: 0                 117624
AnimalName                  20726
AnimalGender                    2
AnimalBirthMonth               12
BreedName                     299
Borough                         5
ZipCode                       339
CommunityDistrict              65
CensusTract2010              1308
NTA                           192
CityCouncilDistrict            51
CongressionalDistrict          13
StateSenatorialDistrict        26
LicenseIssuedDate             842
LicenseExpiredDate           2212
dtype: int64

In [5]:
#convert date columns to datetime
dogs_df["LicenseIssuedDate"] = pd.to_datetime(dogs_df["LicenseIssuedDate"])
dogs_df["LicenseExpiredDate"] = pd.to_datetime(dogs_df["LicenseExpiredDate"])
dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117624 entries, 0 to 117623
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Unnamed: 0               117624 non-null  int64         
 1   AnimalName               117389 non-null  object        
 2   AnimalGender             117624 non-null  object        
 3   AnimalBirthMonth         117624 non-null  int64         
 4   BreedName                117624 non-null  object        
 5   Borough                  117624 non-null  object        
 6   ZipCode                  117624 non-null  int64         
 7   CommunityDistrict        114516 non-null  float64       
 8   CensusTract2010          114516 non-null  float64       
 9   NTA                      114516 non-null  object        
 10  CityCouncilDistrict      114516 non-null  float64       
 11  CongressionalDistrict    114516 non-null  float64       
 12  StateSenatorialD

In [6]:
#convert animal brith month column to month only
dogs_df["AnimalBirthMonth"] = pd.DatetimeIndex(dogs_df['AnimalBirthMonth']).month

dogs_df.head(10)

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,0,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30
1,1,ROCCO,M,1,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30
2,2,LUIGI,M,1,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02
3,3,PETUNIA,F,1,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28
4,4,ROMEO,M,1,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09
5,5,BRANDY,M,1,Unknown,Brooklyn,11225,309.0,800.0,BK60,40.0,9.0,20.0,2015-03-27,2016-03-29
6,6,SAM,M,1,Pug,Manhattan,10021,108.0,124.0,MN31,5.0,12.0,28.0,2015-04-06,2016-04-06
7,7,MAY,F,1,Unknown,Staten Island,10305,502.0,74.0,SI14,50.0,11.0,23.0,2015-04-10,2016-03-30
8,8,RUBY,F,1,Boxer,Brooklyn,11220,310.0,34.0,BK31,43.0,11.0,22.0,2015-04-13,2018-03-06
9,9,LEO,M,1,Beagle,Bronx,10468,207.0,269.0,BX30,14.0,13.0,33.0,2015-04-21,2016-03-24


In [7]:
#what are the genders for dogs
dogs_df["AnimalGender"].value_counts()

M    64177
F    53447
Name: AnimalGender, dtype: int64

In [8]:
# create a dictionary for gender
gender_enc = {
    "M" : 1,
    "F" : 2,
    " " : 3
}

#create a dictionary for Borough
borough_enc = {
    "Bronx" : 1,
    "Brooklyn" : 2,
    "Manhattan" : 3,
    "Queens" : 4,
    "Staten Island" : 5
}

In [9]:
#ecode the boroughs and the genders
dogs_df["gender_enc"] = dogs_df["AnimalGender"].apply(lambda x: gender_enc[x])
dogs_df["Borough_enc"] = dogs_df["Borough"].apply(lambda x: borough_enc[x])

dogs_df = dogs_df.drop(["AnimalGender", "Borough"], axis=1)

dogs_df.head()

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,0,SHADOW,1,Beagle,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,1,ROCCO,1,Boxer,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,2,LUIGI,1,Maltese,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,3,PETUNIA,1,Pug,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,4,ROMEO,1,Maltese,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1


In [10]:
#get the list of object type columns

dogs_cat = dogs_df.dtypes[dogs_df.dtypes == "object"].index.tolist()

dogs_cat

['AnimalName', 'BreedName', 'NTA']

In [11]:
#encode the rest of the non-numerical data
le = LabelEncoder()
dogs_enc = dogs_df.copy()
dogs_enc["BreedName"] = le.fit_transform(dogs_enc["BreedName"])
dogs_enc["AnimalName"] = le.fit_transform(dogs_enc["AnimalName"])
dogs_enc["NTA"] = le.fit_transform(dogs_enc["NTA"])

dogs_enc.head(10)

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,0,16857,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,1,15813,1,54,11210,314.0,756.0,21,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,2,11035,1,174,10464,210.0,516.0,58,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,3,14432,1,226,11221,304.0,419.0,38,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,4,15913,1,174,10451,201.0,65.0,70,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1
5,5,2332,1,283,11225,309.0,800.0,27,40.0,9.0,20.0,2015-03-27,2016-03-29,1,2
6,6,16382,1,226,10021,108.0,124.0,108,5.0,12.0,28.0,2015-04-06,2016-04-06,1,3
7,7,11835,1,283,10305,502.0,74.0,180,50.0,11.0,23.0,2015-04-10,2016-03-30,2,5
8,8,16103,1,54,11220,310.0,34.0,11,43.0,11.0,22.0,2015-04-13,2018-03-06,2,2
9,9,10487,1,28,10468,207.0,269.0,67,14.0,13.0,33.0,2015-04-21,2016-03-24,1,1


In [13]:
#drop the last rows
dogs_enc = dogs_enc.drop(["Unnamed: 0"], axis=1)
dogs_enc

Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,16857,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,15813,1,54,11210,314.0,756.0,21,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,11035,1,174,10464,210.0,516.0,58,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,14432,1,226,11221,304.0,419.0,38,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,15913,1,174,10451,201.0,65.0,70,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117619,15852,1,52,10022,106.0,98.0,99,4.0,12.0,28.0,2016-12-31,2018-12-31,1,3
117620,16068,1,89,11219,311.0,250.0,8,38.0,10.0,22.0,2016-12-31,2018-09-18,2,2
117621,19541,1,128,10312,503.0,17008.0,190,51.0,11.0,24.0,2016-12-31,2017-12-31,2,5
117622,19618,1,297,10455,201.0,79.0,70,8.0,15.0,29.0,2016-12-31,2017-06-06,2,1


In [14]:
#define the features
#NOTE: many of the features directly relate to where the dog is from (ie ZIP code, districts, etc), I've removed these to make it harder for the model to guess
X = dogs_enc.copy()
X = X[["AnimalName", "AnimalBirthMonth", "BreedName", "gender_enc"]]
y = dogs_enc["Borough_enc"]

In [15]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [16]:
#confirm split
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(88218, 4)
(29406, 4)
(88218,)
(29406,)


In [17]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [18]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
#first model - simple decision tree
model = tree.DecisionTreeClassifier()

In [20]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

predictions

array([3, 2, 2, ..., 5, 2, 3], dtype=int64)

In [22]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

[[ 753  824  792  558  185]
 [ 809 2486 2289 1196  421]
 [ 920 2475 4859 1482  554]
 [ 721 1515 1843 1597  395]
 [ 315  720  864  459  374]]


In [23]:
# Displaying results of the first model
print("Confusion Matrix")
display(cm)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


array([[ 753,  824,  792,  558,  185],
       [ 809, 2486, 2289, 1196,  421],
       [ 920, 2475, 4859, 1482,  554],
       [ 721, 1515, 1843, 1597,  395],
       [ 315,  720,  864,  459,  374]], dtype=int64)

Accuracy Score : 0.34241311297014215
Classification Report
              precision    recall  f1-score   support

           1       0.21      0.24      0.23      3112
           2       0.31      0.35      0.33      7201
           3       0.46      0.47      0.46     10290
           4       0.30      0.26      0.28      6071
           5       0.19      0.14      0.16      2732

    accuracy                           0.34     29406
   macro avg       0.30      0.29      0.29     29406
weighted avg       0.34      0.34      0.34     29406



In [24]:
#create random forest classifier for second model
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [25]:
#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [26]:
#make predictions using the data
predictions_rf = rf_model.predict(X_test_scaled)

predictions_rf

array([3, 2, 2, ..., 3, 2, 3], dtype=int64)

In [28]:
#calculate the accuracy score
acc_score = accuracy_score(y_test, predictions_rf)
#calculate the confusion matrix
cm = confusion_matrix(y_test, predictions_rf)

#display the results
print("Confusion Matrix")
print(cm)
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions_rf))

Confusion Matrix
[[ 628  788  868  619  209]
 [ 623 2327 2470 1315  466]
 [ 672 2165 5130 1672  651]
 [ 539 1359 1945 1788  440]
 [ 239  644  899  525  425]]
Accuracy Score: 0.35020063932530776
Classification Report
              precision    recall  f1-score   support

           1       0.23      0.20      0.22      3112
           2       0.32      0.32      0.32      7201
           3       0.45      0.50      0.47     10290
           4       0.30      0.29      0.30      6071
           5       0.19      0.16      0.17      2732

    accuracy                           0.35     29406
   macro avg       0.30      0.29      0.30     29406
weighted avg       0.34      0.35      0.35     29406

