In [1]:
#import dependencies
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

#sklearn dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Relative path of the dog-licence CSV that the rest of the notebook works from.
file_path = "Resources/NYC_dogs_clean.csv"

# Read the CSV into a DataFrame, then preview its first five rows.
dogs_df = pd.read_csv(filepath_or_buffer=file_path)
dogs_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,1,1,1753,SHADOW,M,01/01/2000 12:00:00 AM,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,12/29/2014,01/30/2016
1,2,2,2415,ROCCO,M,10/01/2011 12:00:00 AM,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,01/07/2015,01/30/2016
2,3,3,3328,LUIGI,M,09/01/2005 12:00:00 AM,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,01/17/2015,02/02/2016
3,4,4,7537,PETUNIA,F,08/01/2013 12:00:00 AM,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,03/01/2015,03/28/2016
4,5,5,8487,ROMEO,M,10/01/2008 12:00:00 AM,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,03/09/2015,03/09/2016


In [3]:
# Show the dtype pandas inferred for each column; per the output below, the
# date-like columns are still plain `object` at this point.
dogs_df.dtypes

Unnamed: 0                   int64
X                            int64
RowNumber                    int64
AnimalName                  object
AnimalGender                object
AnimalBirthMonth            object
BreedName                   object
Borough                     object
ZipCode                      int64
CommunityDistrict          float64
CensusTract2010            float64
NTA                         object
CityCouncilDistrict        float64
CongressionalDistrict      float64
StateSenatorialDistrict    float64
LicenseIssuedDate           object
LicenseExpiredDate          object
dtype: object

In [4]:
# Number of distinct values in each column.
dogs_df.nunique()

Unnamed: 0                 121862
X                          121862
RowNumber                  121862
AnimalName                  20728
AnimalGender                    3
AnimalBirthMonth              217
BreedName                     299
Borough                         5
ZipCode                       348
CommunityDistrict              65
CensusTract2010              1309
NTA                           192
CityCouncilDistrict            51
CongressionalDistrict          13
StateSenatorialDistrict        26
LicenseIssuedDate             842
LicenseExpiredDate           2212
dtype: int64

In [6]:
# Parse both licence date columns from text into pandas datetimes.
for date_col in ("LicenseIssuedDate", "LicenseExpiredDate"):
    dogs_df[date_col] = pd.to_datetime(dogs_df[date_col])

# Summarise columns, non-null counts and dtypes after the conversion.
dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121862 entries, 0 to 121861
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Unnamed: 0               121862 non-null  int64         
 1   X                        121862 non-null  int64         
 2   RowNumber                121862 non-null  int64         
 3   AnimalName               121627 non-null  object        
 4   AnimalGender             121862 non-null  object        
 5   AnimalBirthMonth         121862 non-null  object        
 6   BreedName                121862 non-null  object        
 7   Borough                  121862 non-null  object        
 8   ZipCode                  121862 non-null  int64         
 9   CommunityDistrict        118546 non-null  float64       
 10  CensusTract2010          118546 non-null  float64       
 11  NTA                      118546 non-null  object        
 12  CityCouncilDistr

In [7]:
# Convert the animal birth month column to the month number only.
# The conversion is guarded so the cell can be re-run safely: once the column
# is numeric, passing it through DatetimeIndex again would read the integers as
# nanoseconds since the epoch and silently turn every value into 1.
if not pd.api.types.is_numeric_dtype(dogs_df["AnimalBirthMonth"]):
    dogs_df["AnimalBirthMonth"] = pd.DatetimeIndex(dogs_df["AnimalBirthMonth"]).month

dogs_df.head(10)

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,1,1,1753,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30
1,2,2,2415,ROCCO,M,10,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30
2,3,3,3328,LUIGI,M,9,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02
3,4,4,7537,PETUNIA,F,8,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28
4,5,5,8487,ROMEO,M,10,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09
5,6,6,10503,BRANDY,M,1,Unknown,Brooklyn,11225,309.0,800.0,BK60,40.0,9.0,20.0,2015-03-27,2016-03-29
6,7,7,11682,SAM,M,5,Pug,Manhattan,10021,108.0,124.0,MN31,5.0,12.0,28.0,2015-04-06,2016-04-06
7,8,8,12307,MAY,F,5,Unknown,Staten Island,10305,502.0,74.0,SI14,50.0,11.0,23.0,2015-04-10,2016-03-30
8,9,9,12652,RUBY,F,4,Boxer,Brooklyn,11220,310.0,34.0,BK31,43.0,11.0,22.0,2015-04-13,2018-03-06
9,10,10,13839,LEO,M,1,Beagle,Bronx,10468,207.0,269.0,BX30,14.0,13.0,33.0,2015-04-21,2016-03-24


In [8]:
# Tally each distinct AnimalGender value. Besides M and F the output includes a
# blank category, which the gender encoding has to account for.
dogs_df["AnimalGender"].value_counts()

M    66670
F    55185
         7
Name: AnimalGender, dtype: int64

In [12]:
# Integer codes for AnimalGender, numbered from 1 in the order listed.
# The single-space key covers records whose gender was left blank.
gender_enc = {
    label: code
    for code, label in enumerate(("M", "F", " "), start=1)
}

# Integer codes for Borough, numbered from 1 in alphabetical order.
borough_enc = {
    name: code
    for code, name in enumerate(
        ("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"),
        start=1,
    )
}

In [13]:
def encode_with(values, codes):
    """Translate each entry of ``values`` to its integer code via ``codes``.

    Parameters
    ----------
    values : pandas.Series
        Column of raw category labels.
    codes : dict
        Mapping from label to integer code.

    Returns
    -------
    pandas.Series
        The codes, on the same index as ``values``.

    Raises
    ------
    KeyError
        If any entry of ``values`` has no key in ``codes``.
    """
    encoded = values.map(codes)
    # Series.map yields NaN for labels missing from the dict; fail loudly and
    # name the offending labels instead of letting NaN leak into the features.
    unmapped = values[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise KeyError(f"no code defined for: {list(unmapped)}")
    return encoded


# Encode the boroughs and the genders as new integer columns.
dogs_df["gender_enc"] = encode_with(dogs_df["AnimalGender"], gender_enc)
dogs_df["Borough_enc"] = encode_with(dogs_df["Borough"], borough_enc)

dogs_df.head()

Unnamed: 0.1,Unnamed: 0,X,RowNumber,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,1,1,1753,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30,1,2
1,2,2,2415,ROCCO,M,10,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30,1,2
2,3,3,3328,LUIGI,M,9,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02,1,1
3,4,4,7537,PETUNIA,F,8,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28,2,2
4,5,5,8487,ROMEO,M,10,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09,1,1


In [None]:
# Encode the rest of the non-numerical data.
# NOTE(review): `le` is a single encoder instance; if it is re-fitted on other
# columns further down, it will presumably only retain the classes of the last
# column fitted - confirm before relying on inverse_transform.
le = LabelEncoder()
# Work on a copy so dogs_df keeps its original text columns.
dogs_enc = dogs_df.copy()
# Overwrite BreedName in the copy with the encoder's fit_transform output.
dogs_enc["BreedName"] = le.fit_transform(dogs_enc["BreedName"])