In [1]:
#import dependencies
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np

#sklearn dependencies
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

#database
from sqlalchemy import create_engine

In [2]:
file_path = "Resources/NYC_dogs_clean_forclass.csv"

dogs_df = pd.read_csv(file_path)

dogs_df

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,0,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,2014-12-29,2016-01-30
1,1,ROCCO,M,10,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,2015-01-07,2016-01-30
2,2,LUIGI,M,9,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,2015-01-17,2016-02-02
3,3,PETUNIA,F,8,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,2015-03-01,2016-03-28
4,4,ROMEO,M,10,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,2015-03-09,2016-03-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117619,121857,ROCKY,M,6,Boston Terrier,Manhattan,10022,106.0,98.0,MN19,4.0,12.0,28.0,2016-12-31,2018-12-31
117620,121858,ROXY,F,10,"Collie, Border",Brooklyn,11219,311.0,250.0,BK28,38.0,10.0,22.0,2016-12-31,2018-09-18
117621,121859,VALENTINA,F,5,German Shepherd Crossbreed,Staten Island,10312,503.0,17008.0,SI48,51.0,11.0,24.0,2016-12-31,2017-12-31
117622,121860,VENUS,F,12,Yorkshire Terrier,Bronx,10455,201.0,79.0,BX34,8.0,15.0,29.0,2016-12-31,2017-06-06


In [3]:
dogs_df.dtypes

Unnamed: 0                   int64
AnimalName                  object
AnimalGender                object
AnimalBirthMonth             int64
BreedName                   object
Borough                     object
ZipCode                      int64
CommunityDistrict          float64
CensusTract2010            float64
NTA                         object
CityCouncilDistrict        float64
CongressionalDistrict      float64
StateSenatorialDistrict    float64
LicenseIssuedDate           object
LicenseExpiredDate          object
dtype: object

In [4]:
dogs_df.nunique()

Unnamed: 0                 117624
AnimalName                  20726
AnimalGender                    2
AnimalBirthMonth               12
BreedName                     299
Borough                         5
ZipCode                       339
CommunityDistrict              65
CensusTract2010              1308
NTA                           192
CityCouncilDistrict            51
CongressionalDistrict          13
StateSenatorialDistrict        26
LicenseIssuedDate             842
LicenseExpiredDate           2212
dtype: int64

In [5]:
#convert date columns to datetime
dogs_df["LicenseIssuedDate"] = pd.to_datetime(dogs_df["LicenseIssuedDate"])
dogs_df["LicenseExpiredDate"] = pd.to_datetime(dogs_df["LicenseExpiredDate"])
dogs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117624 entries, 0 to 117623
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   Unnamed: 0               117624 non-null  int64         
 1   AnimalName               117389 non-null  object        
 2   AnimalGender             117624 non-null  object        
 3   AnimalBirthMonth         117624 non-null  int64         
 4   BreedName                117624 non-null  object        
 5   Borough                  117624 non-null  object        
 6   ZipCode                  117624 non-null  int64         
 7   CommunityDistrict        114516 non-null  float64       
 8   CensusTract2010          114516 non-null  float64       
 9   NTA                      114516 non-null  object        
 10  CityCouncilDistrict      114516 non-null  float64       
 11  CongressionalDistrict    114516 non-null  float64       
 12  StateSenatorialD

In [6]:
#convert datetime to integer
dogs_df["LicenseIssuedDate"] = dogs_df["LicenseIssuedDate"].apply(lambda x: x.value/ 10**9)
dogs_df["LicenseExpiredDate"] = dogs_df["LicenseExpiredDate"].apply(lambda x: x.value/ 10**9)

In [7]:
#convert animal brith month column to month only
dogs_df["AnimalBirthMonth"] = pd.DatetimeIndex(dogs_df['AnimalBirthMonth']).month

dogs_df.head(10)

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,0,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,1419811000.0,1454112000.0
1,1,ROCCO,M,1,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,1420589000.0,1454112000.0
2,2,LUIGI,M,1,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,1421453000.0,1454371000.0
3,3,PETUNIA,F,1,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,1425168000.0,1459123000.0
4,4,ROMEO,M,1,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,1425859000.0,1457482000.0
5,5,BRANDY,M,1,Unknown,Brooklyn,11225,309.0,800.0,BK60,40.0,9.0,20.0,1427414000.0,1459210000.0
6,6,SAM,M,1,Pug,Manhattan,10021,108.0,124.0,MN31,5.0,12.0,28.0,1428278000.0,1459901000.0
7,7,MAY,F,1,Unknown,Staten Island,10305,502.0,74.0,SI14,50.0,11.0,23.0,1428624000.0,1459296000.0
8,8,RUBY,F,1,Boxer,Brooklyn,11220,310.0,34.0,BK31,43.0,11.0,22.0,1428883000.0,1520294000.0
9,9,LEO,M,1,Beagle,Bronx,10468,207.0,269.0,BX30,14.0,13.0,33.0,1429574000.0,1458778000.0


In [8]:
#check nan values
dogs_df.isna().sum()

Unnamed: 0                    0
AnimalName                  235
AnimalGender                  0
AnimalBirthMonth              0
BreedName                     0
Borough                       0
ZipCode                       0
CommunityDistrict          3108
CensusTract2010            3108
NTA                        3108
CityCouncilDistrict        3108
CongressionalDistrict      3108
StateSenatorialDistrict    3108
LicenseIssuedDate             0
LicenseExpiredDate            0
dtype: int64

In [9]:
#since there are over 100,000 instances, we can drop 235 nan animal name values
dogs_df = dogs_df.dropna()

dogs_df

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalGender,AnimalBirthMonth,BreedName,Borough,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate
0,0,SHADOW,M,1,Beagle,Brooklyn,11236,318.0,1014.0,BK50,46.0,8.0,19.0,1.419811e+09,1.454112e+09
1,1,ROCCO,M,1,Boxer,Brooklyn,11210,314.0,756.0,BK43,45.0,9.0,17.0,1.420589e+09,1.454112e+09
2,2,LUIGI,M,1,Maltese,Bronx,10464,210.0,516.0,BX10,13.0,14.0,34.0,1.421453e+09,1.454371e+09
3,3,PETUNIA,F,1,Pug,Brooklyn,11221,304.0,419.0,BK78,34.0,7.0,18.0,1.425168e+09,1.459123e+09
4,4,ROMEO,M,1,Maltese,Bronx,10451,201.0,65.0,BX34,17.0,15.0,32.0,1.425859e+09,1.457482e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117619,121857,ROCKY,M,1,Boston Terrier,Manhattan,10022,106.0,98.0,MN19,4.0,12.0,28.0,1.483142e+09,1.546214e+09
117620,121858,ROXY,F,1,"Collie, Border",Brooklyn,11219,311.0,250.0,BK28,38.0,10.0,22.0,1.483142e+09,1.537229e+09
117621,121859,VALENTINA,F,1,German Shepherd Crossbreed,Staten Island,10312,503.0,17008.0,SI48,51.0,11.0,24.0,1.483142e+09,1.514678e+09
117622,121860,VENUS,F,1,Yorkshire Terrier,Bronx,10455,201.0,79.0,BX34,8.0,15.0,29.0,1.483142e+09,1.496707e+09


In [10]:
#what are the genders for dogs
dogs_df["AnimalGender"].value_counts()

M    62282
F    52008
Name: AnimalGender, dtype: int64

In [11]:
# create a dictionary for gender
gender_enc = {
    "M" : 1,
    "F" : 2,
    " " : 3
}

#create a dictionary for Borough
borough_enc = {
    "Bronx" : 1,
    "Brooklyn" : 2,
    "Manhattan" : 3,
    "Queens" : 4,
    "Staten Island" : 5
}

In [12]:
#ecode the boroughs and the genders
dogs_df["gender_enc"] = dogs_df["AnimalGender"].apply(lambda x: gender_enc[x])
dogs_df["Borough_enc"] = dogs_df["Borough"].apply(lambda x: borough_enc[x])

dogs_df = dogs_df.drop(["AnimalGender", "Borough"], axis=1)

dogs_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,0,SHADOW,1,Beagle,11236,318.0,1014.0,BK50,46.0,8.0,19.0,1419811000.0,1454112000.0,1,2
1,1,ROCCO,1,Boxer,11210,314.0,756.0,BK43,45.0,9.0,17.0,1420589000.0,1454112000.0,1,2
2,2,LUIGI,1,Maltese,10464,210.0,516.0,BX10,13.0,14.0,34.0,1421453000.0,1454371000.0,1,1
3,3,PETUNIA,1,Pug,11221,304.0,419.0,BK78,34.0,7.0,18.0,1425168000.0,1459123000.0,2,2
4,4,ROMEO,1,Maltese,10451,201.0,65.0,BX34,17.0,15.0,32.0,1425859000.0,1457482000.0,1,1


In [13]:
#get the list of object type columns
#it's ok that NTA is listed as an object because it will be dropped before analysis anyway

dogs_cat = dogs_df.dtypes[dogs_df.dtypes == "object"].index.tolist()

dogs_cat

['AnimalName', 'BreedName', 'NTA']

In [14]:
#encode the rest of the non-numerical data
le = LabelEncoder()
dogs_enc = dogs_df.copy()
dogs_enc["BreedName"] = le.fit_transform(dogs_enc["BreedName"])
dogs_enc["AnimalName"] = le.fit_transform(dogs_enc["AnimalName"])
dogs_enc["NTA"] = le.fit_transform(dogs_enc["NTA"])


dogs_enc.head(10)

Unnamed: 0.1,Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,0,16555,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,1419811000.0,1454112000.0,1,2
1,1,15530,1,54,11210,314.0,756.0,21,45.0,9.0,17.0,1420589000.0,1454112000.0,1,2
2,2,10845,1,174,10464,210.0,516.0,58,13.0,14.0,34.0,1421453000.0,1454371000.0,1,1
3,3,14177,1,226,11221,304.0,419.0,38,34.0,7.0,18.0,1425168000.0,1459123000.0,2,2
4,4,15626,1,174,10451,201.0,65.0,70,17.0,15.0,32.0,1425859000.0,1457482000.0,1,1
5,5,2302,1,283,11225,309.0,800.0,27,40.0,9.0,20.0,1427414000.0,1459210000.0,1,2
6,6,16084,1,226,10021,108.0,124.0,108,5.0,12.0,28.0,1428278000.0,1459901000.0,1,3
7,7,11632,1,283,10305,502.0,74.0,180,50.0,11.0,23.0,1428624000.0,1459296000.0,2,5
8,8,15811,1,54,11220,310.0,34.0,11,43.0,11.0,22.0,1428883000.0,1520294000.0,2,2
9,9,10304,1,28,10468,207.0,269.0,67,14.0,13.0,33.0,1429574000.0,1458778000.0,1,1


In [15]:
#drop the first column
dogs_enc = dogs_enc.drop(["Unnamed: 0"], axis=1)
dogs_enc

Unnamed: 0,AnimalName,AnimalBirthMonth,BreedName,ZipCode,CommunityDistrict,CensusTract2010,NTA,CityCouncilDistrict,CongressionalDistrict,StateSenatorialDistrict,LicenseIssuedDate,LicenseExpiredDate,gender_enc,Borough_enc
0,16555,1,28,11236,318.0,1014.0,25,46.0,8.0,19.0,1.419811e+09,1.454112e+09,1,2
1,15530,1,54,11210,314.0,756.0,21,45.0,9.0,17.0,1.420589e+09,1.454112e+09,1,2
2,10845,1,174,10464,210.0,516.0,58,13.0,14.0,34.0,1.421453e+09,1.454371e+09,1,1
3,14177,1,226,11221,304.0,419.0,38,34.0,7.0,18.0,1.425168e+09,1.459123e+09,2,2
4,15626,1,174,10451,201.0,65.0,70,17.0,15.0,32.0,1.425859e+09,1.457482e+09,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117619,15568,1,52,10022,106.0,98.0,99,4.0,12.0,28.0,1.483142e+09,1.546214e+09,1,3
117620,15778,1,89,11219,311.0,250.0,8,38.0,10.0,22.0,1.483142e+09,1.537229e+09,2,2
117621,19190,1,128,10312,503.0,17008.0,190,51.0,11.0,24.0,1.483142e+09,1.514678e+09,2,5
117622,19262,1,297,10455,201.0,79.0,70,8.0,15.0,29.0,1.483142e+09,1.496707e+09,2,1


In [16]:
#define the features
#NOTE: many of the features directly relate to where the dog is from (ie ZIP code, districts, etc), I've removed these to make it harder for the model to guess
X = dogs_enc.copy()
X = X[["AnimalName", "BreedName", "gender_enc", "AnimalBirthMonth", "LicenseIssuedDate"]]
y = dogs_enc["Borough_enc"]

In [17]:
#test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [18]:
#confirm split
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(85717, 5)
(28573, 5)
(85717,)
(28573,)


In [19]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [20]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
#first model - simple decision tree
model = tree.DecisionTreeClassifier()

In [22]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [23]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

print(predictions)

[2 3 3 ... 3 4 4]


In [24]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Acutal Bronx", "Actual Brooklyn", "Actual Manhattan", "Actual Queens", "Actual Staten Island"], 
    columns=["Predicted Bronx", "Predicted Brooklyn", "Predicted Manhattan", "Predicted Queens", "Predicted Staten Island"]
)
cm_df

Unnamed: 0,Predicted Bronx,Predicted Brooklyn,Predicted Manhattan,Predicted Queens,Predicted Staten Island
Acutal Bronx,418,779,917,599,266
Actual Brooklyn,804,1824,2317,1404,677
Actual Manhattan,968,2308,3875,1985,895
Actual Queens,639,1380,1956,1353,599
Actual Staten Island,321,633,800,552,304


In [25]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [26]:
# Displaying results of the first model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted Bronx,Predicted Brooklyn,Predicted Manhattan,Predicted Queens,Predicted Staten Island
Acutal Bronx,418,779,917,599,266
Actual Brooklyn,804,1824,2317,1404,677
Actual Manhattan,968,2308,3875,1985,895
Actual Queens,639,1380,1956,1353,599
Actual Staten Island,321,633,800,552,304


Accuracy Score : 0.272075035873027
Classification Report
              precision    recall  f1-score   support

           1       0.13      0.14      0.14      2979
           2       0.26      0.26      0.26      7026
           3       0.39      0.39      0.39     10031
           4       0.23      0.23      0.23      5927
           5       0.11      0.12      0.11      2610

    accuracy                           0.27     28573
   macro avg       0.23      0.23      0.23     28573
weighted avg       0.27      0.27      0.27     28573



## Random Forest Model -- This will take some time to run!!

In [27]:
#create random forest classifier for second model
rf_model = RandomForestClassifier(n_estimators=1000, random_state=78)

In [28]:
#fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [29]:
#make predictions using the data
predictions_rf = rf_model.predict(X_test_scaled)

predictions_rf

array([2, 3, 3, ..., 3, 4, 4], dtype=int64)

In [30]:
#calculate the accuracy score
acc_score = accuracy_score(y_test, predictions_rf)
#calculate the confusion matrix
cm_rf = confusion_matrix(y_test, predictions_rf)
cm_rf_df = pd.DataFrame(
    cm_rf, index=["Acutal Bronx", "Actual Brooklyn", "Actual Manhattan", "Actual Queens", "Actual Staten Island"], 
    columns=["Predicted Bronx", "Predicted Brooklyn", "Predicted Manhattan", "Predicted Queens", "Predicted Staten Island"]
)

#display the results
print("Confusion Matrix")
print(f"Accuracy Score: {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions_rf))

Confusion Matrix
Accuracy Score: 0.3110628915409652
Classification Report
              precision    recall  f1-score   support

           1       0.16      0.09      0.12      2979
           2       0.27      0.26      0.26      7026
           3       0.40      0.55      0.46     10031
           4       0.23      0.19      0.21      5927
           5       0.12      0.06      0.08      2610

    accuracy                           0.31     28573
   macro avg       0.24      0.23      0.23     28573
weighted avg       0.28      0.31      0.29     28573



In [31]:
#print the confusion matrix database
cm_rf_df

Unnamed: 0,Predicted Bronx,Predicted Brooklyn,Predicted Manhattan,Predicted Queens,Predicted Staten Island
Acutal Bronx,283,755,1247,550,144
Actual Brooklyn,443,1799,3235,1223,326
Actual Manhattan,478,2109,5491,1534,419
Actual Queens,361,1370,2744,1154,298
Actual Staten Island,190,597,1142,520,161


In [None]:
# Postgres username, password, and database name
POSTGRES_ADDRESS = 
POSTGRES_PORT = '5439'
POSTGRES_USERNAME = 'username' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES USERNAME
POSTGRES_PASSWORD = '*****' ## CHANGE THIS TO YOUR PANOPLY/POSTGRES PASSWORD POSTGRES_DBNAME = 'database' ## CHANGE THIS TO YOUR DATABASE NAME
# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'
  .format(username=POSTGRES_USERNAME,
   password=POSTGRES_PASSWORD,
   ipaddress=POSTGRES_ADDRESS,
   port=POSTGRES_PORT,
   dbname=POSTGRES_DBNAME))
# Create the connection
cnx = create_engine(postgres_str)

create_engine() creates an engine objec

In [None]:
# export to database
# Connect to postgres/pgAdmin
sql_pgadmin = psycopg2.connect(user="postgres", password="Galile0", host="localhost", port="5432", database="dog_licenses_db")