# Dataset Information  
Name: DOHMH Dog Bite Data  
Author: New York City Department of Health and Mental Hygiene (NYC DOHMH)  
Source: https://data.cityofnewyork.us/Health/DOHMH-Dog-Bite-Data/rsgh-akpg/about_data  
Accessed: 2024 November 2  
Method of Data Collection:  
* Reports received online, mail, fax or by phone to 311
* NYC DOHMH Animal Bite Unit

# Feature Information
<table style='margin-left: auto; margin-right: auto'>
    <tr>
        <th colspan='3'> DOHMH Dog Bite Data </th>
    </tr>
    <tr>
        <th> Column Name </th>
        <th> Description </th>
        <th> Data Type </th>
    </tr>
    <tr>
        <td> UniqueID </td>
        <td> Unique dog bite case identifier </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> DateOfBite </td>
        <td> Date bitten </td>
        <td> Floating Timestamp </td>
    </tr>
    <tr>
        <td> Species </td>
        <td> Animal Type (Dog) </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> Breed </td>
        <td> Breed type </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> Age </td>
        <td> Dog's age at time of bite. Numbers with 'M' indicate months. </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> Gender </td>
        <td> Sex of Dog. M=Male, F=Female, U=Unknown </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> SpayNeuter </td>
        <td> Surgical removal of dog's reproductive organs. True (reported to DOHMH as Spayed or Neutered), False (Unknown or Not Spayed or Neutered) </td>
        <td> Boolean </td>
    </tr>
    <tr>
        <td> Borough </td>
        <td> Dog bite Borough. 'Other' indicates that the bite took place outside New York City </td>
        <td> Text </td>
    </tr>
    <tr>
        <td> ZipCode </td>
        <td> Dog bite Zipcode. Blank ZipCode indicates that information was not available </td>
        <td> Text </td>
    </tr>
</table>

# Import and Initial Cleaning

In [125]:
# libraries
import pandas as pd
from utils import breed_mapping, useless_breed_words

In [126]:
# load the raw DOHMH dog bite export into a DataFrame
raw_csv_path = '../data/raw/DOHMH_Dog_Bite_Data_20241102.csv'
raw = pd.read_csv(raw_csv_path)

# preview the first rows
raw.head()

Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0


In [127]:
# multi-word source columns and their snake_case replacements
snake_case = {
    'UniqueID': 'unique_id',
    'DateOfBite': 'date_of_bite',
    'SpayNeuter': 'spay_neuter',
    'ZipCode': 'zip_code',
}

# build init_cleaned as a renamed copy of raw (raw itself is left untouched):
# first apply the explicit mapping, then lower-case every column name
init_cleaned = raw.rename(columns=snake_case).rename(columns=str.lower)

# preview
init_cleaned.head()

Unnamed: 0,unique_id,date_of_bite,species,breed,age,gender,spay_neuter,borough,zip_code
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0


In [128]:
# columns removed before any further processing:
#   unique_id - not useful
#   species   - only has one value (dog)
#   age       - too many missing values
#   gender    - too many missing values
columns_to_drop = ['unique_id', 'species', 'age', 'gender']
init_cleaned = init_cleaned.drop(columns_to_drop, axis='columns')

# preview
init_cleaned.head()

Unnamed: 0,date_of_bite,breed,spay_neuter,borough,zip_code
0,January 01 2018,UNKNOWN,False,Brooklyn,11220.0
1,January 04 2018,UNKNOWN,False,Brooklyn,
2,January 06 2018,Pit Bull,False,Brooklyn,11224.0
3,January 08 2018,Mixed/Other,False,Brooklyn,11231.0
4,January 09 2018,Pit Bull,False,Brooklyn,11224.0


In [129]:
# convert date_of_bite to datetime
# (no explicit format is given, so pandas infers it from the strings)
init_cleaned['date_of_bite'] = pd.to_datetime(init_cleaned['date_of_bite'])

# convert spay_neuter to boolean
init_cleaned['spay_neuter'] = init_cleaned['spay_neuter'].astype('bool')

# lower case all string columns
# (only object-dtype columns are selected, so the datetime and boolean columns
# converted above are left alone)
string_columns = init_cleaned.select_dtypes(include='object').columns
init_cleaned[string_columns] = init_cleaned[string_columns].apply(lambda x: x.str.lower())

# convert all nan into None
# NOTE(review): the breed cells further down test `x is not None` to detect
# missing values, so they presumably depend on this replacement - confirm
# before changing it
init_cleaned = init_cleaned.where(pd.notnull(init_cleaned), None)

# display
init_cleaned.head()

Unnamed: 0,date_of_bite,breed,spay_neuter,borough,zip_code
0,2018-01-01,unknown,False,brooklyn,11220.0
1,2018-01-04,unknown,False,brooklyn,
2,2018-01-06,pit bull,False,brooklyn,11224.0
3,2018-01-08,mixed/other,False,brooklyn,11231.0
4,2018-01-09,pit bull,False,brooklyn,11224.0


In [130]:
# number of missing entries in each column
init_cleaned.isnull().sum()

date_of_bite       0
breed           2263
spay_neuter        0
borough            0
zip_code        7167
dtype: int64

# Preprocessing Borough
Borough is handled first because some rows may be invalid.  
Rows with borough 'other' are removed because they refer to bites reported outside NYC.  

In [131]:
# build prep_borough from init_cleaned, keeping only rows whose borough is
# not 'other' ('other' marks a bite that took place outside New York City)
inside_nyc = init_cleaned['borough'].ne('other')
prep_borough = init_cleaned.loc[inside_nyc].copy()

# number of rows per remaining borough
prep_borough['borough'].value_counts()

borough
queens           6693
manhattan        6081
brooklyn         5698
bronx            4375
staten island    2140
Name: count, dtype: int64

# Preprocessing Date of Bite
Extract date values

In [132]:
# start the date stage from an independent copy of prep_borough
prep_date = prep_borough.copy(deep=True)

In [133]:
# parse the bite date once and reuse it for every derived feature instead of
# re-parsing the same column for each one
bite_date = pd.to_datetime(prep_date['date_of_bite'])

# extract date values and drop date_of_bite in a single non-inplace chain;
# day_of_week uses pandas' numbering (Monday=0 ... Sunday=6)
prep_date = prep_date.assign(
    year=bite_date.dt.year,
    month=bite_date.dt.month,
    day_of_week=bite_date.dt.dayofweek,
).drop(columns='date_of_bite')

# display
prep_date.head()

Unnamed: 0,breed,spay_neuter,borough,zip_code,year,month,day_of_week
0,unknown,False,brooklyn,11220.0,2018,1,0
1,unknown,False,brooklyn,,2018,1,3
2,pit bull,False,brooklyn,11224.0,2018,1,5
3,mixed/other,False,brooklyn,11231.0,2018,1,0
4,pit bull,False,brooklyn,11224.0,2018,1,1


# Preprocessing Zip Code
Dropping rows with missing zip code  
Mapping zip code to longitude and latitude

In [134]:
# start the zip code stage from an independent copy of prep_date
prep_zip = prep_date.copy(deep=True)

In [135]:
# percentage of rows whose zip_code is missing
missing_zip_count = prep_zip['zip_code'].isna().sum()
(missing_zip_count / len(prep_zip)) * 100

26.169608196262057

In [136]:
# keep only the rows that have a zip_code
prep_zip = prep_zip.dropna(axis=0, how='any', subset=['zip_code'])

# confirm the missing percentage is now zero
remaining_missing = prep_zip['zip_code'].isna().sum()
(remaining_missing / len(prep_zip)) * 100

0.0

In [137]:
# list the rows whose zip_code is not made up solely of numeric characters
non_numeric_zip = ~prep_zip['zip_code'].str.isnumeric()
prep_zip.loc[non_numeric_zip]

Unnamed: 0,breed,spay_neuter,borough,zip_code,year,month,day_of_week
21303,unknown,False,queens,?,2017,7,1
25122,pit bull,False,bronx,1o458,2022,9,5


In [138]:
# hand-corrections for the two non-numeric zip_code values
# '1o458' is rewritten to '10458'
typo_rows = prep_zip['zip_code'] == '1o458'
prep_zip.loc[typo_rows, 'zip_code'] = '10458'

# rows whose zip_code is '?' are removed
unknown_zip_index = prep_zip.index[prep_zip['zip_code'] == '?']
prep_zip = prep_zip.drop(index=unknown_zip_index)

In [139]:
# re-run the non-numeric check; an empty frame means no such rows remain
still_non_numeric = ~prep_zip['zip_code'].str.isnumeric()
prep_zip.loc[still_non_numeric]

Unnamed: 0,breed,spay_neuter,borough,zip_code,year,month,day_of_week


In [140]:
# load the tab-separated zip code reference table
zip_table_path = '../data/raw/2024_Gaz_zcta_national.txt'
zip_data = pd.read_csv(zip_table_path, delimiter='\t')

# preview
zip_data.head()

Unnamed: 0,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,601,166836392,798613,64.416,0.308,18.180555,-66.749961
1,602,78546711,4428428,30.327,1.71,18.361945,-67.175597
2,603,88980555,6253316,34.356,2.414,18.457399,-67.124867
3,606,114825641,12228,44.334,0.005,18.158327,-66.932928
4,610,96150194,4289688,37.124,1.656,18.295304,-67.12518


In [141]:
# normalise the headers: trim surrounding whitespace and lower-case them
zip_col_names = [name.strip().lower() for name in zip_data.columns]
zip_data.columns = zip_col_names

# keep only the zip code and coordinate columns, renamed to match prep_zip
zip_data = zip_data[['geoid', 'intptlat', 'intptlong']].rename(
    columns={
        'geoid': 'zip_code',
        'intptlat': 'latitude',
        'intptlong': 'longitude',
    }
)

# preview
zip_data.head()

Unnamed: 0,zip_code,latitude,longitude
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.457399,-67.124867
3,606,18.158327,-66.932928
4,610,18.295304,-67.12518


In [142]:
# convert zip_code to a nullable integer so it can be matched against the
# integer zip_code column of zip_data
prep_zip['zip_code'] = prep_zip['zip_code'].astype('Int64')

# map zip_code to latitude and longitude
# how='inner' is the merge default, spelled out here because it drops every
# bite whose zip_code has no entry in zip_data.
# validate='many_to_one' makes pandas raise if zip_data ever lists the same
# zip_code twice, instead of silently duplicating bite rows.
rows_before_merge = len(prep_zip)
prep_zip = prep_zip.merge(zip_data, how='inner', on='zip_code', validate='many_to_one')

# surface how many rows the inner join discarded
rows_unmatched = rows_before_merge - len(prep_zip)
print(f'{rows_unmatched} of {rows_before_merge} rows dropped: zip_code not found in zip_data')

# drop zip_code now that latitude/longitude stand in for it
prep_zip = prep_zip.drop(columns='zip_code')

# display
prep_zip.head()

Unnamed: 0,breed,spay_neuter,borough,year,month,day_of_week,latitude,longitude
0,unknown,False,brooklyn,2018,1,0,40.641026,-74.016688
1,pit bull,False,brooklyn,2018,1,5,40.577372,-73.988706
2,mixed/other,False,brooklyn,2018,1,0,40.677916,-74.005154
3,pit bull,False,brooklyn,2018,1,1,40.577372,-73.988706
4,basenji,False,brooklyn,2018,1,2,40.677916,-74.005154


# Preprocessing breed
Keep the top 5 breeds (plus a `mixed/other` bucket) for one-hot encoding.  
All other breeds, mixed breeds and missing breeds are placed in `mixed/other`.

In [143]:
# start the breed stage from an independent copy of prep_zip
prep_breed = prep_zip.copy(deep=True)

In [144]:
def _clean_breed(raw_breed):
    """Normalise one raw breed value into a single-element list.

    Parameters
    ----------
    raw_breed : str or missing value
        Lower-cased breed text; several breeds may be joined with '/'.

    Returns
    -------
    list of str
        ``[name]`` when exactly one standardised breed remains, otherwise
        ``['mixed/other']`` (missing value, or more than one distinct breed).

    Notes
    -----
    Uses ``useless_breed_words`` and ``breed_mapping`` imported from ``utils``.
    """
    # missing values (None as well as NaN) carry no breed information
    if not isinstance(raw_breed, str):
        return ['mixed/other']

    # separate the breeds on '/' only and trim each part
    parts = [part.strip() for part in raw_breed.split('/')]

    # remove useless words, in the order they are listed
    for word in useless_breed_words:
        parts = [part.replace(word, '').strip() for part in parts]

    # map breed names to standard names (unknown names are kept as they are)
    parts = [breed_mapping.get(part, part) for part in parts]

    # collapse repeats of one breed, ex: ['pit bull', 'pit bull'] to ['pit bull'];
    # this covers any number of identical parts, not just two
    if len(set(parts)) == 1:
        parts = parts[:1]

    # anything that still names more than one breed counts as a mix
    return parts if len(parts) == 1 else ['mixed/other']


# one single-element list per row, built in a single pass over the column
breed_values = prep_breed['breed'].apply(_clean_breed)

# display breed counts
breed_counts = breed_values.explode().value_counts()
breed_counts.head()

breed
mixed/other        4936
pit bull           4403
shih tzu            661
chihuahua           640
german shepherd     629
Name: count, dtype: int64

In [145]:
# top 5 named breeds based on count, plus the 'mixed/other' bucket
# 'mixed/other' is added explicitly (rather than relying on it ranking among
# the most frequent labels) because the following cells map every other breed
# to it and one-hot encode a column per entry of top_breeds
N_TOP_BREEDS = 5
named_breed_counts = breed_counts.drop('mixed/other', errors='ignore')
top_breeds = ['mixed/other'] + named_breed_counts.head(N_TOP_BREEDS).index.tolist()
top_breeds

['mixed/other',
 'pit bull',
 'shih tzu',
 'chihuahua',
 'german shepherd',
 'yorkshire terrier']

In [146]:
def _keep_top_breed(labels):
    """Replace every label that is not in top_breeds with 'mixed/other'."""
    if labels is None:
        return labels
    return [label if label in top_breeds else 'mixed/other' for label in labels]


# restrict breed labels to top_breeds; everything else becomes 'mixed/other'
breed_values = breed_values.apply(_keep_top_breed)

# preview
breed_values.head()

0    [mixed/other]
1       [pit bull]
2    [mixed/other]
3       [pit bull]
4    [mixed/other]
Name: breed, dtype: object

In [147]:
# add one boolean indicator column per entry of top_breeds
for breed_name in top_breeds:
    prep_breed[breed_name] = breed_values.apply(
        lambda labels: False if labels is None else breed_name in labels
    )

# the original text column is replaced by the indicator columns
prep_breed = prep_breed.drop('breed', axis=1)

# preview
prep_breed.head()

Unnamed: 0,spay_neuter,borough,year,month,day_of_week,latitude,longitude,mixed/other,pit bull,shih tzu,chihuahua,german shepherd,yorkshire terrier
0,False,brooklyn,2018,1,0,40.641026,-74.016688,True,False,False,False,False,False
1,False,brooklyn,2018,1,5,40.577372,-73.988706,False,True,False,False,False,False
2,False,brooklyn,2018,1,0,40.677916,-74.005154,True,False,False,False,False,False
3,False,brooklyn,2018,1,1,40.577372,-73.988706,False,True,False,False,False,False
4,False,brooklyn,2018,1,2,40.677916,-74.005154,True,False,False,False,False,False


# Export Data

In [148]:
# prep_final is an independent copy of the fully processed frame
prep_final = prep_breed.copy(deep=True)

# preview
prep_final.head()

Unnamed: 0,spay_neuter,borough,year,month,day_of_week,latitude,longitude,mixed/other,pit bull,shih tzu,chihuahua,german shepherd,yorkshire terrier
0,False,brooklyn,2018,1,0,40.641026,-74.016688,True,False,False,False,False,False
1,False,brooklyn,2018,1,5,40.577372,-73.988706,False,True,False,False,False,False
2,False,brooklyn,2018,1,0,40.677916,-74.005154,True,False,False,False,False,False
3,False,brooklyn,2018,1,1,40.577372,-73.988706,False,True,False,False,False,False
4,False,brooklyn,2018,1,2,40.677916,-74.005154,True,False,False,False,False,False


In [149]:
# replace the spaces in column names with underscores
# NOTE(review): 'mixed/other' keeps its '/', so that column name is not
# strictly snake_case - confirm whether downstream code expects it as is
prep_final.columns = [name.replace(' ', '_') for name in prep_final.columns]

In [150]:
# write the preprocessed frame to disk, leaving out the index column
processed_csv_path = '../data/processed/dog_bite_preprocessed.csv'
prep_final.to_csv(processed_csv_path, index=False)