# Dataset Information  
Name: DOHMH Dog Bite Data  
Author: New York City Department of Health and Mental Hygiene (NYC DOHMH)  
Source: https://data.cityofnewyork.us/Health/DOHMH-Dog-Bite-Data/rsgh-akpg/about_data  
Accessed: 2024 November 2  
Method of Data Collection:  
* Reports received online, mail, fax or by phone to 311
* NYC DOHMH Animal Bite Unit

# Feature Information
<table style='margin-left: auto; margin-right: auto'>
    <tr>
        <th colspan='3'> DOHMH Dog Bite Data </th>
    <tr>
    <tr>
        <th> Column Name </th>
        <th> Description </th>
        <th> Data Type </th>
    </tr>
    <tr>
        <td> UniqueID </th>
        <td> Unique dog bite case identifier </th>
        <td> Text </th>
    </tr>
    <tr>
        <td> DateOfBite </th>
        <td> Date bitten </th>
        <td> Floating Timestamp </th>
    </tr>
    <tr>
        <td> Species </th>
        <td> Animal Type (Dog) </th>
        <td> Text </th>
    </tr>
    <tr>
        <td> Breed </th>
        <td> Breed type </th>
        <td> Text </th>
    </tr>
        <tr>
        <td> Age </th>
        <td> Dog's age at time of bite. Numbers with 'M' indicate months. </th>
        <td> Text </th>
    </tr>
    <tr>
        <td> Gender </th>
        <td> Sex of Dog. M=Male, F=Female, U=Unknown </th>
        <td> Text </th>
    </tr>
    <tr>
        <td> SpayNeuter </th>
        <td> Surgical removal of dog's reproductive organs. True (reported to DOHMH as Spayed or Neutered), False (Unknown or Not Spayed or Neutered) </th>
        <td> Boolean </th>
    </tr>
    <tr>
        <td> Borough </th>
        <td> Dog bite Borough. Other' indicates that the bite took place outside New York City </th>
        <td> Text </th>
    </tr>
    <tr>
        <td> ZipCode </th>
        <td> Dog bite Zipcode. Blank ZipCode indicates that information was not available </th>
        <td> Text </th>
    </tr>
</table>

# Import and Initializing Cleaning

In [1]:
# libraries
import pandas as pd
import numpy as np

In [2]:
# import data
raw = pd.read_csv('../data/raw/DOHMH_Dog_Bite_Data_20241102.csv')

# initiliaze cleaned data
cleaned = raw.copy()

# display
raw.head()

Unnamed: 0,UniqueID,DateOfBite,Species,Breed,Age,Gender,SpayNeuter,Borough,ZipCode
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0


In [3]:
# snake case column names
snake_case = {
    'UniqueID': 'unique_id',
    'DateOfBite': 'date_of_bite',
    'SpayNeuter': 'spay_neuter',
    'ZipCode': 'zip_code',
}

cleaned.rename(columns=snake_case, inplace=True)
cleaned.rename(columns=str.lower, inplace=True)

# display
cleaned.head()

Unnamed: 0,unique_id,date_of_bite,species,breed,age,gender,spay_neuter,borough,zip_code
0,1,January 01 2018,DOG,UNKNOWN,,U,False,Brooklyn,11220.0
1,2,January 04 2018,DOG,UNKNOWN,,U,False,Brooklyn,
2,3,January 06 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0
3,4,January 08 2018,DOG,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,5,January 09 2018,DOG,Pit Bull,,U,False,Brooklyn,11224.0


In [4]:
# drop columns
# UniqueID: not useful
# Species: only has one value (dog)
cleaned = cleaned.drop(columns=['unique_id', 'species'])
    
# display
cleaned.head()

Unnamed: 0,date_of_bite,breed,age,gender,spay_neuter,borough,zip_code
0,January 01 2018,UNKNOWN,,U,False,Brooklyn,11220.0
1,January 04 2018,UNKNOWN,,U,False,Brooklyn,
2,January 06 2018,Pit Bull,,U,False,Brooklyn,11224.0
3,January 08 2018,Mixed/Other,4.0,M,False,Brooklyn,11231.0
4,January 09 2018,Pit Bull,,U,False,Brooklyn,11224.0


In [5]:
# convert date_of_bite to datetime
cleaned['date_of_bite'] = pd.to_datetime(cleaned['date_of_bite'])

# convert spay_neuter to boolean
cleaned['spay_neuter'] = cleaned['spay_neuter'].astype('bool')

# lower case all string columns
string_columns = cleaned.select_dtypes(include='object').columns
cleaned[string_columns] = cleaned[string_columns].apply(lambda x: x.str.lower())

# convert all nan into None
cleaned = cleaned.where(pd.notnull(cleaned), None)

# display
cleaned.head()

Unnamed: 0,date_of_bite,breed,age,gender,spay_neuter,borough,zip_code
0,2018-01-01,unknown,,u,False,brooklyn,11220.0
1,2018-01-04,unknown,,u,False,brooklyn,
2,2018-01-06,pit bull,,u,False,brooklyn,11224.0
3,2018-01-08,mixed/other,4.0,m,False,brooklyn,11231.0
4,2018-01-09,pit bull,,u,False,brooklyn,11224.0


In [6]:
# check for missing values
cleaned.isna().sum()

date_of_bite        0
breed            2263
age             13259
gender              0
spay_neuter         0
borough             0
zip_code         7167
dtype: int64

# Wrangling breed

In [7]:
# dataframe for wrangling breed
breed_counts = cleaned['breed'].copy()

# separate breed by '/', ',' and ' '
breed_counts = breed_counts.str.split('/')
breed_counts = breed_counts.apply(lambda x: [y.strip() for y in x] if x is not None else x)

# map pit bull aliases to pit bull
pitbull_aliases = ['american pit bull', 'american pit bull terrier', 'pitbull']	
breed_counts = breed_counts.apply(lambda x: ['pit bull' if y in pitbull_aliases else y for y in x] if x is not None else x)

# map unknown aliases to unknown
unknown_aliases = ['refused to provide', 'uknown', 'un', 'unc', 'uncertain', 'unknown', 'unkown', 'unnkown', 'unsure', 'unknown']
breed_counts = breed_counts.apply(lambda x: ['other' if y in unknown_aliases else y for y in x] if x is not None else x)

# map mixed breed aliases to mixed
mix_aliases = ['mix', 'mix breed', 'mix ', 'mixed breed']
breed_counts = breed_counts.apply(lambda x: ['mixed' if y in mix_aliases else y for y in x] if x is not None else x)

# map husky aliases to husky
husky_aliases = ['siberian husky']
breed_counts = breed_counts.apply(lambda x: ['husky' if y in husky_aliases else y for y in x] if x is not None else x)

# map bull dog aliases to bull dog
bulldog_aliases = ['american bull dog', 'english bull dog', 'french bull dog', 'bull dog, french', 'bull dog, english', 'bull dog, american', 'bulldog'] 
breed_counts = breed_counts.apply(lambda x: ['bull dog' if y in bulldog_aliases else y for y in x] if x is not None else x)

# map poodle aliases to poodle
poodle_aliases = ['standard poodle', 'miniature poodle', 'toy poodle', 'poodle, standard', 'poodle, miniature', 'poodle, toy']
breed_counts = breed_counts.apply(lambda x: ['poodle' if y in poodle_aliases else y for y in x] if x is not None else x)

breed_counts = breed_counts.apply(lambda x: [y.replace(' mix', '').strip() for y in x] if x is not None else x)
breed_counts = breed_counts.apply(lambda x: [y.replace('mixed ', '').strip() for y in x] if x is not None else x)
breed_counts = breed_counts.apply(lambda x: [y.replace('mix ', '').strip() for y in x] if x is not None else x)
breed_counts = breed_counts.apply(lambda x: [y.replace(' x', '').strip() for y in x] if x is not None else x)
breed_counts = breed_counts.apply(lambda x: [y.replace(' crossbreed', '').strip() for y in x] if x is not None else x)
breed_counts = breed_counts.apply(lambda x: [y.strip() for y in x] if x is not None else x)

In [8]:
# display breed values
breed_counts.explode().value_counts().head(11)

breed
pit bull             6524
other                4066
mixed                1485
shih tzu             1020
chihuahua             964
german shepherd       859
poodle                654
bull dog              629
yorkshire terrier     603
american pit bull     568
maltese               564
Name: count, dtype: int64

In [9]:
# suppose top 10 is the following
top_10 = breed_counts.explode().value_counts().head(11).index.tolist()
top_10.remove('mixed')

top_10

['pit bull',
 'other',
 'shih tzu',
 'chihuahua',
 'german shepherd',
 'poodle',
 'bull dog',
 'yorkshire terrier',
 'american pit bull',
 'maltese']

In [10]:
# convert breed into a list
cleaned['breed'] = cleaned['breed'].apply(lambda x: [y.strip() for y in x.split('/')] if x is not None else x)

# multi-hot encode breed
for breed in top_10:
    cleaned[breed] = cleaned['breed'].apply(lambda x: breed in x if x is not None else False)

for breed in cleaned['breed']:
    if breed is not None and not any(top_10):
        cleaned['other'] = True

# drop breed column
cleaned = cleaned.drop(columns='breed')

# display
cleaned.head()

Unnamed: 0,date_of_bite,age,gender,spay_neuter,borough,zip_code,pit bull,other,shih tzu,chihuahua,german shepherd,poodle,bull dog,yorkshire terrier,american pit bull,maltese
0,2018-01-01,,u,False,brooklyn,11220.0,False,False,False,False,False,False,False,False,False,False
1,2018-01-04,,u,False,brooklyn,,False,False,False,False,False,False,False,False,False,False
2,2018-01-06,,u,False,brooklyn,11224.0,True,False,False,False,False,False,False,False,False,False
3,2018-01-08,4.0,m,False,brooklyn,11231.0,False,True,False,False,False,False,False,False,False,False
4,2018-01-09,,u,False,brooklyn,11224.0,True,False,False,False,False,False,False,False,False,False


# Wrangling age
Some records have month values instead of year. We will use months for consistency and avoiding float values, thereby improving percision and computing cost

In [11]:
# check for missing values
cleaned.isna().sum()

date_of_bite             0
age                  13259
gender                   0
spay_neuter              0
borough                  0
zip_code              7167
pit bull                 0
other                    0
shih tzu                 0
chihuahua                0
german shepherd          0
poodle                   0
bull dog                 0
yorkshire terrier        0
american pit bull        0
maltese                  0
dtype: int64

In [12]:
# check datatype of age
cleaned['age'].dtype

dtype('O')

In [13]:
# display the non numeric values in age
nonnumeric_ages = cleaned['age'][~cleaned['age'].astype(str).str.isnumeric()].value_counts()

nonnumeric_ages

age
8m            92
10m           89
11m           81
3m            77
4m            77
              ..
1 yr 8 mon     1
1yr 8mons      1
y rs           1
2 1/2          1
17w            1
Name: count, Length: 197, dtype: int64

In [14]:
year_aliases = ['years', 'year', 'yrs', 'yr', 'y', 'yo', 'y.o.', 'y.o']

def remove_year_aliases(age: str) -> str:
    if age is None:
        return age
    
    for alias in year_aliases:
        age = age.replace(alias, '').strip()
    return age

ages = cleaned['age'].copy()

# remove year aliases
cleaned_ages = ages.apply(remove_year_aliases)

# display the non numeric values in age
nonnumeric_ages = cleaned_ages[~cleaned_ages.astype(str).str.isnumeric()].value_counts()

print(nonnumeric_ages)

age
8m         92
10m        89
11m        81
3m         77
4m         77
           ..
11wks       1
10 wks      1
6mths       1
16 mths     1
17w         1
Name: count, Length: 153, dtype: int64
