In [85]:
import pandas as pd
import numpy as np

In [86]:
# load the large dataset into a pandas dataframe hate_crime.csv
data = pd.read_csv('hate_crime.csv', encoding='latin1')
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,pub_agency_unit,agency_type_name,state_abbr,state_name,division_name,region_name,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,,City,AR,Arkansas,West South Central,South,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,,County,AR,Arkansas,West South Central,South,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,,City,WY,Wyoming,Mountain,West,...,White,Hispanic or Latino,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
241659,1473450,2022,WY0010200,University of Wyoming,,University or College,WY,Wyoming,Mountain,West,...,White,Not Hispanic or Latino,1,Intimidation,1.0,School-College/University,Anti-Gay (Male),Individual,S,S
241660,1476554,2022,WY0190000,Sweetwater,,County,WY,Wyoming,Mountain,West,...,White,Multiple,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-Hispanic or Latino,Individual,S,S
241661,1476555,2022,WY0110100,Cheyenne,,City,WY,Wyoming,Mountain,West,...,White,Not Hispanic or Latino,1,Intimidation,1.0,Residence/Home,Anti-Black or African American,Individual,S,S


# Data Cleaning

In [87]:
# get general information about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241663 entries, 0 to 241662
Data columns (total 28 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   incident_id                   241663 non-null  int64  
 1   data_year                     241663 non-null  int64  
 2   ori                           241663 non-null  object 
 3   pug_agency_name               241663 non-null  object 
 4   pub_agency_unit               7189 non-null    object 
 5   agency_type_name              241663 non-null  object 
 6   state_abbr                    241663 non-null  object 
 7   state_name                    241663 non-null  object 
 8   division_name                 241663 non-null  object 
 9   region_name                   241663 non-null  object 
 10  population_group_code         241108 non-null  object 
 11  population_group_description  241108 non-null  object 
 12  incident_date                 241663 non-nul

In [88]:
# find the amount of missing values in each column
data.isnull().sum()

incident_id                          0
data_year                            0
ori                                  0
pug_agency_name                      0
pub_agency_unit                 234474
agency_type_name                     0
state_abbr                           0
state_name                           0
division_name                        0
region_name                          0
population_group_code              555
population_group_description       555
incident_date                        0
adult_victim_count              170538
juvenile_victim_count           172978
total_offender_count                 0
adult_offender_count            177148
juvenile_offender_count         177155
offender_race                        0
offender_ethnicity                   0
victim_count                         0
offense_name                         0
total_individual_victims          4859
location_name                        0
bias_desc                            0
victim_types             

# The following columns will be dropped:
1. pub_agency_unit - because it has 234474 missing values
2. adult_victim_count - because it has 170538 missing values
3. juvenile_victim_count - because it has 172978 missing values
4. adult_offender_count - because it has 177148 missing values
5. juvenile_offender_count - because it has 177155 missing values

In [89]:
# drop the columns with missing values
data = data.drop(['pub_agency_unit', 'adult_victim_count', 'juvenile_victim_count', 'adult_offender_count', 'juvenile_offender_count'], axis=1)
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,County,AR,Arkansas,West South Central,South,8D,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,City,WY,Wyoming,Mountain,West,5,...,White,Hispanic or Latino,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
241659,1473450,2022,WY0010200,University of Wyoming,University or College,WY,Wyoming,Mountain,West,7,...,White,Not Hispanic or Latino,1,Intimidation,1.0,School-College/University,Anti-Gay (Male),Individual,S,S
241660,1476554,2022,WY0190000,Sweetwater,County,WY,Wyoming,Mountain,West,8D,...,White,Multiple,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-Hispanic or Latino,Individual,S,S
241661,1476555,2022,WY0110100,Cheyenne,City,WY,Wyoming,Mountain,West,3,...,White,Not Hispanic or Latino,1,Intimidation,1.0,Residence/Home,Anti-Black or African American,Individual,S,S


The following records with missing values will be dropped:
1. population_group_code - because it has 555 missing values
2. population_group_description - because it has 555 missing values
3. total_individual_victims - because it has 4859 missing values

The records with missing values will be dropped because they are few compared to the total number of records in the dataset.

In [90]:
# drop the records with missing values
data = data.dropna()
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,offender_race,offender_ethnicity,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,Black or African American,Not Specified,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,County,AR,Arkansas,West South Central,South,8D,...,White,Not Specified,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,City,WY,Wyoming,Mountain,West,5,...,White,Hispanic or Latino,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
241659,1473450,2022,WY0010200,University of Wyoming,University or College,WY,Wyoming,Mountain,West,7,...,White,Not Hispanic or Latino,1,Intimidation,1.0,School-College/University,Anti-Gay (Male),Individual,S,S
241660,1476554,2022,WY0190000,Sweetwater,County,WY,Wyoming,Mountain,West,8D,...,White,Multiple,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-Hispanic or Latino,Individual,S,S
241661,1476555,2022,WY0110100,Cheyenne,City,WY,Wyoming,Mountain,West,3,...,White,Not Hispanic or Latino,1,Intimidation,1.0,Residence/Home,Anti-Black or African American,Individual,S,S


In [91]:
# find the info for the offender_ethnicity column
data['offender_ethnicity'].value_counts()

offender_ethnicity
Not Specified             201698
Unknown                    17396
Not Hispanic or Latino     13174
Hispanic or Latino          3192
Multiple                     829
Name: count, dtype: int64

The offender ethnicity column can be dropped because it mostly contains missing values and won't be used in the analysis.

In [92]:
# drop the offender_ethnicity column
data = data.drop(['offender_ethnicity'], axis=1)
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,total_offender_count,offender_race,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,1,Aggravated Assault,1.0,Residence/Home,Anti-Black or African American,Individual,S,S
1,44,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,M,S
2,45,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,Anti-White,Individual,M,S
3,46,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,2,Black or African American,1,Intimidation,1.0,Residence/Home,Anti-White,Individual,S,S
4,47,1991,AR0670000,Sevier,County,AR,Arkansas,West South Central,South,8D,...,1,White,1,Intimidation,1.0,School/College,Anti-Black or African American,Individual,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,City,WY,Wyoming,Mountain,West,5,...,1,White,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-White,Individual,S,S
241659,1473450,2022,WY0010200,University of Wyoming,University or College,WY,Wyoming,Mountain,West,7,...,1,White,1,Intimidation,1.0,School-College/University,Anti-Gay (Male),Individual,S,S
241660,1476554,2022,WY0190000,Sweetwater,County,WY,Wyoming,Mountain,West,8D,...,2,White,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,Anti-Hispanic or Latino,Individual,S,S
241661,1476555,2022,WY0110100,Cheyenne,City,WY,Wyoming,Mountain,West,3,...,8,White,1,Intimidation,1.0,Residence/Home,Anti-Black or African American,Individual,S,S


In [93]:
# output the info for the bias_desc column
data['bias_desc'].value_counts()

bias_desc
Anti-Black or African American                                               80210
Anti-Jewish                                                                  27792
Anti-White                                                                   27008
Anti-Gay (Male)                                                              23659
Anti-Hispanic or Latino                                                      15374
                                                                             ...  
Anti-Black or African American;Anti-Gay (Male);Anti-Islamic (Muslim)             1
Anti-Hindu;Anti-Islamic (Muslim);Anti-Other Race/Ethnicity/Ancestry              1
Anti-Gay (Male);Anti-Hispanic or Latino;Anti-Male                                1
Anti-Jewish;Anti-Mental Disability;Anti-White                                    1
Anti-American Indian or Alaska Native;Anti-Female;Anti-Hispanic or Latino        1
Name: count, Length: 345, dtype: int64

There are over 300 unique values in the bias_desc column. The column consists of one or more categories separated by a comma or semicolon. The column will converted into a list of categories.

In [94]:
# remove the special characters from the bias_desc column
data['bias_desc'] = data['bias_desc'].str.replace('(', '')
data['bias_desc'] = data['bias_desc'].str.replace(')', '')

# convert the bias_desc column into a list of categories
data['bias_desc'] = data['bias_desc'].str.replace(';', ',').str.split(',')
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,total_offender_count,offender_race,victim_count,offense_name,total_individual_victims,location_name,bias_desc,victim_types,multiple_offense,multiple_bias
0,43,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,1,Aggravated Assault,1.0,Residence/Home,[Anti-Black or African American],Individual,S,S
1,44,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,2,Aggravated Assault;Destruction/Damage/Vandalis...,1.0,Highway/Road/Alley/Street/Sidewalk,[Anti-White],Individual,M,S
2,45,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,1,Black or African American,2,Aggravated Assault;Murder and Nonnegligent Man...,2.0,Residence/Home,[Anti-White],Individual,M,S
3,46,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,2,Black or African American,1,Intimidation,1.0,Residence/Home,[Anti-White],Individual,S,S
4,47,1991,AR0670000,Sevier,County,AR,Arkansas,West South Central,South,8D,...,1,White,1,Intimidation,1.0,School/College,[Anti-Black or African American],Individual,S,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,City,WY,Wyoming,Mountain,West,5,...,1,White,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,[Anti-White],Individual,S,S
241659,1473450,2022,WY0010200,University of Wyoming,University or College,WY,Wyoming,Mountain,West,7,...,1,White,1,Intimidation,1.0,School-College/University,[Anti-Gay Male],Individual,S,S
241660,1476554,2022,WY0190000,Sweetwater,County,WY,Wyoming,Mountain,West,8D,...,2,White,2,Simple Assault,2.0,Highway/Road/Alley/Street/Sidewalk,[Anti-Hispanic or Latino],Individual,S,S
241661,1476555,2022,WY0110100,Cheyenne,City,WY,Wyoming,Mountain,West,3,...,8,White,1,Intimidation,1.0,Residence/Home,[Anti-Black or African American],Individual,S,S


In [95]:
# one hot encode the bias_desc column
def one_hot_encode(column):
    #transforms elements in lists to rows
    types = column.explode() 
    
    # returns the frequency of each type for each index
    oneHotEncode = pd.crosstab(types.index, types)
    
    return oneHotEncode

In [96]:
one_hot_bias_desc = one_hot_encode(data['bias_desc'])
one_hot_bias_desc

bias_desc,Bisexual,Gay,Greek,Group,Other,or Transgender Mixed Group,Anti-American Indian or Alaska Native,Anti-Arab,Anti-Asian,Anti-Atheism/Agnosticism,...,Anti-Multiple Religions,Anti-Native Hawaiian or Other Pacific Islander,Anti-Other Christian,Anti-Other Race/Ethnicity/Ancestry,Anti-Other Religion,Anti-Physical Disability,Anti-Protestant,Anti-Sikh,Anti-Transgender,Anti-White
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
241659,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241660,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
241661,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# merge the one hot encoded bias_desc column with the original dataset
data = pd.concat([data, one_hot_bias_desc], axis=1)
data

Unnamed: 0,incident_id,data_year,ori,pug_agency_name,agency_type_name,state_abbr,state_name,division_name,region_name,population_group_code,...,Anti-Multiple Religions,Anti-Native Hawaiian or Other Pacific Islander,Anti-Other Christian,Anti-Other Race/Ethnicity/Ancestry,Anti-Other Religion,Anti-Physical Disability,Anti-Protestant,Anti-Sikh,Anti-Transgender,Anti-White
0,43,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,0,0,0,0,0,0,0,0,0,0
1,44,1991,AR0350100,Pine Bluff,City,AR,Arkansas,West South Central,South,3,...,0,0,0,0,0,0,0,0,0,1
2,45,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,0,0,0,0,0,0,0,0,0,1
3,46,1991,AR0600300,North Little Rock,City,AR,Arkansas,West South Central,South,3,...,0,0,0,0,0,0,0,0,0,1
4,47,1991,AR0670000,Sevier,County,AR,Arkansas,West South Central,South,8D,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241658,1473449,2022,WY0190200,Rock Springs,City,WY,Wyoming,Mountain,West,5,...,0,0,0,0,0,0,0,0,0,1
241659,1473450,2022,WY0010200,University of Wyoming,University or College,WY,Wyoming,Mountain,West,7,...,0,0,0,0,0,0,0,0,0,0
241660,1476554,2022,WY0190000,Sweetwater,County,WY,Wyoming,Mountain,West,8D,...,0,0,0,0,0,0,0,0,0,0
241661,1476555,2022,WY0110100,Cheyenne,City,WY,Wyoming,Mountain,West,3,...,0,0,0,0,0,0,0,0,0,0
