In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

# File to Load
homicide_csv = Path("Resources/homicide_data.csv")

# Read homicide data file and store into Pandas DataFrame
homicide_data = pd.read_csv(homicide_csv, encoding='latin1')

In [2]:
homicide_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52179 entries, 0 to 52178
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uid            52179 non-null  object 
 1   reported_date  52179 non-null  int64  
 2   victim_last    52178 non-null  object 
 3   victim_first   52179 non-null  object 
 4   victim_race    52179 non-null  object 
 5   victim_age     52179 non-null  object 
 6   victim_sex     52179 non-null  object 
 7   city           52179 non-null  object 
 8   state          52179 non-null  object 
 9   lat            52119 non-null  float64
 10  lon            52119 non-null  float64
 11  disposition    52179 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 4.8+ MB


In [3]:
homicide_data['city'].unique()

array(['Albuquerque', 'Atlanta', 'Baltimore', 'Baton Rouge', 'Birmingham',
       'Boston', 'Buffalo', 'Charlotte', 'Chicago', 'Cincinnati',
       'Columbus', 'Dallas', 'Denver', 'Detroit', 'Durham', 'Fort Worth',
       'Fresno', 'Houston', 'Indianapolis', 'Jacksonville', 'Kansas City',
       'Las Vegas', 'Long Beach', 'Los Angeles', 'Louisville', 'Memphis',
       'Miami', 'Milwaukee', 'Minneapolis', 'Nashville', 'New Orleans',
       'New York', 'Oakland', 'Oklahoma City', 'Omaha', 'Philadelphia',
       'Phoenix', 'Pittsburgh', 'Richmond', 'San Antonio', 'Sacramento',
       'Savannah', 'San Bernardino', 'San Diego', 'San Francisco',
       'St. Louis', 'Stockton', 'Tampa', 'Tulsa', 'Washington'],
      dtype=object)

In [4]:
#Wisconsin is WI no wI
homicide_data['state'] = homicide_data['state'].replace('wI', 'WI')

In [5]:
#Some states/cities did not disclose victim names
#Since the victim names do no affect our analysis, we dropped the columns
homicide_data =  homicide_data.drop(columns = ["victim_last", "victim_first"])

In [6]:
# Replace "unknown" values with NaN
homicide_data.replace('Unknown', pd.NA, inplace=True)

# Drop rows with NaN values in any column
homicide_data.dropna(axis=0, how='any', inplace=True)

# Now, 'homicide_data' contains no data that is "Unknown"

****Important Note****

When dropping Unknown Values, Dallas, TX, Phoenix, AZ, and Kansas City, MO, where dropped entirely.
Dallas and Phoenix were dropped because it had all the age, sex, and race values as Unknown, and Kansas City only had age data.

This dropped us from an original amount of 52179 homicides to 47478 usable entries with age, sex and race.

In [7]:
homicide_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47478 entries, 0 to 52178
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   uid            47478 non-null  object 
 1   reported_date  47478 non-null  int64  
 2   victim_race    47478 non-null  object 
 3   victim_age     47478 non-null  object 
 4   victim_sex     47478 non-null  object 
 5   city           47478 non-null  object 
 6   state          47478 non-null  object 
 7   lat            47478 non-null  float64
 8   lon            47478 non-null  float64
 9   disposition    47478 non-null  object 
dtypes: float64(2), int64(1), object(7)
memory usage: 4.0+ MB


In [8]:
homicide_data.head()

Unnamed: 0,uid,reported_date,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition
0,Alb-000001,20100504,Hispanic,78,Male,Albuquerque,NM,35.095788,-106.538555,Closed without arrest
1,Alb-000002,20100216,Hispanic,17,Male,Albuquerque,NM,35.05681,-106.715321,Closed by arrest
2,Alb-000003,20100601,White,15,Female,Albuquerque,NM,35.086092,-106.695568,Closed without arrest
3,Alb-000004,20100101,Hispanic,32,Male,Albuquerque,NM,35.078493,-106.556094,Closed by arrest
4,Alb-000005,20100102,White,72,Female,Albuquerque,NM,35.130357,-106.580986,Closed without arrest


In [9]:
homicide_data.disposition = homicide_data.disposition.replace("Closed without arrest", "No Arrest")
homicide_data.disposition = homicide_data.disposition.replace("Open/No arrest", "No Arrest")
homicide_data.disposition = homicide_data.disposition.replace("Closed by arrest", "Arrest Made")

In [10]:
homicide_data.disposition.value_counts()

No Arrest      24258
Arrest Made    23220
Name: disposition, dtype: int64

In [11]:
homicide_data.columns

Index(['uid', 'reported_date', 'victim_race', 'victim_age', 'victim_sex',
       'city', 'state', 'lat', 'lon', 'disposition'],
      dtype='object')

In [12]:
specific_uid1 = homicide_data.loc[homicide_data['uid'] == 'Mia-000649']
specific_uid1

Unnamed: 0,uid,reported_date,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition


In [13]:
# Assuming 'uid' is a unique identifier in your DataFrame
chosen_uid1 = 'Mia-000649'
new_reported_date1 = '20151118'

# Locate the specific row with the given 'uid' value and update the 'reported_date'
homicide_data.loc[homicide_data['uid'] == chosen_uid1, 'reported_date'] = new_reported_date1

In [14]:
specific_uid2 = homicide_data.loc[homicide_data['uid'] == 'Mia-000652']
specific_uid2

Unnamed: 0,uid,reported_date,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition


In [15]:
# Assuming 'uid' is a unique identifier in your DataFrame
chosen_uid2 = 'Mia-000652'
new_reported_date2 = '20151105'

# Locate the specific row with the given 'uid' value and update the 'reported_date'
homicide_data.loc[homicide_data['uid'] == chosen_uid2, 'reported_date'] = new_reported_date2

In [16]:
homicide_data['reported_date'] = pd.to_datetime(homicide_data['reported_date'], format='%Y%m%d')

In [17]:
homicide_data['reported_year'] = homicide_data['reported_date'].dt.year
homicide_data['reported_month'] = homicide_data['reported_date'].dt.strftime('%B')
homicide_data['reported_weekday'] = homicide_data['reported_date'].dt.day_name()

In [18]:
homicide_data['reported_weekday'].value_counts()

Sunday       7850
Saturday     7619
Monday       6853
Friday       6446
Tuesday      6331
Wednesday    6256
Thursday     6123
Name: reported_weekday, dtype: int64

In [19]:
homicide_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47478 entries, 0 to 52178
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   uid               47478 non-null  object        
 1   reported_date     47478 non-null  datetime64[ns]
 2   victim_race       47478 non-null  object        
 3   victim_age        47478 non-null  object        
 4   victim_sex        47478 non-null  object        
 5   city              47478 non-null  object        
 6   state             47478 non-null  object        
 7   lat               47478 non-null  float64       
 8   lon               47478 non-null  float64       
 9   disposition       47478 non-null  object        
 10  reported_year     47478 non-null  int64         
 11  reported_month    47478 non-null  object        
 12  reported_weekday  47478 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(9)
memory usage: 5.1+ MB


In [20]:
# Convert 'victim_age' to numeric
homicide_data['victim_age'] = pd.to_numeric(homicide_data['victim_age'])

# Define the age bins and labels
age_bins = [0, 18, 30, 45, 64, float('inf')]
age_labels = ['0-17', '18-29', '30-44', '45-64', '65+']

# Bin the 'victim_age' column and create a new column 'age_range'
homicide_data['age_range'] = pd.cut(homicide_data['victim_age'], bins=age_bins, labels=age_labels, right=False)

In [21]:
homicide_data.age_range.value_counts()

18-29    21428
30-44    13461
45-64     7144
0-17      3916
65+       1529
Name: age_range, dtype: int64

In [22]:
# Define a function to map month values to seasons
def get_season(reported_month):
    if reported_month in ['December','January', 'February']:
        return 'Winter'
    elif reported_month in ['March', 'April', 'May']:
        return 'Spring'
    elif reported_month in ['June', 'July', 'August']:
        return 'Summer'
    elif reported_month in ['September', 'October', 'November']:
        return 'Fall'
    else:
        return 'Unknown'  # Handle invalid month values if any

# Apply the function to create a new "season" column
homicide_data['season'] = homicide_data['reported_month'].apply(get_season)

In [23]:
homicide_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47478 entries, 0 to 52178
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   uid               47478 non-null  object        
 1   reported_date     47478 non-null  datetime64[ns]
 2   victim_race       47478 non-null  object        
 3   victim_age        47478 non-null  int64         
 4   victim_sex        47478 non-null  object        
 5   city              47478 non-null  object        
 6   state             47478 non-null  object        
 7   lat               47478 non-null  float64       
 8   lon               47478 non-null  float64       
 9   disposition       47478 non-null  object        
 10  reported_year     47478 non-null  int64         
 11  reported_month    47478 non-null  object        
 12  reported_weekday  47478 non-null  object        
 13  age_range         47478 non-null  category      
 14  season            4747

In [24]:
# Create a new column 'LOCATION' by concatenating 'city' and 'state' with a comma separator
homicide_data['LOCATION'] = homicide_data['city'] + ', ' + homicide_data['state']

In [25]:
homicide_data['LOCATION'].unique()

array(['Albuquerque, NM', 'Atlanta, GA', 'Baltimore, MD',
       'Baton Rouge, LA', 'Birmingham, AL', 'Boston, MA', 'Buffalo, NY',
       'Charlotte, NC', 'Chicago, IL', 'Cincinnati, OH', 'Columbus, OH',
       'Denver, CO', 'Detroit, MI', 'Durham, NC', 'Fort Worth, TX',
       'Fresno, CA', 'Houston, TX', 'Indianapolis, IN',
       'Jacksonville, FL', 'Las Vegas, NV', 'Long Beach, CA',
       'Los Angeles, CA', 'Louisville, KY', 'Memphis, TN', 'Miami, FL',
       'Milwaukee, WI', 'Minneapolis, MN', 'Nashville, TN',
       'New Orleans, LA', 'New York, NY', 'Oakland, CA',
       'Oklahoma City, OK', 'Omaha, NE', 'Philadelphia, PA',
       'Pittsburgh, PA', 'Richmond, VA', 'San Antonio, TX',
       'Sacramento, CA', 'Savannah, GA', 'San Bernardino, CA',
       'San Diego, CA', 'San Francisco, CA', 'St. Louis, MO',
       'Stockton, CA', 'Tampa, FL', 'Tulsa, OK', 'Tulsa, AL',
       'Washington, DC'], dtype=object)

In [26]:
#There is no Tulsa, AL. The lon and lat indicate Oklahoma.
# Replace 'AL' with 'OK' for 'LOCATION' containing 'city' Tulsa.
homicide_data['LOCATION'] = homicide_data['LOCATION'].str.replace(r'Tulsa.*, AL$', 'Tulsa, OK', regex=True)

In [27]:
locations_to_keep = homicide_data['LOCATION'].unique()

In [28]:
homicide_data.head()

Unnamed: 0,uid,reported_date,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition,reported_year,reported_month,reported_weekday,age_range,season,LOCATION
0,Alb-000001,2010-05-04,Hispanic,78,Male,Albuquerque,NM,35.095788,-106.538555,No Arrest,2010,May,Tuesday,65+,Spring,"Albuquerque, NM"
1,Alb-000002,2010-02-16,Hispanic,17,Male,Albuquerque,NM,35.05681,-106.715321,Arrest Made,2010,February,Tuesday,0-17,Winter,"Albuquerque, NM"
2,Alb-000003,2010-06-01,White,15,Female,Albuquerque,NM,35.086092,-106.695568,No Arrest,2010,June,Tuesday,0-17,Summer,"Albuquerque, NM"
3,Alb-000004,2010-01-01,Hispanic,32,Male,Albuquerque,NM,35.078493,-106.556094,Arrest Made,2010,January,Friday,30-44,Winter,"Albuquerque, NM"
4,Alb-000005,2010-01-02,White,72,Female,Albuquerque,NM,35.130357,-106.580986,No Arrest,2010,January,Saturday,65+,Winter,"Albuquerque, NM"


***Population Data***

In [29]:
# Define the file paths using Path
Est_2010_csv = Path("Resources/SubEst2010.csv")
Est_2020_csv = Path("Resources/SubEst2020.csv")

# Load SubEst2010.csv into a DataFrame with the specified encoding
df_2010 = pd.read_csv(Est_2010_csv, encoding='latin1')

# Load SubEst2020.csv into a DataFrame with the specified encoding
df_2020 = pd.read_csv(Est_2020_csv, encoding='latin1')

In [30]:
state_abbreviations = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    'District of Columbia': 'DC'
}

In [31]:
df_2010.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,NAME,STNAME,ESTIMATESBASE2000,POPESTIMATE2000,POPESTIMATE2001,POPESTIMATE2002,POPESTIMATE2003,POPESTIMATE2004,POPESTIMATE2005,POPESTIMATE2006,POPESTIMATE2007,POPESTIMATE2008,POPESTIMATE2009,CENSUS2010POP,POPESTIMATE2010
0,40,1,0,0,0,Alabama,Alabama,4447207,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4779736,4785298
1,162,1,0,124,0,Abbeville city,Alabama,2989,2985,2941,2909,2882,2857,2820,2807,2784,2742,2714,2688,2689
2,162,1,0,460,0,Adamsville city,Alabama,5033,5021,4960,4894,4841,4784,4728,4687,4633,4594,4558,4522,4523
3,162,1,0,484,0,Addison town,Alabama,698,701,701,708,714,722,729,741,750,752,759,758,755
4,162,1,0,676,0,Akron town,Alabama,488,485,473,454,442,426,416,404,395,384,369,356,355


In [32]:
df_2010.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'NAME', 'STNAME',
       'ESTIMATESBASE2000', 'POPESTIMATE2000', 'POPESTIMATE2001',
       'POPESTIMATE2002', 'POPESTIMATE2003', 'POPESTIMATE2004',
       'POPESTIMATE2005', 'POPESTIMATE2006', 'POPESTIMATE2007',
       'POPESTIMATE2008', 'POPESTIMATE2009', 'CENSUS2010POP',
       'POPESTIMATE2010'],
      dtype='object')

In [33]:
df_2010['STNAME'] = df_2010['STNAME'].replace(state_abbreviations)

In [34]:
df_2010.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,NAME,STNAME,ESTIMATESBASE2000,POPESTIMATE2000,POPESTIMATE2001,POPESTIMATE2002,POPESTIMATE2003,POPESTIMATE2004,POPESTIMATE2005,POPESTIMATE2006,POPESTIMATE2007,POPESTIMATE2008,POPESTIMATE2009,CENSUS2010POP,POPESTIMATE2010
0,40,1,0,0,0,Alabama,AL,4447207,4452173,4467634,4480089,4503491,4530729,4569805,4628981,4672840,4718206,4757938,4779736,4785298
1,162,1,0,124,0,Abbeville city,AL,2989,2985,2941,2909,2882,2857,2820,2807,2784,2742,2714,2688,2689
2,162,1,0,460,0,Adamsville city,AL,5033,5021,4960,4894,4841,4784,4728,4687,4633,4594,4558,4522,4523
3,162,1,0,484,0,Addison town,AL,698,701,701,708,714,722,729,741,750,752,759,758,755
4,162,1,0,676,0,Akron town,AL,488,485,473,454,442,426,416,404,395,384,369,356,355


In [35]:
# Remove ' city' and ' town' from the 'NAME' column
df_2010['NAME'] = df_2010['NAME'].str.replace(r' city.*$', '', regex=True)
df_2010['NAME'] = df_2010['NAME'].str.replace(r' town.*$', '', regex=True)
df_2010['NAME'] = df_2010['NAME'].str.replace(r' village.*$', '', regex=True)
df_2010['NAME'] = df_2010['NAME'].str.replace(r'/.*$', '', regex=True)
df_2010['NAME'] = df_2010['NAME'].str.replace(r'-.*$', '', regex=True)

In [36]:
# Create a new column 'LOCATION' by concatenating 'NAME' and 'STNAME' with a comma separator
df_2010['LOCATION'] = df_2010['NAME'] + ', ' + df_2010['STNAME']

In [37]:
# Select and retain only the desired columns
columns_to_keep_2010 = ['NAME', 'STNAME', 'LOCATION', 'POPESTIMATE2007', 'POPESTIMATE2008', 'POPESTIMATE2009']
df_2010 = df_2010[columns_to_keep_2010]

In [38]:
df_2010.head()

Unnamed: 0,NAME,STNAME,LOCATION,POPESTIMATE2007,POPESTIMATE2008,POPESTIMATE2009
0,Alabama,AL,"Alabama, AL",4672840,4718206,4757938
1,Abbeville,AL,"Abbeville, AL",2784,2742,2714
2,Adamsville,AL,"Adamsville, AL",4633,4594,4558
3,Addison,AL,"Addison, AL",750,752,759
4,Akron,AL,"Akron, AL",395,384,369


In [39]:
df_2020.head()

Unnamed: 0,SUMLEV,STATE,COUNTY,PLACE,COUSUB,CONCIT,PRIMGEO_FLAG,FUNCSTAT,NAME,STNAME,...,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019,POPESTIMATE042020,POPESTIMATE2020
0,40,1,0,0,0,0,0,A,Alabama,Alabama,...,4816632,4831586,4843737,4854803,4866824,4877989,4891628,4907965,4920706,4921532
1,162,1,0,124,0,0,0,A,Abbeville city,Alabama,...,2645,2629,2610,2602,2587,2578,2565,2555,2555,2553
2,162,1,0,460,0,0,0,A,Adamsville city,Alabama,...,4453,4430,4399,4371,4335,4304,4285,4254,4224,4211
3,162,1,0,484,0,0,0,A,Addison town,Alabama,...,745,744,742,734,734,728,725,723,719,717
4,162,1,0,676,0,0,0,A,Akron town,Alabama,...,347,344,338,338,335,332,332,328,328,327


In [40]:
df_2020.columns

Index(['SUMLEV', 'STATE', 'COUNTY', 'PLACE', 'COUSUB', 'CONCIT',
       'PRIMGEO_FLAG', 'FUNCSTAT', 'NAME', 'STNAME', 'CENSUS2010POP',
       'ESTIMATESBASE2010', 'POPESTIMATE2010', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017',
       'POPESTIMATE2018', 'POPESTIMATE2019', 'POPESTIMATE042020',
       'POPESTIMATE2020'],
      dtype='object')

In [41]:
df_2020['STNAME'] = df_2020['STNAME'].replace(state_abbreviations)

In [42]:
# Remove ' city' and ' town' from the 'NAME' column
df_2020['NAME'] = df_2020['NAME'].str.replace(r' city.*$', '', regex=True)
df_2020['NAME'] = df_2020['NAME'].str.replace(r' town.*$', '', regex=True)
df_2020['NAME'] = df_2020['NAME'].str.replace(r' village.*$', '', regex=True)
df_2020['NAME'] = df_2020['NAME'].str.replace(r'/.*$', '', regex=True)
df_2020['NAME'] = df_2020['NAME'].str.replace(r'-.*$', '', regex=True)

In [43]:
# Create a new column 'LOCATION' by concatenating 'NAME' and 'STNAME' with a comma separator
df_2020['LOCATION'] = df_2020['NAME'] + ', ' + df_2020['STNAME']

In [44]:
# Select and retain only the desired columns
columns_to_keep_2020 = ['NAME', 'STNAME', 'LOCATION', 'CENSUS2010POP', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017']
df_2020 = df_2020[columns_to_keep_2020]

In [45]:
df_2020.head()

Unnamed: 0,NAME,STNAME,LOCATION,CENSUS2010POP,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017
0,Alabama,AL,"Alabama, AL",4779736,4799642,4816632,4831586,4843737,4854803,4866824,4877989
1,Abbeville,AL,"Abbeville, AL",2688,2694,2645,2629,2610,2602,2587,2578
2,Adamsville,AL,"Adamsville, AL",4522,4474,4453,4430,4399,4371,4335,4304
3,Addison,AL,"Addison, AL",758,750,745,744,742,734,734,728
4,Akron,AL,"Akron, AL",356,347,347,344,338,338,335,332


In [46]:
merged_df = df_2010.merge(df_2020, on=['NAME', 'STNAME', 'LOCATION'], how='inner')

In [47]:
merged_df.columns

Index(['NAME', 'STNAME', 'LOCATION', 'POPESTIMATE2007', 'POPESTIMATE2008',
       'POPESTIMATE2009', 'CENSUS2010POP', 'POPESTIMATE2011',
       'POPESTIMATE2012', 'POPESTIMATE2013', 'POPESTIMATE2014',
       'POPESTIMATE2015', 'POPESTIMATE2016', 'POPESTIMATE2017'],
      dtype='object')

In [48]:
# Create a mapping dictionary for column renaming
column_name_mapping = {
    'POPESTIMATE2007': '2007',
    'POPESTIMATE2008': '2008',
    'POPESTIMATE2009': '2009',
    'CENSUS2010POP': '2010',
    'POPESTIMATE2011': '2011',
    'POPESTIMATE2012': '2012',
    'POPESTIMATE2013': '2013',
    'POPESTIMATE2014': '2014',
    'POPESTIMATE2015': '2015',
    'POPESTIMATE2016': '2016',
    'POPESTIMATE2017': '2017'
}

In [49]:
# Rename the columns based on the mapping dictionary
merged_df = merged_df.rename(columns=column_name_mapping)

In [50]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268916 entries, 0 to 268915
Data columns (total 14 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   NAME      268916 non-null  object
 1   STNAME    268916 non-null  object
 2   LOCATION  268916 non-null  object
 3   2007      268916 non-null  int64 
 4   2008      268916 non-null  int64 
 5   2009      268916 non-null  int64 
 6   2010      268916 non-null  object
 7   2011      268916 non-null  int64 
 8   2012      268916 non-null  int64 
 9   2013      268916 non-null  int64 
 10  2014      268916 non-null  int64 
 11  2015      268916 non-null  int64 
 12  2016      268916 non-null  int64 
 13  2017      268916 non-null  int64 
dtypes: int64(10), object(4)
memory usage: 30.8+ MB


In [51]:
# Remove duplicate rows based on 'LOCATION' column
merged_df = merged_df.drop_duplicates(subset=['LOCATION'])

In [52]:
# Find the row where 'LOCATION' is equal to 'Your_Location'
matching_rows = merged_df[merged_df['LOCATION'].str.contains('Tulsa', case=False, na=False)]

# Display the specific location
print(matching_rows)

                           NAME STNAME                     LOCATION    2007  \
204972                    Tulsa     OK                    Tulsa, OK  385779   
205371             Tulsa County     OK             Tulsa County, OK  582094   
205372  Balance of Tulsa County     OK  Balance of Tulsa County, OK   33388   

          2008    2009    2010    2011    2012    2013    2014    2015  \
204972  387130  390339  391906  392937  394616  398419  399945  403576   
205371  588510  597748  603403  609323  615376  623978  631241  640851   
205372   33845   34450   34828   34197   34378   34809   35162   35600   

          2016    2017  
204972  404086  402177  
205371  646328  646874  
205372   35878   35954  


In [53]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39274 entries, 0 to 268915
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NAME      39274 non-null  object
 1   STNAME    39274 non-null  object
 2   LOCATION  39274 non-null  object
 3   2007      39274 non-null  int64 
 4   2008      39274 non-null  int64 
 5   2009      39274 non-null  int64 
 6   2010      39274 non-null  object
 7   2011      39274 non-null  int64 
 8   2012      39274 non-null  int64 
 9   2013      39274 non-null  int64 
 10  2014      39274 non-null  int64 
 11  2015      39274 non-null  int64 
 12  2016      39274 non-null  int64 
 13  2017      39274 non-null  int64 
dtypes: int64(10), object(4)
memory usage: 4.5+ MB


In [54]:
# Specify the path for the new CSV file where you want to save the filtered data
output_csv = Path("data/all_pop_data.csv")

# Save the filtered DataFrame to the new CSV file
merged_df.to_csv(output_csv, index=False, header=True)

***Combine Homicide data with Population Data***

In [55]:
filtered_pop_df = merged_df[merged_df['LOCATION'].isin(locations_to_keep)]
filtered_pop_df.head()

Unnamed: 0,NAME,STNAME,LOCATION,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
180,Birmingham,AL,"Birmingham, AL",218880,216505,214394,212237,211554,210775,211270,211179,211811,211512,211125
6143,Fresno,CA,"Fresno, CA",477659,484443,490262,494665,501623,505554,508993,514052,517408,520305,523938
6475,Long Beach,CA,"Long Beach, CA",460328,460643,461782,462257,464274,466526,468017,468785,469408,468484,466265
6495,Los Angeles,CA,"Los Angeles, CA",3751872,3763566,3781938,3792621,3818812,3847857,3877721,3904102,3933644,3957520,3975067
6699,Oakland,CA,"Oakland, CA",383500,386589,389613,390724,395416,400720,406609,412677,418539,421454,421938


***Create a new table for specific date***

In [56]:
filtered_pop_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47 entries, 180 to 260113
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   NAME      47 non-null     object
 1   STNAME    47 non-null     object
 2   LOCATION  47 non-null     object
 3   2007      47 non-null     int64 
 4   2008      47 non-null     int64 
 5   2009      47 non-null     int64 
 6   2010      47 non-null     object
 7   2011      47 non-null     int64 
 8   2012      47 non-null     int64 
 9   2013      47 non-null     int64 
 10  2014      47 non-null     int64 
 11  2015      47 non-null     int64 
 12  2016      47 non-null     int64 
 13  2017      47 non-null     int64 
dtypes: int64(10), object(4)
memory usage: 5.5+ KB


In [57]:
id_vars = ['NAME', 'STNAME', 'LOCATION']
value_vars = ['2007', '2008', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017']

In [58]:
df_melt = pd.melt(filtered_pop_df,id_vars=id_vars,value_vars=value_vars,var_name="YEAR", value_name="POPULATION")
df_melt['YEAR'] = df_melt.YEAR.astype(int)
df_melt['POPULATION'] = df_melt.POPULATION.astype(int)

In [59]:
df_melt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   NAME        517 non-null    object
 1   STNAME      517 non-null    object
 2   LOCATION    517 non-null    object
 3   YEAR        517 non-null    int32 
 4   POPULATION  517 non-null    int32 
dtypes: int32(2), object(3)
memory usage: 16.3+ KB


In [60]:
df_final = pd.merge(homicide_data, df_melt, left_on = ['LOCATION','reported_year'],
                    right_on = ['LOCATION', 'YEAR'], how = 'left')

In [61]:
df_final.head()

Unnamed: 0,uid,reported_date,victim_race,victim_age,victim_sex,city,state,lat,lon,disposition,reported_year,reported_month,reported_weekday,age_range,season,LOCATION,NAME,STNAME,YEAR,POPULATION
0,Alb-000001,2010-05-04,Hispanic,78,Male,Albuquerque,NM,35.095788,-106.538555,No Arrest,2010,May,Tuesday,65+,Spring,"Albuquerque, NM",Albuquerque,NM,2010,545852
1,Alb-000002,2010-02-16,Hispanic,17,Male,Albuquerque,NM,35.05681,-106.715321,Arrest Made,2010,February,Tuesday,0-17,Winter,"Albuquerque, NM",Albuquerque,NM,2010,545852
2,Alb-000003,2010-06-01,White,15,Female,Albuquerque,NM,35.086092,-106.695568,No Arrest,2010,June,Tuesday,0-17,Summer,"Albuquerque, NM",Albuquerque,NM,2010,545852
3,Alb-000004,2010-01-01,Hispanic,32,Male,Albuquerque,NM,35.078493,-106.556094,Arrest Made,2010,January,Friday,30-44,Winter,"Albuquerque, NM",Albuquerque,NM,2010,545852
4,Alb-000005,2010-01-02,White,72,Female,Albuquerque,NM,35.130357,-106.580986,No Arrest,2010,January,Saturday,65+,Winter,"Albuquerque, NM",Albuquerque,NM,2010,545852


In [62]:
# Select and retain only the desired columns
final_columns_to_keep = ['uid', 'disposition','victim_sex','victim_race', 'victim_age', 'age_range',
                         'reported_date', 'reported_year','reported_month', 'reported_weekday', 'season',
                         'city', 'state', 'lat', 'lon', 'LOCATION', 'POPULATION']
df_final = df_final[final_columns_to_keep]

In [63]:
df_final.head()

Unnamed: 0,uid,disposition,victim_sex,victim_race,victim_age,age_range,reported_date,reported_year,reported_month,reported_weekday,season,city,state,lat,lon,LOCATION,POPULATION
0,Alb-000001,No Arrest,Male,Hispanic,78,65+,2010-05-04,2010,May,Tuesday,Spring,Albuquerque,NM,35.095788,-106.538555,"Albuquerque, NM",545852
1,Alb-000002,Arrest Made,Male,Hispanic,17,0-17,2010-02-16,2010,February,Tuesday,Winter,Albuquerque,NM,35.05681,-106.715321,"Albuquerque, NM",545852
2,Alb-000003,No Arrest,Female,White,15,0-17,2010-06-01,2010,June,Tuesday,Summer,Albuquerque,NM,35.086092,-106.695568,"Albuquerque, NM",545852
3,Alb-000004,Arrest Made,Male,Hispanic,32,30-44,2010-01-01,2010,January,Friday,Winter,Albuquerque,NM,35.078493,-106.556094,"Albuquerque, NM",545852
4,Alb-000005,No Arrest,Female,White,72,65+,2010-01-02,2010,January,Saturday,Winter,Albuquerque,NM,35.130357,-106.580986,"Albuquerque, NM",545852


In [64]:
# Specify the path for the new CSV file where you want to save the filtered data
output_csv = Path("data/ml_clean_homicide_data.csv")

# Save the filtered DataFrame to the new CSV file
df_final.to_csv(output_csv, index=False, header=True)