In [1]:
# Import libraries (data handling, numerics, plotting, dates).
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
# Switch on seaborn's default plot theme.
sns.set_theme()


In [2]:
# Load the preprocessed crime records. The first CSV column becomes the index,
# and the two timestamp columns are parsed as datetimes so `.dt` accessors work.
df = pd.read_csv(
    'preprocessing.csv',
    index_col=0,
    parse_dates=['date', 'reported'],
)
print(df.shape)
df.head()

(466246, 13)


Unnamed: 0,crime_type,crime_category,date,reported,y_lon,x_lat,neighborhood,dates,times,equity_score,pop_2010,crime_count,neighborhood_crime_rate
0,criminal-mischief-other,public-disorder,2022-01-04 11:30:00,2022-01-04 20:36:00,-105.018825,39.714268,Valverde,2022-01-04,11:30:00,1.6,3941.0,1,822.633849
1,criminal-mischief-other,public-disorder,2022-01-03 19:47:00,2022-01-03 21:12:00,-105.014892,39.711518,Valverde,2022-01-03,19:47:00,1.6,3941.0,1,822.633849
6,criminal-mischief-other,public-disorder,2019-06-19 16:30:00,2019-06-20 14:21:00,-105.01469,39.712854,Valverde,2019-06-19,16:30:00,1.6,3941.0,1,822.633849
7,criminal-mischief-other,public-disorder,2019-09-11 16:30:00,2019-09-13 14:55:00,-105.012846,39.71171,Valverde,2019-09-11,16:30:00,1.6,3941.0,1,822.633849
8,criminal-mischief-other,public-disorder,2021-11-08 02:42:00,2021-11-08 03:31:00,-105.024081,39.722317,Valverde,2021-11-08,02:42:00,1.6,3941.0,1,822.633849


In [3]:
# Bucket each incident's hour of day (0-23) into a named time of day.
# pd.cut builds right-closed intervals from the bin edges:
#   (0,5] Early Morning, (5,10] Morning, (10,14] Midday,
#   (14,18] Afternoon, (18,20] Evening, (20,24] Night.
# By default the lowest edge is excluded, so hour 0 (00:00-00:59) would fall
# outside every bin and become NaN. include_lowest=True closes the first
# interval on the left so midnight incidents are labelled 'Early Morning'.
bins = [0,5,10,14,18,20,24]
categories = ['Early Morning','Morning','Midday','Afternoon','Evening','Night']
ToD = pd.cut(df['date'].dt.hour, bins=bins, labels=categories, include_lowest=True)
# NOTE: DataFrame.insert raises if the column already exists, so this cell is
# meant to run once per freshly loaded frame.
df.insert(4,'time_of_day',ToD)

# Day name of the incident, plus a weekend flag (weekday 5 = Saturday, 6 = Sunday).
DOW = df['date'].dt.day_name()
df.insert(5,'day_of_week',DOW)
weekend = df["date"].dt.weekday >= 5
df.insert(6,'IsWeekend',weekend)

In [4]:
# Preview the frame to check the new time_of_day, day_of_week and IsWeekend columns.
df.head()

Unnamed: 0,crime_type,crime_category,date,reported,time_of_day,day_of_week,IsWeekend,y_lon,x_lat,neighborhood,dates,times,equity_score,pop_2010,crime_count,neighborhood_crime_rate
0,criminal-mischief-other,public-disorder,2022-01-04 11:30:00,2022-01-04 20:36:00,Midday,Tuesday,False,-105.018825,39.714268,Valverde,2022-01-04,11:30:00,1.6,3941.0,1,822.633849
1,criminal-mischief-other,public-disorder,2022-01-03 19:47:00,2022-01-03 21:12:00,Evening,Monday,False,-105.014892,39.711518,Valverde,2022-01-03,19:47:00,1.6,3941.0,1,822.633849
6,criminal-mischief-other,public-disorder,2019-06-19 16:30:00,2019-06-20 14:21:00,Afternoon,Wednesday,False,-105.01469,39.712854,Valverde,2019-06-19,16:30:00,1.6,3941.0,1,822.633849
7,criminal-mischief-other,public-disorder,2019-09-11 16:30:00,2019-09-13 14:55:00,Afternoon,Wednesday,False,-105.012846,39.71171,Valverde,2019-09-11,16:30:00,1.6,3941.0,1,822.633849
8,criminal-mischief-other,public-disorder,2021-11-08 02:42:00,2021-11-08 03:31:00,Early Morning,Monday,False,-105.024081,39.722317,Valverde,2021-11-08,02:42:00,1.6,3941.0,1,822.633849


# Combine crime categories into broader groups to make predictions easier

In [5]:
# List the distinct crime categories present before grouping.
df['crime_category'].unique()

array(['public-disorder', 'drug-alcohol', 'all-other-crimes',
       'traffic-accident', 'robbery', 'other-crimes-against-persons',
       'aggravated-assault', 'arson', 'burglary', 'larceny',
       'theft-from-motor-vehicle', 'auto-theft', 'white-collar-crime',
       'murder'], dtype=object)

In [6]:
# Members of each broader crime group, keyed by the original category labels.
theft = [
    'larceny',
    'auto-theft',
    'theft-from-motor-vehicle',
    'robbery',
    'burglary',
]
violent = [
    'aggravated-assault',
    'arson',
    'murder',
    'other-crimes-against-persons',
]
all_other_crimes = [
    'all-other-crimes',
    'white-collar-crime',
]
traffic = [
    'traffic-accident',
]
drug_and_disorder = [
    'public-disorder',
    'drug-alcohol',
]

In [7]:
# Overwrite each original category with the label of the group it belongs to.
# Rows whose category is in none of the lists are left unchanged.
group_members = {
    'theft': theft,
    'violent': violent,
    'all_other': all_other_crimes,
    'traffic': traffic,
    'drug_and_disorder': drug_and_disorder,
}

for group_label, members in group_members.items():
    in_group = df['crime_category'].isin(members)
    df.loc[in_group, 'crime_category'] = group_label

In [8]:
# Check how many distinct labels remain after grouping, and what they are.
print(df['crime_category'].nunique())
df['crime_category'].unique()

5


array(['drug_and_disorder', 'all_other', 'traffic', 'theft', 'violent'],
      dtype=object)

In [9]:
# Share of records in each grouped category (fractions rather than raw counts).
df['crime_category'].value_counts(normalize=True)

theft                0.364125
traffic              0.229162
all_other            0.172934
drug_and_disorder    0.154959
violent              0.078819
Name: crime_category, dtype: float64

# Extract Features from Dataset

In [10]:
# Preview the frame now that crime_category holds the grouped labels.
df.head()

Unnamed: 0,crime_type,crime_category,date,reported,time_of_day,day_of_week,IsWeekend,y_lon,x_lat,neighborhood,dates,times,equity_score,pop_2010,crime_count,neighborhood_crime_rate
0,criminal-mischief-other,drug_and_disorder,2022-01-04 11:30:00,2022-01-04 20:36:00,Midday,Tuesday,False,-105.018825,39.714268,Valverde,2022-01-04,11:30:00,1.6,3941.0,1,822.633849
1,criminal-mischief-other,drug_and_disorder,2022-01-03 19:47:00,2022-01-03 21:12:00,Evening,Monday,False,-105.014892,39.711518,Valverde,2022-01-03,19:47:00,1.6,3941.0,1,822.633849
6,criminal-mischief-other,drug_and_disorder,2019-06-19 16:30:00,2019-06-20 14:21:00,Afternoon,Wednesday,False,-105.01469,39.712854,Valverde,2019-06-19,16:30:00,1.6,3941.0,1,822.633849
7,criminal-mischief-other,drug_and_disorder,2019-09-11 16:30:00,2019-09-13 14:55:00,Afternoon,Wednesday,False,-105.012846,39.71171,Valverde,2019-09-11,16:30:00,1.6,3941.0,1,822.633849
8,criminal-mischief-other,drug_and_disorder,2021-11-08 02:42:00,2021-11-08 03:31:00,Early Morning,Monday,False,-105.024081,39.722317,Valverde,2021-11-08,02:42:00,1.6,3941.0,1,822.633849


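Keep the target (`crime_category`) and the candidate features; drop `crime_type`, the date/time variables (`date`, `reported`, `dates`, `times`), and the population, crime-count and crime-rate columns.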

In [11]:
# Target plus every candidate feature, selected by column name.
# Name-based selection does not depend on where earlier cells inserted the
# derived columns, and raises a KeyError if an expected column is missing
# instead of silently picking up a different one.
feature_columns = [
    'crime_category',
    'time_of_day',
    'day_of_week',
    'IsWeekend',
    'y_lon',
    'x_lat',
    'neighborhood',
    'equity_score',
]
df1 = df.loc[:, feature_columns]
df1.head()

Unnamed: 0,crime_category,time_of_day,day_of_week,IsWeekend,y_lon,x_lat,neighborhood,equity_score
0,drug_and_disorder,Midday,Tuesday,False,-105.018825,39.714268,Valverde,1.6
1,drug_and_disorder,Evening,Monday,False,-105.014892,39.711518,Valverde,1.6
6,drug_and_disorder,Afternoon,Wednesday,False,-105.01469,39.712854,Valverde,1.6
7,drug_and_disorder,Afternoon,Wednesday,False,-105.012846,39.71171,Valverde,1.6
8,drug_and_disorder,Early Morning,Monday,False,-105.024081,39.722317,Valverde,1.6


In [12]:
# Feature set 2: time of day, neighborhood and equity score.
df2 = df1[[
    'crime_category',
    'time_of_day',
    'neighborhood',
    'equity_score',
]]
df2.head()

Unnamed: 0,crime_category,time_of_day,neighborhood,equity_score
0,drug_and_disorder,Midday,Valverde,1.6
1,drug_and_disorder,Evening,Valverde,1.6
6,drug_and_disorder,Afternoon,Valverde,1.6
7,drug_and_disorder,Afternoon,Valverde,1.6
8,drug_and_disorder,Early Morning,Valverde,1.6


In [13]:
# Feature set 3: time of day, weekend flag, neighborhood and equity score.
df3 = df1[[
    'crime_category',
    'time_of_day',
    'IsWeekend',
    'neighborhood',
    'equity_score',
]]
df3.head()

Unnamed: 0,crime_category,time_of_day,IsWeekend,neighborhood,equity_score
0,drug_and_disorder,Midday,False,Valverde,1.6
1,drug_and_disorder,Evening,False,Valverde,1.6
6,drug_and_disorder,Afternoon,False,Valverde,1.6
7,drug_and_disorder,Afternoon,False,Valverde,1.6
8,drug_and_disorder,Early Morning,False,Valverde,1.6


In [14]:
# Feature set 4: time of day, day of week, weekend flag, neighborhood and equity score.
df4 = df1[[
    'crime_category',
    'time_of_day',
    'day_of_week',
    'IsWeekend',
    'neighborhood',
    'equity_score',
]]
df4.head()

Unnamed: 0,crime_category,time_of_day,day_of_week,IsWeekend,neighborhood,equity_score
0,drug_and_disorder,Midday,Tuesday,False,Valverde,1.6
1,drug_and_disorder,Evening,Monday,False,Valverde,1.6
6,drug_and_disorder,Afternoon,Wednesday,False,Valverde,1.6
7,drug_and_disorder,Afternoon,Wednesday,False,Valverde,1.6
8,drug_and_disorder,Early Morning,Monday,False,Valverde,1.6


In [15]:
# Feature set 5: time of day, weekend flag and equity score (no neighborhood).
df5 = df1[[
    'crime_category',
    'time_of_day',
    'IsWeekend',
    'equity_score',
]]
df5.head()

Unnamed: 0,crime_category,time_of_day,IsWeekend,equity_score
0,drug_and_disorder,Midday,False,1.6
1,drug_and_disorder,Evening,False,1.6
6,drug_and_disorder,Afternoon,False,1.6
7,drug_and_disorder,Afternoon,False,1.6
8,drug_and_disorder,Early Morning,False,1.6


In [16]:
# Feature set 6: day of week, weekend flag and equity score (no neighborhood).
df6 = df1[[
    'crime_category',
    'day_of_week',
    'IsWeekend',
    'equity_score',
]]
df6.head()

Unnamed: 0,crime_category,day_of_week,IsWeekend,equity_score
0,drug_and_disorder,Tuesday,False,1.6
1,drug_and_disorder,Monday,False,1.6
6,drug_and_disorder,Wednesday,False,1.6
7,drug_and_disorder,Wednesday,False,1.6
8,drug_and_disorder,Monday,False,1.6


In [17]:
# Feature set 7: day of week, weekend flag, neighborhood and equity score.
df7 = df1[[
    'crime_category',
    'day_of_week',
    'IsWeekend',
    'neighborhood',
    'equity_score',
]]
df7.head()

Unnamed: 0,crime_category,day_of_week,IsWeekend,neighborhood,equity_score
0,drug_and_disorder,Tuesday,False,Valverde,1.6
1,drug_and_disorder,Monday,False,Valverde,1.6
6,drug_and_disorder,Wednesday,False,Valverde,1.6
7,drug_and_disorder,Wednesday,False,Valverde,1.6
8,drug_and_disorder,Monday,False,Valverde,1.6


In [18]:
# Write every feature set to its own CSV file in the working directory.
exports = [
    (df1, 'df1.csv'),
    (df2, 'df2.csv'),
    (df3, 'df3.csv'),
    (df4, 'df4.csv'),
    (df5, 'df5.csv'),
    (df6, 'df6.csv'),
    (df7, 'df7.csv'),
]
for frame, filename in exports:
    frame.to_csv(filename)