This is our notebook for our preliminary data cleaning and logistic regression model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from dateutil import parser

In [2]:
df = pd.read_csv("data.csv")
print(df.size)

141984612


In [3]:
def convert_to_datetime(row):
    dt = parser.parse(row["Date"])
    return dt

def label_month(row):
    dt = row["Datetime"]
    return dt.month

def label_hour(row):
    dt = row["Datetime"]
    return dt.year

In [4]:
# Drop all data not within last 5 years
df = df[df['Year'].map(int) > 2013]

In [5]:
# Convert Date to Month and Hour
df["Datetime"] = df.apply(lambda row : convert_to_datetime(row), axis = 1)
df["Month"] = df.apply(lambda row : label_month(row), axis = 1)
df["Hour"] = df.apply(lambda row : label_hour(row), axis = 1)

df['sin_hour'] = np.sin(2*np.pi*df["Hour"]/24)
df['cos_hour'] = np.cos(2*np.pi*df["Hour"]/24)

df['sin_month'] = np.sin(2*np.pi*(df["Month"] - 1)/12)
df['cos_month'] = np.cos(2*np.pi*(df["Month"] - 1)/12)

In [38]:
# Convert district to int and arrest and domestic to binary 0, 1 value
df["District"] = df["District"].astype(int) 
df[["Arrest", "Domestic"]] = df[["Arrest", "Domestic"]].astype(int)

In [6]:
# Drop unnecessary columns
df.drop(["ID", "Case Number", "Date", "IUCR", "Block", "Beat", "Ward", "FBI Code", "Updated On", "Location", "Year", "Datetime", "Month", "Hour"], inplace=True, axis=1)

In [7]:
# Drop rows with crime type that we don't think matter
df = df[
    (df["Primary Type"] != "GAMBLING") &
    (df["Primary Type"] != "LIQUOR LAW VIOLATION") &
    (df["Primary Type"] != "PROSTITUTION") &
    (df["Primary Type"] != "NARCOTICS") &
    (df["Primary Type"] != "PUBLIC INDECENCY") 
    ]

In [14]:
print(df.size)

12112815


In [None]:
# Drop all location features except for District for preliminary model 
df.drop(["X Coordinate", "Y Coordinate", "Latitude", "Longitude", "Community Area"], inplace=True, axis=1)

In [39]:
df[:100]

Unnamed: 0,Primary Type,Description,Location Description,Arrest,Domestic,District,Month,Hour,sin_hour,cos_hour,sin_month,cos_month
23,ASSAULT,SIMPLE,PARKING LOT/GARAGE(NON.RESID.),0,0,8,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
66,ASSAULT,SIMPLE,APARTMENT,0,1,7,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
304,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,7,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
674,OTHER OFFENSE,TELEPHONE THREAT,APARTMENT,0,0,25,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
696,THEFT,$500 AND UNDER,TAXICAB,0,0,8,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
826,CRIMINAL DAMAGE,TO PROPERTY,APARTMENT,0,0,8,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
1172,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,16,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
1300,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,0,1,10,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
1344,BATTERY,DOMESTIC BATTERY SIMPLE,STREET,1,0,5,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17
1386,CRIMINAL TRESPASS,TO STATE SUP LAND,AIRPORT TERMINAL LOWER LEVEL - NON-SECURE AREA,1,0,16,4,2016,7.847643e-15,1.000000,1.000000e+00,6.123234e-17


In [52]:
# One Hot Encoded (Categorical Data): Primary Type, Description, Location Description, District
crime = pd.get_dummies(df, columns =['Primary Type', 'Description', 'Location Description', 'District'])
crime.drop(["Month", "Hour"], axis=1, inplace=True)

In [53]:
crime[:100]

Unnamed: 0,Arrest,Domestic,sin_hour,cos_hour,sin_month,cos_month,Primary Type_ARSON,Primary Type_ASSAULT,Primary Type_BATTERY,Primary Type_BURGLARY,...,District_15,District_16,District_17,District_18,District_19,District_20,District_22,District_24,District_25,District_31
23,0,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
66,0,1,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
304,0,1,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
674,0,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
696,0,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
826,0,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1172,1,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1300,0,1,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1344,1,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1386,1,0,7.847643e-15,1.000000,1.000000e+00,6.123234e-17,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
