This is our notebook for data cleaning.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from dateutil import parser
from datetime import datetime as DAMN

In [None]:
def convert_to_datetime(row):
    """Parse the row's "Date" entry into a datetime.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            "Date" entry accepted by ``dateutil.parser.parse``.

    Returns:
        Whatever ``parser.parse`` produces for ``row["Date"]``.
    """
    return parser.parse(row["Date"])

def label_month(row):
    """Return the month number of the row's "Datetime" entry.

    Args:
        row: Mapping-like record whose "Datetime" entry exposes a
            ``month`` attribute (e.g. a ``datetime``).

    Returns:
        The ``month`` attribute of ``row["Datetime"]``.
    """
    return row["Datetime"].month

def label_hour(row):
    """Return the hour of day of the row's "Datetime" entry.

    The previous implementation returned ``dt.year``, which made the
    "Hour" column (and the 24-period sin/cos features built from it) hold
    the year instead of the hour.

    Args:
        row: Mapping-like record whose "Datetime" entry exposes an
            ``hour`` attribute (e.g. a ``datetime``).

    Returns:
        The ``hour`` attribute of ``row["Datetime"]`` (0-23 for a
        ``datetime``).
    """
    return row["Datetime"].hour

In [None]:
# Load the raw dataset from data.csv (path is relative to the working directory).
df = pd.read_csv("data.csv")
# Display the (rows, columns) dimensions of the frame as loaded.
df.shape

In [None]:
# Keep only the rows whose Year (converted to int) is 2014 or later.
df = df.loc[df["Year"].map(int) >= 2014]
df.shape

In [None]:
# Build the parsed timestamp first, then derive Month and Hour from it.
# Each helper receives one row at a time (axis=1).
df["Datetime"] = df.apply(convert_to_datetime, axis=1)
df["Month"] = df.apply(label_month, axis=1)
df["Hour"] = df.apply(label_hour, axis=1)

# Cyclical (sine/cosine) encoding of "Hour" with a period of 24.
df["sin_hour"] = np.sin(2 * np.pi * df["Hour"] / 24)
df["cos_hour"] = np.cos(2 * np.pi * df["Hour"] / 24)

# Same encoding for "Month", shifted to start at 0, with a period of 12.
df["sin_month"] = np.sin(2 * np.pi * (df["Month"] - 1) / 12)
df["cos_month"] = np.cos(2 * np.pi * (df["Month"] - 1) / 12)

In [None]:
# Remove, in place, the columns that are not used from here on, including
# the intermediate Datetime/Month/Hour columns.
df.drop(
    columns=[
        "ID",
        "Case Number",
        "Date",
        "IUCR",
        "FBI Code",
        "Location",
        "Year",
        "Datetime",
        "Month",
        "Hour",
        "Updated On",
    ],
    inplace=True,
)

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Display the (rows, columns) dimensions after the column drop.
df.shape

In [None]:
# Exclude the crime categories we decided are not relevant.
df = df[
    ~df["Primary Type"].isin(
        [
            "GAMBLING",
            "LIQUOR LAW VIOLATION",
            "PROSTITUTION",
            "NARCOTICS",
            "PUBLIC INDECENCY",
        ]
    )
]

In [None]:
# Display the (rows, columns) dimensions after filtering on "Primary Type".
df.shape

In [None]:
# Discard, in place, every row that contains at least one missing value.
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Display the (rows, columns) dimensions after removing rows with missing values.
df.shape

In [None]:
# Cast the area/coordinate columns to int, Latitude/Longitude to float, and
# Arrest/Domestic to int (intended as 0/1 flags).
df = df.astype(
    {
        "District": int,
        "Ward": int,
        "Beat": int,
        "Community Area": int,
        "X Coordinate": int,
        "Y Coordinate": int,
        "Latitude": float,
        "Longitude": float,
        "Arrest": int,
        "Domestic": int,
    }
)

In [None]:
# Display the (rows, columns) dimensions after the dtype conversions.
df.shape

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# Plot a heatmap of pairwise correlations.
# Only numeric/boolean columns are correlated: text columns such as
# "Primary Type" are still in the frame, and DataFrame.corr() raises on them
# in pandas >= 2.0 (earlier versions silently left them out, which is the
# behaviour reproduced here).
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr())
plt.show()

In [None]:
# df.to_csv('crime.csv')