In [14]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [15]:
import os
import numpy as np
import pandas as pd
import datetime
import sweetviz as sv
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


In [16]:
# Relative folders for the raw inputs and the processed output. Both are later
# handed to os.chdir, so they resolve against whatever the working directory is
# at that moment.
ip_path = "..//Base Data//"
op_path = "..//Processed Data//"

In [17]:
# Switch into the input folder, then load the two raw arrest extracts by bare
# file name. NOTE: os.chdir changes the working directory for every later cell.
os.chdir(ip_path)
arrest_data_df_2019 = pd.read_csv("Arrest_Data_from_2010_to_2019.csv")
arrest_data_df_current = pd.read_csv("Arrest_Data_from_2020_to_Present.csv")

In [18]:
# Excel workbook read by preProcessArrests, one sheet per entry of lookup_cols.
lookup_file = "Arrests Lookup Table.xlsx"
# Columns removed from the arrests frame at the end of preprocessing.
drop_cols = ["Cross Street", "Booking Date", "Booking Time", "Booking Location", "Booking Location Code", "Charge"]
# Each name is both a sheet name in lookup_file and the merge key; the key column
# itself is dropped after the merge.
lookup_cols = ["Descent Code", "Arrest Type Code"]

In [19]:
def floatToTime(time_float):
    """Convert an HHMM-encoded number (e.g. ``1845.0``) into a ``datetime.time``.

    Args:
        time_float: Value whose integer form is ``hours * 100 + minutes``.

    Returns:
        datetime.time: The decoded time. An hour component of 24 is clamped
        to 23:59 because ``datetime.time`` cannot represent 24:00.
        The input itself is returned unchanged when it cannot be converted
        to an integer (NaN, None, infinities, non-numeric text).

    Raises:
        ValueError: From ``datetime.time`` when the decoded hour or minute is
            outside the valid range (e.g. ``1275`` or a negative number).
    """
    try:
        time_int = int(time_float)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "leave the value alone"; anything else
        # (KeyboardInterrupt, SystemExit, ...) must propagate.
        return time_float

    hours, minutes = divmod(time_int, 100)

    if hours == 24:
        hours, minutes = 23, 59

    return datetime.time(hours, minutes)

In [20]:
def getWeekYear(x):
    """Build a ``"<year> Wk <week>"`` label from a date using the ISO calendar.

    The year comes from the ISO calendar as well as the week. Days around
    New Year can belong to a week of the neighbouring year (2019-12-30 is in
    ISO week 1 of 2020); pairing the ISO week with the calendar year would
    label that day "2019 Wk 1" and merge it with the first week of January 2019.

    Args:
        x: Object exposing ``isocalendar()`` (``datetime.date``,
            ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        str: Label such as ``"2010 Wk 25"``.
    """
    iso_parts = x.isocalendar()
    iso_year = iso_parts[0]
    iso_week = iso_parts[1]

    return str(iso_year) + " Wk " + str(iso_week)

In [21]:
def preProcessArrests(df, lookup_file, lookup_cols, drop_cols, ip_path):
    """Clean the raw arrests frame and attach lookup descriptions.

    Steps:
        1. Convert ``Time`` with ``floatToTime``.
        2. Strip the ``12:00:00 AM`` suffix from ``Arrest Date`` and parse it
           as ``%m/%d/%Y``.
        3. Add ``Arrest Week Year`` via ``getWeekYear``.
        4. For every name in ``lookup_cols``, left-merge the sheet of that name
           from ``lookup_file`` on that column, then drop the key column.
        5. Drop ``drop_cols``.

    Args:
        df (pd.DataFrame): Raw arrests data. It is not modified.
        lookup_file (str): File name of the Excel lookup workbook.
        lookup_cols (list): Merge keys; each is also a sheet name in the workbook.
        drop_cols (list): Columns to remove from the result.
        ip_path (str): Folder containing ``lookup_file``.

    Returns:
        pd.DataFrame: A new, preprocessed frame.
    """
    # Work on a copy so the caller's frame stays raw and this call can be
    # repeated on the same input (parsing an already-parsed date column fails).
    df = df.copy()

    df["Time"] = df["Time"].apply(floatToTime)

    # Vectorised clean-up and parsing of the date strings.
    arrest_dates = (
        df["Arrest Date"]
        .str.replace("12:00:00 AM", "", regex=False)
        .str.strip()
    )
    df["Arrest Date"] = pd.to_datetime(arrest_dates, format="%m/%d/%Y")
    df["Arrest Week Year"] = df["Arrest Date"].apply(getWeekYear)

    # Address the workbook by path rather than os.chdir, so the function leaves
    # the process working directory alone.
    lookup_path = os.path.join(ip_path, lookup_file)
    for col in lookup_cols:
        lookup_df = pd.read_excel(lookup_path, sheet_name=col)
        df = df.merge(lookup_df, how="left", on=col).drop(columns=col)

    return df.drop(columns=drop_cols)

In [22]:
def _isCategoricalLike(series):
    """Return True for columns that pd.get_dummies would one-hot encode."""
    dtype = series.dtype
    return (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )


def _buildImputationFeatures(df, target, columns, max_categories):
    """Return the one-hot encoded predictor matrix used to impute ``target``.

    Built on a separate frame so the frame being imputed keeps all its columns.
    """
    # The target, every other column still awaiting imputation and the raw
    # date column are not used as predictors.
    excluded = set(columns) | {target, "Arrest Date"}
    candidates = df.drop(columns=[name for name in df.columns if name in excluded])
    # Datetime-like columns are left out as predictors as well.
    candidates = candidates.select_dtypes(exclude=["datetime", "datetimetz", "timedelta"])

    # One-hot encoding a high-cardinality column would explode the feature count.
    high_cardinality = [
        name for name in candidates.columns
        if _isCategoricalLike(candidates[name])
        and candidates[name].nunique() > max_categories
    ]
    return pd.get_dummies(candidates.drop(columns=high_cardinality))


def imputeMissingValues(df, columns, mode_threshold=0.09, max_categories=15):
    """
    Impute missing values in the specified columns.

    A column whose share of missing values is below ``mode_threshold`` is
    filled with its most frequent value. Otherwise a random forest is trained
    on the rows where the column is present (regressor for numeric columns,
    classifier for everything else) and its predictions fill the gaps.
    The name of each processed column is printed as a progress indicator.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        columns (list): Column names to impute missing values for. These
            columns are never used as predictors for one another.
        mode_threshold (float): Missing-value ratio below which the mode is
            used instead of a model.
        max_categories (int): Categorical predictors with more distinct values
            than this are left out of the model.

    Returns:
        pd.DataFrame: A copy of ``df`` with the same columns and the requested
        columns imputed.

    Raises:
        ValueError: If a column that needs a model has no observed values to
            train on.
    """
    # Copy the input DataFrame to avoid modifying the original
    df_imputed = df.copy()
    columns = list(columns)

    for col in columns:
        print(col)
        missing_mask = df_imputed[col].isnull().to_numpy()
        if not missing_mask.any():
            continue

        if missing_mask.mean() < mode_threshold:
            # Assign back instead of fillna(inplace=True) on a column
            # selection, which is chained assignment.
            most_frequent = df_imputed[col].mode()[0]
            df_imputed[col] = df_imputed[col].fillna(most_frequent)
            continue

        if missing_mask.all():
            raise ValueError(
                "Cannot impute column {!r}: it has no observed values to train on".format(col)
            )

        features = _buildImputationFeatures(df_imputed, col, columns, max_categories)

        target = df_imputed[col]
        is_numeric_target = (
            pd.api.types.is_numeric_dtype(target)
            and not pd.api.types.is_bool_dtype(target)
        )
        if is_numeric_target:
            rf = RandomForestRegressor(n_estimators=100, random_state=42)
        else:
            # For categorical columns, use a random forest classifier
            rf = RandomForestClassifier(n_estimators=100, random_state=42)

        rf.fit(features.loc[~missing_mask], df_imputed.loc[~missing_mask, col])

        # Update the missing values in the DataFrame
        df_imputed.loc[missing_mask, col] = rf.predict(features.loc[missing_mask])

    return df_imputed

In [23]:
# Stack the two extracts row-wise; ignore_index gives the result a fresh 0..n-1 index.
arrest_data_complete_df = pd.concat([arrest_data_df_2019, arrest_data_df_current], axis = 0, ignore_index = True)

In [24]:
# Parse times and dates, attach the lookup descriptions and drop the unused columns.
arrests_preprocessed_df = preProcessArrests(arrest_data_complete_df, lookup_file, lookup_cols, drop_cols, ip_path)

In [25]:
# Select every column that contains at least one null and impute those columns.
# imputeMissingValues prints each column name as it processes it.
cols_with_missing_values = arrests_preprocessed_df.columns[arrests_preprocessed_df.isnull().sum() > 0]
arrests_preprocessed_df = imputeMissingValues(arrests_preprocessed_df, cols_with_missing_values)

Time
Charge Group Code
Charge Group Description
Charge Description
Disposition Description
Mapped Arrest Type Value


In [26]:
# Preview the first rows of the imputed frame.
arrests_preprocessed_df.head()

Unnamed: 0,Report ID,Report Type,Arrest Date,Time,Area ID,Area Name,Reporting District,Age,Sex Code,Charge Group Code,Charge Group Description,Charge Description,Disposition Description,Address,LAT,LON,Location,Arrest Week Year,Mapped Descent Value,Mapped Arrest Type Value
0,2377805,BOOKING,2010-06-22,18:45:00,16,Foothill,1664,46,F,6.0,Larceny,GRAND THEFT MONEY/PROPERTY > $400,MISDEMEANOR COMPLAINT FILED,PENDLETON,34.2375,-118.3745,POINT (-118.3745 34.2375),2010 Wk 25,White,Felony
1,121920046,RFC,2012-09-28,09:30:00,19,Mission,1998,60,M,24.0,Miscellaneous Other Violations,DRUNK DRIVING ALCOHOL/DRUGS,MISDEMEANOR COMPLAINT FILED,7600 WOODMAN AV,34.2111,-118.4309,POINT (-118.4309 34.2111),2012 Wk 39,Hispanic/Latin/Mexican,Misdemeanor
2,101820989,RFC,2010-10-20,07:00:00,18,Southeast,1829,14,M,8.0,Other Assaults,BATTERY,MISDEMEANOR COMPLAINT FILED,7200 QUAIL DR,34.1025,-118.2091,POINT (-118.2091 34.1025),2010 Wk 42,Hispanic/Latin/Mexican,Misdemeanor
3,90712341,RFC,2011-03-10,09:40:00,7,Wilshire,776,45,M,18.0,Drunkeness,DRINKING IN PUBLIC,MISDEMEANOR COMPLAINT FILED,4500 W WASHINGTON BL,34.0399,-118.3375,POINT (-118.3375 34.0399),2011 Wk 10,Black,Misdemeanor
4,121909585,RFC,2012-04-01,17:15:00,19,Mission,1993,37,M,24.0,Miscellaneous Other Violations,DRUNK DRIVING ALCOHOL/DRUGS,MISDEMEANOR COMPLAINT FILED,8100 SEPULVEDA PL,34.2208,-118.4662,POINT (-118.4662 34.2208),2012 Wk 13,Hispanic/Latin/Mexican,Misdemeanor


In [None]:
# Switch into the output folder (op_path is relative to the current working
# directory) and write the processed data without the index column.
os.chdir(op_path)
arrests_preprocessed_df.to_csv("Arrests Data Pre-Processed V3.csv", index = False)