We will predict how long, in days, a complaint takes to be closed. This is a regression problem.

In [69]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns

# Show every column when a DataFrame is rendered, rather than eliding the middle ones.
pd.options.display.max_columns = None

In [70]:
# Read the cleaned complaints table; the CSV's first column is used as the row index.
filepath = "../data/complaints_cleaned.csv"
complaints = pd.read_csv(filepath_or_buffer=filepath, index_col=0)
complaints.head(5)

Unnamed: 0,unique_mos_id,first_name,last_name,command_now,shield_no,complaint_id,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,outcome_description,board_disposition,date_received,date_closed,days_taken,rank_incident,rank_now,complainant_won
0,10004.0,Jonathan,Ruiz,078 PCT,8409.0,42835.0,7.0,2019.0,5.0,2020.0,078 PCT,POM,POM,Hispanic,M,32.0,Black,Female,38.0,Abuse of Authority,Failure to provide RTKA card,78.0,Report-domestic dispute,No arrest made or summons issued,Substantiated (Command Lvl Instructions),2019-07-01,2020-05-01,305 days,Police Officer,Police Officer,1.0
1,10007.0,John,Sears,078 PCT,5952.0,24601.0,11.0,2011.0,8.0,2012.0,PBBS,POM,POM,White,M,24.0,Black,Male,26.0,Discourtesy,Action,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),2011-11-01,2012-08-01,274 days,Police Officer,Police Officer,1.0
2,10007.0,John,Sears,078 PCT,5952.0,24601.0,11.0,2011.0,8.0,2012.0,PBBS,POM,POM,White,M,24.0,Black,Male,26.0,Offensive Language,Race,67.0,Moving violation,Moving violation summons issued,Substantiated (Charges),2011-11-01,2012-08-01,274 days,Police Officer,Police Officer,1.0
3,10007.0,John,Sears,078 PCT,5952.0,26146.0,7.0,2012.0,9.0,2013.0,PBBS,POM,POM,White,M,25.0,Black,Male,45.0,Abuse of Authority,Question,67.0,PD suspected C/V of violation/crime - street,No arrest made or summons issued,Substantiated (Charges),2012-07-01,2013-09-01,427 days,Police Officer,Police Officer,1.0
4,10009.0,Noemi,Sierra,078 PCT,24058.0,40253.0,8.0,2018.0,2.0,2019.0,078 PCT,POF,POF,Hispanic,F,39.0,,,16.0,Force,Physical force,67.0,Report-dispute,Arrest - other violation/crime,Substantiated (Command Discipline A),2018-08-01,2019-02-01,184 days,Police Officer,Police Officer,1.0


We won't focus on imputation for this project: rows with any missing value are simply dropped. The filtered dataset still has enough rows (about 26k) to train a model on.

In [46]:
# Convert `days_taken` from strings such as "305 days" to a number of days.
# The dtype guard keeps this cell safe to re-run: once the column is numeric,
# the `.str` accessor would raise AttributeError.
if not pd.api.types.is_numeric_dtype(complaints['days_taken']):
    complaints['days_taken'] = pd.to_numeric(complaints['days_taken'].str.split().str[0])

# No imputation: drop every row that has any missing value, then drop the unique
# identification columns. `errors='ignore'` turns the column drop into a no-op on
# a re-run, when those columns are already gone.
complaints = complaints.dropna().drop(
    columns=['complaint_id', 'unique_mos_id', 'shield_no'],
    errors='ignore',
)

In [47]:
# Hold out 20% of the rows for evaluation. X keeps every remaining column at this point;
# the baseline model's ColumnTransformer picks out the complainant gender/age/ethnicity
# columns, which are the ones our inferential tests related to case duration.
# A fixed random_state makes the shuffle, and therefore the reported scores, reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    complaints.drop(columns=['days_taken']),
    complaints['days_taken'],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
display(X_train.head())
display(y_train.head())

Unnamed: 0,first_name,last_name,command_now,month_received,year_received,month_closed,year_closed,command_at_incident,rank_abbrev_incident,rank_abbrev_now,mos_ethnicity,mos_gender,mos_age_incident,complainant_ethnicity,complainant_gender,complainant_age_incident,fado_type,allegation,precinct,contact_reason,outcome_description,board_disposition,date_received,date_closed,rank_incident,rank_now,complainant_won
6535,Freddy,Dominguez,D-E T/F,8.0,2004.0,10.0,2005.0,043 PCT,SGT,SDS,Hispanic,M,38.0,Black,Male,50.0,Abuse of Authority,Vehicle stop,43.0,PD suspected C/V of violation/crime - bldg,Arrest - other violation/crime,Unsubstantiated,2004-08-01,2005-10-01,Sergeant,Sergeant Detective Squad,0.0
13405,John,Zanca,I.A.B.,1.0,2015.0,9.0,2015.0,040 PCT,POM,SGT,White,M,24.0,Black,Male,33.0,Abuse of Authority,Search (of person),40.0,Other violation of VTL,Arrest - other violation/crime,Unsubstantiated,2015-01-01,2015-09-01,Police Officer,Sergeant,0.0
424,Mark,Xylas,079 PCT,7.0,2015.0,10.0,2015.0,081 PCT,POM,SGT,White,M,28.0,Black,Male,21.0,Abuse of Authority,Vehicle search,81.0,PD suspected C/V of violation/crime - auto,Arrest - other violation/crime,Exonerated,2015-07-01,2015-10-01,Police Officer,Sergeant,0.0
20159,Norman,Grandstaff,PBBS,2.0,2006.0,5.0,2006.0,070 PCT,SGT,INS,White,M,30.0,Black,Male,26.0,Force,Physical force,70.0,PD suspected C/V of violation/crime - street,Summons - disorderly conduct,Exonerated,2006-02-01,2006-05-01,Sergeant,Inspector,0.0
9983,David,Nisthaus,E S U,12.0,2004.0,1.0,2006.0,NARCBMN,SGT,LT,Hispanic,M,39.0,Black,Male,44.0,Abuse of Authority,Refusal to provide name/shield number,24.0,Execution of search warrant,Arrest - other violation/crime,Unsubstantiated,2004-12-01,2006-01-01,Sergeant,Lieutenant,0.0


6535     426.0
13405    243.0
424       92.0
20159     89.0
9983     396.0
Name: days_taken, dtype: float64

In [48]:
# Baseline model: one-hot encode the two categorical complainant features, pass the
# complainant's age through unchanged, drop every other column, then fit a linear regression.
ct = ColumnTransformer(
    [
        # handle_unknown='ignore' encodes a category that never appeared in the training
        # split as all zeros instead of raising when the test split is scored.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ['complainant_gender', 'complainant_ethnicity']),
        ('age_ignore', 'passthrough', ['complainant_age_incident']),
    ],
    remainder='drop',
    # Always return a dense array. With the default threshold the output is sparse or
    # dense depending on the data's density, so calling `.toarray()` on it is fragile.
    sparse_threshold=0,
)
pl = Pipeline([('ct', ct), ('lr', LinearRegression())])
pl.fit(X_train, y_train)
pl.named_steps['ct'].transform(X_train)  # preview the encoded training features

array([[ 0.,  0.,  1., ...,  0.,  0., 50.],
       [ 0.,  0.,  1., ...,  0.,  0., 33.],
       [ 0.,  0.,  1., ...,  0.,  0., 21.],
       ...,
       [ 1.,  0.,  0., ...,  0.,  0., 28.],
       [ 0.,  0.,  1., ...,  0.,  0., 24.],
       [ 0.,  0.,  1., ...,  1.,  0., 25.]])

In [38]:
# Evaluate the baseline on the held-out split. Predictions are computed once and passed
# to the metric as (y_true, y_pred), the argument order scikit-learn documents.
y_pred = pl.predict(X_test)
print(f"Baseline R^2: {pl.score(X_test, y_test)}")
print(f"Baseline MSE: {mean_squared_error(y_test, y_pred)}")

Baseline R^2: 0.014160187406212388
Baseline MSE: 21620.405516546394
