In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
data=pd.read_csv("database.csv.zip",low_memory=False)

In [3]:
data= data[data["Perpetrator Age"].str.contains(' ') == False]
data["Perpetrator Age"] = data["Perpetrator Age"].astype(int)

# Preprocessing

In [4]:
idx_to_drop=data[data["Crime Solved"]=="No"].index
data.drop(idx_to_drop,inplace=True)

In [5]:
data.drop(["Record ID","Agency Code","Agency Name","Agency Type","Record Source"],axis=1,inplace=True)

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [7]:
data.loc[:,"Perpetrator Sex"]=data.loc[:,"Perpetrator Sex"].map({"Male":int(1),"Female":int(2),"Unknown":int(0)})

In [8]:
data.loc[:,"Victim Sex"]=data.loc[:,"Victim Sex"].map({"Male":int(1),"Female":int(2),"Unknown":int(0)})

In [9]:
data.loc[:,"Perpetrator Race"]=data.loc[:,"Perpetrator Race"].map({'Native American/Alaska Native':int(1),'White':int(2),'Black':int(3),'Asian/Pacific Islander':int(4),'Unknown':int(0)})

In [10]:
data.loc[:,"Victim Race"]=data.loc[:,"Victim Race"].map({'Native American/Alaska Native':int(1),'White':int(2),'Black':int(3),'Asian/Pacific Islander':int(4),'Unknown':int(0)})

In [11]:
data.loc[:,"Perpetrator Ethnicity"]=data.loc[:,"Perpetrator Ethnicity"].map({"Unknown":int(0),"Not Hispanic":int(1),"Hispanic":int(2)})

In [12]:
data=data[data['Victim Age']<100]

In [13]:
data.loc[:,"Victim Ethnicity"]=data.loc[:,"Victim Ethnicity"].map({"Unknown":int(0),"Not Hispanic":int(1),"Hispanic":int(2)})

In [14]:
data.loc[:,"Relationship"]=(encoder.fit_transform(data.loc[:,"Relationship"])).astype("int64")

In [15]:
data.loc[:,"Weapon"]=(encoder.fit_transform(data.loc[:,"Weapon"])).astype("int64")

In [16]:
data.loc[:,"City"]=(encoder.fit_transform(data.loc[:,"City"])).astype("int64")

In [17]:
data.loc[:,"State"]=(encoder.fit_transform(data.loc[:,"State"])).astype("int64")

In [18]:
data.loc[:,"Month"]=data.loc[:,"Month"].map(({'January':int(1),"February":int(2),"March":int(3),'April':int(4),'May':int(5),'June':int(6),'July':int(7),'August':int(8),'September':int(9),'October':int(10),'November':int(11),'December':int(12)})).astype("int64")

In [19]:
data.loc[:,"Crime Solved"]=data.loc[:,"Crime Solved"].map({"Yes":int(1),"No":int(0)})

In [20]:
data.loc[:,"Crime Type"]=data.loc[:,"Crime Type"].map({'Murder or Manslaughter':int(1),'Manslaughter by Negligence':int(2)})

# Supervised ML part2 

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

#### Predicting perpetrator's Age

In [22]:
newdata=data[(data["Perpetrator Race"]!=0)& (data["Perpetrator Sex"]!=0)]
newdata=newdata[newdata["Perpetrator Ethnicity"]!=0]

In [37]:
bins = [-1, 12, 18, 60, np.inf]
labels = ['child', 'teenager', 'adult', 'elder']
age_groups = pd.cut(newdata["Perpetrator Age"], bins, labels=labels)
newdata['age_group'] = age_groups

In [38]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y=newdata[["age_group"]]

In [39]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

1. Random Forest

In [23]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y=newdata[["Perpetrator Age"]]

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [25]:
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV 

In [26]:
n_estimators=[int(x)for x in np.linspace(start=200,stop=2000,num=10)]
random_grid={'n_estimators':n_estimators}

In [27]:
param_grid={'n_estimators':[100,200,300,500]}
rf=RandomForestRegressor()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=2,n_jobs=-1,verbose=2)

In [None]:
grid_search.fit(x_train,y_train)
grid_search.best_params_

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
clfage=RandomForestClassifier(n_estimators=100)
clfage.fit(x_train,y_train.values.ravel())
y_age_pred=clfage.predict(x_test)
print(metrics.accuracy_score(y_test,y_age_pred))

2. logistic regression 

In [None]:
one_hot=pd.get_dummies(y)
one_hot

In [None]:
y=y.join(one_hot)

In [None]:
y.head()

2. a. predicting child group

In [None]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y_child=y[["age_group_child"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_child,test_size=0.2)

loregchild=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loregchild.fit(x_train,y_train.values.ravel())
y_child_pred=loregchild.predict(x_test)
print(metrics.accuracy_score(y_test,y_child_pred))

2. b. predicting teenager group

In [None]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y_teen=y[["age_group_teenager"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_teen,test_size=0.2)

loregteen=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loregteen.fit(x_train,y_train.values.ravel())
y_teen_pred=loregteen.predict(x_test)
print(metrics.accuracy_score(y_test,y_teen_pred))

2. c. predicting adult group

In [33]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y_adult=y[["age_group_adult"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_adult,test_size=0.2)

loregadult=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loregadult.fit(x_train,y_train.values.ravel())
y_adult_pred=loregadult.predict(x_test)
print(metrics.accuracy_score(y_test,y_adult_pred))

0.8086606769745089


2. d. predicting elder group

In [35]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y_elder=y[["age_group_elder"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_elder,test_size=0.2)

loregelder=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loregelder.fit(x_train,y_train.values.ravel())
y_elder_pred=loregelder.predict(x_test)
print(metrics.accuracy_score(y_test,y_elder_pred))

0.9676138737985792


### predicting crime decade 

In [23]:
bins = [1979, 1990, 2000, 2010, 2020]
labels = ['80s', '90s', '2000s', '2010s']
year_groups = pd.cut(newdata["Year"], bins, labels=labels)
newdata['year_group'] = year_groups

In [24]:
x=newdata[["State","Year","Month","Victim Count","Perpetrator Count",
           "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type"]]
y=newdata[["year_group"]]

In [25]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [26]:
one_hot=pd.get_dummies(y)
one_hot

Unnamed: 0,year_group_80s,year_group_90s,year_group_2000s,year_group_2010s
44,1,0,0,0
52,1,0,0,0
53,1,0,0,0
54,1,0,0,0
58,1,0,0,0
...,...,...,...,...
638441,0,0,0,1
638442,0,0,0,1
638446,0,0,0,1
638452,0,0,0,1


In [27]:
y=y.join(one_hot)

In [28]:
y.isna().sum()

year_group          0
year_group_80s      0
year_group_90s      0
year_group_2000s    0
year_group_2010s    0
dtype: int64

In [30]:
x=newdata[["State","Month", "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type",'Perpetrator Age','Perpetrator Race','Perpetrator Ethnicity']]
y_80=y[["year_group_80s"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_80,test_size=0.2)

loreg80=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loreg80.fit(x_train,y_train.values.ravel())
y_80_pred=loreg80.predict(x_test)
print(metrics.accuracy_score(y_test,y_80_pred))

0.5834574287431095


In [31]:
x=newdata[["State","Month", "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type",'Perpetrator Age','Perpetrator Race','Perpetrator Ethnicity']]
y_90=y[["year_group_90s"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_90,test_size=0.2)

loreg90=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loreg90.fit(x_train,y_train.values.ravel())
y_90_pred=loreg90.predict(x_test)
print(metrics.accuracy_score(y_test,y_90_pred))

0.7700969250463725


In [32]:
x=newdata[["State","Month", "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type",'Perpetrator Age','Perpetrator Race','Perpetrator Ethnicity']]
y_2000=y[["year_group_2000s"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_2000,test_size=0.2)

loreg2000=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loreg2000.fit(x_train,y_train.values.ravel())
y_2000_pred=loreg2000.predict(x_test)
print(metrics.accuracy_score(y_test,y_2000_pred))

0.8042166313974449


In [33]:
x=newdata[["State","Month", "Victim Sex","Victim Age","Victim Race","Victim Ethnicity",
          "Weapon","Relationship","Crime Type",'Perpetrator Age','Perpetrator Race','Perpetrator Ethnicity']]
y_2010=y[["year_group_2010s"]]


x_train,x_test,y_train,y_test=train_test_split(x,y_2010,test_size=0.2)

loreg2010=LogisticRegression(solver='lbfgs',multi_class='auto',dual=False ,max_iter=1000)
loreg2010.fit(x_train,y_train.values.ravel())
y_2010_pred=loreg2010.predict(x_test)
print(metrics.accuracy_score(y_test,y_2010_pred))

0.9172610183661206


### Predicting crime State

In [26]:
newdata.corr()

Unnamed: 0,City,State,Year,Month,Incident,Crime Type,Crime Solved,Victim Sex,Victim Age,Victim Race,Victim Ethnicity,Perpetrator Sex,Perpetrator Age,Perpetrator Race,Perpetrator Ethnicity,Relationship,Weapon,Victim Count,Perpetrator Count
City,1.0,-0.144277,0.063315,0.001894,-0.00858,-0.009509,,0.00976,-0.001525,-0.05753,0.02267,-0.014666,0.006751,-0.05982,0.031367,0.018416,-0.007315,0.01125,0.017419
State,-0.144277,1.0,-0.025301,0.00474,-0.021022,0.021035,,0.014991,0.030995,0.04316,-0.100272,0.030609,0.000738,0.037686,-0.104696,-0.135498,0.022618,-0.007517,-0.035169
Year,0.063315,-0.025301,1.0,-0.010744,-0.050778,0.008779,,0.012265,-0.014195,-0.032646,0.118068,-0.068587,-0.034783,-0.028612,0.118416,0.150519,-0.03323,0.020945,0.050822
Month,0.001894,0.00474,-0.010744,1.0,0.022239,0.001934,,-0.001681,0.001855,-0.002764,0.003553,-0.002948,-0.000227,-0.002427,0.004861,-0.002264,0.001438,-0.002722,-0.006916
Incident,-0.00858,-0.021022,-0.050778,0.022239,1.0,-0.001838,,-0.056865,-0.032141,0.085815,0.06459,-0.024667,-0.067014,0.079577,0.075069,0.096231,-0.004674,-0.01612,0.060459
Crime Type,-0.009509,0.021035,0.008779,0.001934,-0.001838,1.0,,0.013028,-0.080959,-0.038187,-0.014055,0.031247,-0.032626,-0.045042,-0.014018,-0.031034,0.042902,-0.009972,-0.034169
Crime Solved,,,,,,,,,,,,,,,,,,,
Victim Sex,0.00976,0.014991,0.012265,-0.001681,-0.056865,0.013028,,1.0,0.05729,-0.05517,-0.089801,-0.02188,0.178675,-0.040045,-0.076252,0.068783,-0.040962,0.088573,-0.104602
Victim Age,-0.001525,0.030995,-0.014195,0.001855,-0.032141,-0.080959,,0.05729,1.0,-0.066952,-0.117579,0.008046,0.310203,-0.035964,-0.099794,0.006297,0.003308,-0.02292,-0.037983
Victim Race,-0.05753,0.04316,-0.032646,-0.002764,0.085815,-0.038187,,-0.05517,-0.066952,1.0,-0.331588,0.059251,-0.044414,0.696266,-0.307654,-0.065651,0.010627,-0.035036,-0.028833


In [23]:
y=newdata[['State']]
x=newdata[['City' ,'Year' ,'Month' ,'Incident' ,'Crime Type'  ,'Victim Sex' ,'Victim Age','Victim Race','Victim Ethnicity',
           'Perpetrator Sex' ,'Perpetrator Age' ,'Perpetrator Race' ,'Perpetrator Ethnicity' ,
           'Relationship' ,'Weapon' ,'Victim Count' ,'Perpetrator Count']]

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=10)

In [33]:
from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier(n_neighbors=20)
knn.fit(x_train,y_train.values.ravel())
predicted=knn.predict(x_test)
metrics.accuracy_score(y_test,predicted)

0.7293675052903833