In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Read the zipped CSV into a DataFrame.  low_memory=False makes pandas infer
# each column's dtype from the whole file rather than chunk by chunk.
csv_archive = "database.csv.zip"
data = pd.read_csv(csv_archive, low_memory=False)

In [3]:
# Keep only rows whose "Perpetrator Age" is a usable number and store the
# column as integers.
# - pd.to_numeric(errors="coerce") turns blank / non-numeric entries into NaN,
#   and also works if the column was already parsed as numeric (the previous
#   `.str.contains(' ') == False` filter requires a string column).
# - .copy() detaches the filtered frame so the column assignment below (and the
#   encoding cells that follow) do not raise SettingWithCopyWarning.
age_numeric = pd.to_numeric(data["Perpetrator Age"], errors="coerce")
valid_age = age_numeric.notna()
data = data.loc[valid_age].copy()
data["Perpetrator Age"] = age_numeric.loc[valid_age].astype(int)

# Data Preprocessing

In [4]:
# Remove, in place, every record whose crime was not solved.
unsolved = data["Crime Solved"] == "No"
idx_to_drop = data.index[unsolved.to_numpy()]
data.drop(index=idx_to_drop, inplace=True)

In [5]:
# Drop the record / agency bookkeeping columns in place.
data.drop(
    columns=[
        "Record ID",
        "Agency Code",
        "Agency Name",
        "Agency Type",
        "Record Source",
    ],
    inplace=True,
)

In [6]:
# LabelEncoder maps category labels to integer codes.
from sklearn.preprocessing import LabelEncoder
# One encoder instance; the encoding cells below call fit_transform on it.
encoder=LabelEncoder()

In [7]:
# Encode the perpetrator's sex as integers: 0 = Unknown, 1 = Male, 2 = Female.
# Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object, which scikit-learn classifiers reject as
# a target ("Unknown label type").
perpetrator_sex_codes = {"Unknown": 0, "Male": 1, "Female": 2}
data["Perpetrator Sex"] = data["Perpetrator Sex"].map(perpetrator_sex_codes)

In [8]:
# Encode the victim's sex as integers: 0 = Unknown, 1 = Male, 2 = Female.
# Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
victim_sex_codes = {"Unknown": 0, "Male": 1, "Female": 2}
data["Victim Sex"] = data["Victim Sex"].map(victim_sex_codes)

In [9]:
# Encode the perpetrator's race as integers (0 marks "Unknown").
# Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object, which scikit-learn classifiers reject as
# a target ("Unknown label type").
perpetrator_race_codes = {
    'Unknown': 0,
    'Native American/Alaska Native': 1,
    'White': 2,
    'Black': 3,
    'Asian/Pacific Islander': 4,
}
data["Perpetrator Race"] = data["Perpetrator Race"].map(perpetrator_race_codes)

In [10]:
# Encode the victim's race as integers (0 marks "Unknown").
# Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
victim_race_codes = {
    'Unknown': 0,
    'Native American/Alaska Native': 1,
    'White': 2,
    'Black': 3,
    'Asian/Pacific Islander': 4,
}
data["Victim Race"] = data["Victim Race"].map(victim_race_codes)

In [11]:
# Keep only records with a victim age below 100.
# NOTE(review): presumably this removes placeholder/implausible ages - confirm
# against the dataset documentation.
# .copy() detaches the result from the original frame so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data[data['Victim Age'] < 100].copy()

In [12]:
# Encode the perpetrator's ethnicity: 0 = Unknown, 1 = Not Hispanic,
# 2 = Hispanic.  Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object, which scikit-learn classifiers reject as
# a target ("Unknown label type").
perpetrator_ethnicity_codes = {"Unknown": 0, "Not Hispanic": 1, "Hispanic": 2}
data["Perpetrator Ethnicity"] = data["Perpetrator Ethnicity"].map(
    perpetrator_ethnicity_codes
)

In [13]:
# Encode the victim's ethnicity: 0 = Unknown, 1 = Not Hispanic, 2 = Hispanic.
# Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
victim_ethnicity_codes = {"Unknown": 0, "Not Hispanic": 1, "Hispanic": 2}
data["Victim Ethnicity"] = data["Victim Ethnicity"].map(victim_ethnicity_codes)

In [14]:
# Label-encode "Relationship" as int64 codes.
# The shared `encoder` is re-fitted by this call, so afterwards its classes_
# describe this column only (until the next fit_transform).
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column,
# which would discard the int64 dtype requested here.
data["Relationship"] = encoder.fit_transform(data["Relationship"]).astype("int64")

In [15]:
# Label-encode "Weapon" as int64 codes.
# The shared `encoder` is re-fitted by this call, so afterwards its classes_
# describe this column only (until the next fit_transform).
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column,
# which would discard the int64 dtype requested here.
data["Weapon"] = encoder.fit_transform(data["Weapon"]).astype("int64")

In [16]:
# Label-encode "City" as int64 codes.
# The shared `encoder` is re-fitted by this call, so afterwards its classes_
# describe this column only (until the next fit_transform).
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column,
# which would discard the int64 dtype requested here.
data["City"] = encoder.fit_transform(data["City"]).astype("int64")

In [17]:
# Label-encode "State" as int64 codes.
# The shared `encoder` is re-fitted by this call, so afterwards its classes_
# describe this column only (until the next fit_transform).
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column,
# which would discard the int64 dtype requested here.
data["State"] = encoder.fit_transform(data["State"]).astype("int64")

In [18]:
# Convert month names to their calendar number (January = 1 ... December = 12).
# .astype("int64") raises if any month name is missing from the mapping,
# because unmapped values become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column,
# which would discard the int64 dtype requested here.
month_numbers = {
    'January': 1,
    "February": 2,
    "March": 3,
    'April': 4,
    'May': 5,
    'June': 6,
    'July': 7,
    'August': 8,
    'September': 9,
    'October': 10,
    'November': 11,
    'December': 12,
}
data["Month"] = data["Month"].map(month_numbers).astype("int64")

In [19]:
# Encode "Crime Solved" as 1 = Yes, 0 = No.  Labels missing from the mapping
# become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
crime_solved_codes = {"Yes": 1, "No": 0}
data["Crime Solved"] = data["Crime Solved"].map(crime_solved_codes)

In [20]:
# Encode "Crime Type": 1 = Murder or Manslaughter,
# 2 = Manslaughter by Negligence.  Labels missing from the mapping become NaN.
# Assign through data[...] rather than data.loc[:, ...]: since pandas 2.0 a
# full-column .loc assignment writes into the existing object-dtype column, so
# the codes would stay dtype=object instead of becoming numeric.
crime_type_codes = {
    'Murder or Manslaughter': 1,
    'Manslaughter by Negligence': 2,
}
data["Crime Type"] = data["Crime Type"].map(crime_type_codes)

# Supervised Machine learning

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Keep only rows where the perpetrator's race, sex and ethnicity are all
# different from the code 0 ("Unknown").
known_perpetrator = (
    (data["Perpetrator Race"] != 0)
    & (data["Perpetrator Sex"] != 0)
    & (data["Perpetrator Ethnicity"] != 0)
)
newdata = data[known_perpetrator]

In [23]:
# Feature matrix: incident and victim attributes.
feature_columns = [
    "State", "Year", "Month",
    "Victim Count", "Perpetrator Count",
    "Victim Sex", "Victim Age", "Victim Race", "Victim Ethnicity",
    "Weapon", "Relationship", "Crime Type",
]
# Targets: the three perpetrator attributes to predict.
target_columns = ["Perpetrator Sex", "Perpetrator Race", "Perpetrator Ethnicity"]

x = newdata[feature_columns]
y = newdata[target_columns]

In [24]:
# 80/20 train/test split of features and all three targets, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=6,
    test_size=0.2,
)

In [25]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

#### Predicting Perpetrator's Sex

1. Logistic Regression

In [26]:
# Logistic regression for the perpetrator's sex, scored by test-set accuracy.
loregsex = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
sex_train_labels = y_train["Perpetrator Sex"]
sex_test_labels = y_test["Perpetrator Sex"]

loregsex.fit(x_train, sex_train_labels)
y_sex_pred = loregsex.predict(x_test)

lr_sex_accuracy = metrics.accuracy_score(sex_test_labels, y_sex_pred)
print(lr_sex_accuracy)

0.8897771507693916


2. Random Forest

In [27]:
# Random forest (100 trees) for the perpetrator's sex, scored by test-set
# accuracy.  random_state fixes the forest's internal randomness so that
# re-running the cell reproduces the same model and the same score; the value
# matches the seed used for the main train/test split.
clfsex = RandomForestClassifier(n_estimators=100, random_state=6)
clfsex.fit(x_train, y_train["Perpetrator Sex"])
y_sex_pred = clfsex.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Sex"], y_sex_pred))

0.9356271390129843


3. K-nearest neighbors

In [28]:
# k-nearest-neighbours classifier (k = 20) for the perpetrator's sex, scored by
# test-set accuracy.
knnsex = KNeighborsClassifier(n_neighbors=20)
knnsex.fit(x_train, y_train["Perpetrator Sex"])
y_sex_pred = knnsex.predict(x_test)

knn_sex_accuracy = metrics.accuracy_score(
    y_true=y_test["Perpetrator Sex"],
    y_pred=y_sex_pred,
)
print(knn_sex_accuracy)

0.8941923348224783


The best model in this case is the Random Forest

#### Predicting Perpetrator's Race

1. Logistic Regression

In [29]:
# One indicator column per race code found in the target; the column labels
# are the race codes themselves.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas >= 2.0 would otherwise return True/False booleans).
one_hot = pd.get_dummies(y['Perpetrator Race'], dtype=int)


In [30]:
# Preview the first five rows of the race indicator columns.
one_hot.head(n=5)

Unnamed: 0,1,2,3,4
44,1,0,0,0
52,0,0,1,0
53,0,1,0,0
54,0,0,1,0
58,0,1,0,0


In [31]:
# Append the race indicator columns to the targets.
# Any indicator columns already present in `y` (from a previous run of this
# cell) are dropped first; a plain y.join(one_hot) raises on the second run
# because the column names overlap.
y = y.drop(columns=one_hot.columns, errors="ignore").join(one_hot)

In [32]:
# Preview the first five rows of the targets with the indicator columns added.
y.head(n=5)

Unnamed: 0,Perpetrator Sex,Perpetrator Race,Perpetrator Ethnicity,1,2,3,4
44,1,1,1,1,0,0,0
52,1,3,1,0,0,1,0
53,2,2,1,0,1,0,0
54,1,3,1,0,0,1,0
58,1,2,1,0,1,0,0


In [33]:
## Race codes (also the names of the one-hot columns):
## 1  Native American/Alaska Native
## 2  White
## 3  Black
## 4  Asian/Pacific Islander

1. 
    a. Predicting the Native American/Alaska Native race

In [34]:
# 80/20 split for the binary target in column 1 of `y`
# (indicator for race code 1, Native American/Alaska Native).
# random_state makes the split, and therefore the reported accuracy,
# reproducible; the value is the seed already used for the main split.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x, y[1], test_size=0.2, random_state=6
)

In [35]:
# Binary logistic regression on the race-code-1 indicator, scored by test-set
# accuracy.
loregrace1 = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
loregrace1.fit(x_train1, y_train1)
y_nativeamerican_pred = loregrace1.predict(x_test1)

nativeamerican_accuracy = metrics.accuracy_score(
    y_true=y_test1,
    y_pred=y_nativeamerican_pred,
)
print(nativeamerican_accuracy)

0.990856127700708


1. b. Predicting the White race

In [36]:
# 80/20 split for the binary target in column 2 of `y`
# (indicator for race code 2, White).
# random_state makes the split, and therefore the reported accuracy,
# reproducible; the value is the seed already used for the main split.
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x, y[2], test_size=0.2, random_state=6
)

In [37]:
# Binary logistic regression on the race-code-2 indicator, scored by test-set
# accuracy.
loregrace2 = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
loregrace2.fit(x_train2, y_train2)
y_white_pred = loregrace2.predict(x_test2)

white_accuracy = metrics.accuracy_score(y_true=y_test2, y_pred=y_white_pred)
print(white_accuracy)

0.8765054732607048


1. c. Predicting the Black race

In [38]:
# 80/20 split for the binary target in column 3 of `y`
# (indicator for race code 3, Black).
# random_state makes the split, and therefore the reported accuracy,
# reproducible; the value is the seed already used for the main split.
x_train3, x_test3, y_train3, y_test3 = train_test_split(
    x, y[3], test_size=0.2, random_state=6
)

In [39]:
# Binary logistic regression on the race-code-3 indicator, scored by test-set
# accuracy.
loregrace3 = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
loregrace3.fit(x_train3, y_train3)
y_black_pred = loregrace3.predict(x_test3)

black_accuracy = metrics.accuracy_score(y_true=y_test3, y_pred=y_black_pred)
print(black_accuracy)

0.8753820832353633


1. d. Predicting the Asian/Pacific Islander race

In [40]:
# 80/20 split for the binary target in column 4 of `y`
# (indicator for race code 4, Asian/Pacific Islander).
# random_state makes the split, and therefore the reported accuracy,
# reproducible; the value is the seed already used for the main split.
x_train4, x_test4, y_train4, y_test4 = train_test_split(
    x, y[4], test_size=0.2, random_state=6
)

In [41]:
# Binary logistic regression on the race-code-4 indicator, scored by test-set
# accuracy.
loregrace4 = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
loregrace4.fit(x_train4, y_train4)
y_asian_pred = loregrace4.predict(x_test4)

asian_accuracy = metrics.accuracy_score(y_true=y_test4, y_pred=y_asian_pred)
print(asian_accuracy)

0.9833581524152886


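Because we divided the single race target into four binary (one-vs-rest) targets, each logistic regression model answers a narrower question, and the accuracies it reports are higher. Note that these per-class binary accuracies are not directly comparable with the multi-class accuracies of the models below.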

2. Random Forest

In [42]:
# Random forest (100 trees) for the perpetrator's race (multi-class), scored by
# test-set accuracy.  random_state fixes the forest's internal randomness so
# that re-running the cell reproduces the same model and the same score; the
# value matches the seed used for the main train/test split.
clfrace = RandomForestClassifier(n_estimators=100, random_state=6)
clfrace.fit(x_train, y_train["Perpetrator Race"])
y_race_pred = clfrace.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Race"], y_race_pred))

0.8647751913681846


3. K-nearest neighbors

In [43]:
# k-nearest-neighbours classifier (k = 20) for the perpetrator's race, scored
# by test-set accuracy.
knnrace = KNeighborsClassifier(n_neighbors=20)
knnrace.fit(x_train, y_train["Perpetrator Race"])
y_race_pred = knnrace.predict(x_test)

knn_race_accuracy = metrics.accuracy_score(
    y_true=y_test["Perpetrator Race"],
    y_pred=y_race_pred,
)
print(knn_race_accuracy)

0.6807482300075763


Once again, Random Forest is the best model


#### Perpetrator's Ethnicity

2. Random Forest

In [44]:
# Random forest (100 trees) for the perpetrator's ethnicity, scored by test-set
# accuracy.  random_state fixes the forest's internal randomness so that
# re-running the cell reproduces the same model and the same score; the value
# matches the seed used for the main train/test split.
clfethnicity = RandomForestClassifier(n_estimators=100, random_state=6)
clfethnicity.fit(x_train, y_train["Perpetrator Ethnicity"])
y_ethnicity_pred = clfethnicity.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Ethnicity"], y_ethnicity_pred))

0.8936175771350942


3. K-nearest neighbors

In [45]:
# k-nearest-neighbours classifier (k = 20) for the perpetrator's ethnicity,
# scored by test-set accuracy.
knnethnicity = KNeighborsClassifier(n_neighbors=20)
knnethnicity.fit(x_train, y_train["Perpetrator Ethnicity"])
y_ethnicity_pred = knnethnicity.predict(x_test)

knn_ethnicity_accuracy = metrics.accuracy_score(
    y_true=y_test["Perpetrator Ethnicity"],
    y_pred=y_ethnicity_pred,
)
print(knn_ethnicity_accuracy)

0.7951772604958591


1. Logistic Regression

In [46]:
# Logistic regression for the perpetrator's ethnicity, scored by test-set
# accuracy.
loregethnicity = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    dual=False,
    max_iter=1000,
)
ethnicity_train_labels = y_train["Perpetrator Ethnicity"]
ethnicity_test_labels = y_test["Perpetrator Ethnicity"]

loregethnicity.fit(x_train, ethnicity_train_labels)
y_ethnicity_pred = loregethnicity.predict(x_test)

lr_ethnicity_accuracy = metrics.accuracy_score(
    ethnicity_test_labels, y_ethnicity_pred
)
print(lr_ethnicity_accuracy)

0.8981633879353136


The accuracy using logistic regression is slightly higher than the accuracy using the random forest. Note that `max_iter=1000` only limits the number of solver iterations; the logistic regression is still fitted on the whole training set, so the iteration limit does not by itself explain the difference.

#### Predicting perpetrator's Age

See the next part.