In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
# Read the raw records from the zipped CSV. low_memory=False makes pandas
# infer each column's dtype from the whole file rather than chunk by chunk.
data = pd.read_csv(
    "database.csv.zip",
    low_memory=False,
)

In [3]:
# Drop rows whose "Perpetrator Age" text contains a space (blank entries),
# then store the remaining ages as integers.
# na=True marks missing values for removal as well, which is what the former
# `== False` comparison did implicitly, and yields a plain boolean mask.
has_blank_age = data["Perpetrator Age"].str.contains(" ", regex=False, na=True)
# .copy() detaches the filtered frame from the original so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
data = data[~has_blank_age].copy()
data["Perpetrator Age"] = data["Perpetrator Age"].astype(int)

# Data Preprocessing

In [4]:
# Remove every record whose "Crime Solved" flag is "No"; `data` is modified
# in place, so all later cells see only the remaining rows.
idx_to_drop = data.index[data["Crime Solved"] == "No"]
data.drop(index=idx_to_drop, inplace=True)

In [5]:
# Remove the record/agency bookkeeping columns; none of them is selected as a
# feature or target in the modelling cells below.
data.drop(
    columns=[
        "Record ID",
        "Agency Code",
        "Agency Name",
        "Agency Type",
        "Record Source",
    ],
    inplace=True,
)

In [6]:
from sklearn.preprocessing import LabelEncoder
# One shared LabelEncoder. Each fit_transform call in the cells below re-fits
# it, so afterwards it only holds the classes of the last column it encoded.
encoder = LabelEncoder()

In [7]:
# Encode perpetrator sex as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Perpetrator Sex"] = data["Perpetrator Sex"].map(
    {"Unknown": 0, "Male": 1, "Female": 2}
)

In [8]:
# Encode victim sex as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Victim Sex"] = data["Victim Sex"].map(
    {"Unknown": 0, "Male": 1, "Female": 2}
)

In [9]:
# Encode perpetrator race as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Perpetrator Race"] = data["Perpetrator Race"].map(
    {
        "Unknown": 0,
        "Native American/Alaska Native": 1,
        "White": 2,
        "Black": 3,
        "Asian/Pacific Islander": 4,
    }
)

In [10]:
# Encode victim race as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Victim Race"] = data["Victim Race"].map(
    {
        "Unknown": 0,
        "Native American/Alaska Native": 1,
        "White": 2,
        "Black": 3,
        "Asian/Pacific Islander": 4,
    }
)

In [11]:
# Encode perpetrator ethnicity as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Perpetrator Ethnicity"] = data["Perpetrator Ethnicity"].map(
    {"Unknown": 0, "Not Hispanic": 1, "Hispanic": 2}
)

In [12]:
# Encode victim ethnicity as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Victim Ethnicity"] = data["Victim Ethnicity"].map(
    {"Unknown": 0, "Not Hispanic": 1, "Hispanic": 2}
)

In [13]:
# Label-encode "Relationship" into int64 codes using the shared encoder.
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, which
# would discard the int64 dtype produced by astype.
data["Relationship"] = encoder.fit_transform(data["Relationship"]).astype("int64")

In [14]:
# Label-encode "Weapon" into int64 codes using the shared encoder.
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, which
# would discard the int64 dtype produced by astype.
data["Weapon"] = encoder.fit_transform(data["Weapon"]).astype("int64")

In [15]:
# Label-encode "City" into int64 codes using the shared encoder.
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, which
# would discard the int64 dtype produced by astype.
data["City"] = encoder.fit_transform(data["City"]).astype("int64")

In [16]:
# Label-encode "State" into int64 codes using the shared encoder.
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, which
# would discard the int64 dtype produced by astype.
data["State"] = encoder.fit_transform(data["State"]).astype("int64")

In [17]:
# Convert month names to their calendar number (1-12) as int64. astype raises
# if any value is not one of the twelve names, because map would yield NaN.
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, which
# would discard the int64 dtype produced by astype.
data["Month"] = data["Month"].map(
    {
        "January": 1,
        "February": 2,
        "March": 3,
        "April": 4,
        "May": 5,
        "June": 6,
        "July": 7,
        "August": 8,
        "September": 9,
        "October": 10,
        "November": 11,
        "December": 12,
    }
).astype("int64")

In [18]:
# Encode the "Crime Solved" flag as 1 (Yes) / 0 (No).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Crime Solved"] = data["Crime Solved"].map({"Yes": 1, "No": 0})

In [19]:
# Encode crime type as integer codes (values missing from the dict map to NaN).
# Assign through data[col] rather than data.loc[:, col]: from pandas 2.0 on,
# the .loc form writes into the existing object-dtype column in place, so the
# encoded values would stay dtype object instead of becoming numeric.
data["Crime Type"] = data["Crime Type"].map(
    {"Murder or Manslaughter": 1, "Manslaughter by Negligence": 2}
)

# Supervised Machine learning

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Keep only rows where none of the three perpetrator attributes carries the
# code 0 (the value the "Unknown" category was mapped to above).
perpetrator_known = (
    (data["Perpetrator Race"] != 0)
    & (data["Perpetrator Sex"] != 0)
    & (data["Perpetrator Ethnicity"] != 0)
)
newdata = data[perpetrator_known]

In [22]:
# Model inputs: case context and victim attributes.
feature_columns = [
    "State",
    "Year",
    "Month",
    "Victim Count",
    "Perpetrator Count",
    "Victim Sex",
    "Victim Age",
    "Victim Race",
    "Victim Ethnicity",
    "Weapon",
    "Relationship",
    "Crime Type",
]
# Prediction targets: each model below is fitted on one of these columns.
target_columns = ["Perpetrator Sex", "Perpetrator Race", "Perpetrator Ethnicity"]

x = newdata[feature_columns]
y = newdata[target_columns]

In [23]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs, and all models below share this one split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=6,
)

In [24]:
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression 
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

#### Predicting Perpetrator's Sex

1. Logistic Regression

In [25]:
# Logistic regression on perpetrator sex; prints accuracy on the held-out set.
loregsex = LogisticRegression(
    solver="lbfgs",
    multi_class="auto",
    dual=False,
    max_iter=1000,
).fit(x_train, y_train["Perpetrator Sex"])
y_sex_pred = loregsex.predict(x_test)
print(metrics.accuracy_score(y_true=y_test["Perpetrator Sex"], y_pred=y_sex_pred))

0.8895737567906393


2. Random Forest

In [26]:
# Random forest on perpetrator sex; prints accuracy on the held-out set.
# random_state pins the forest's bootstrap/feature sampling so the score is
# reproducible on re-run (6 mirrors the seed used for train_test_split).
clfsex = RandomForestClassifier(n_estimators=100, random_state=6)
clfsex.fit(x_train, y_train["Perpetrator Sex"])
y_sex_pred = clfsex.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Sex"], y_sex_pred))

0.9351755119097367


3. K-nearest neighbors

In [27]:
# 20-nearest-neighbours classifier on perpetrator sex; prints held-out accuracy.
knnsex = KNeighborsClassifier(n_neighbors=20).fit(
    x_train, y_train["Perpetrator Sex"]
)
y_sex_pred = knnsex.predict(x_test)
print(metrics.accuracy_score(y_true=y_test["Perpetrator Sex"], y_pred=y_sex_pred))

0.8940921437526118


4. Multi-Linear Regression

In [28]:
# Ordinary least squares on the integer-coded sex label. The printed value is
# R^2 (r2_score), not accuracy, so it is on a different scale from the
# classifier scores above.
modelsex = LinearRegression().fit(x_train, y_train["Perpetrator Sex"])
y_sex_pred = modelsex.predict(x_test)
print(metrics.r2_score(y_true=y_test["Perpetrator Sex"], y_pred=y_sex_pred))

0.02924465610275273


The best model in this case is the Random Forest

#### Predicting Perpetrator's Race

1. Logistic Regression

In [29]:
# Logistic regression on perpetrator race; prints accuracy on the held-out set.
loregrace = LogisticRegression(
    solver="lbfgs",
    multi_class="auto",
    dual=False,
    max_iter=1000,
).fit(x_train, y_train["Perpetrator Race"])
y_race_pred = loregrace.predict(x_test)
print(metrics.accuracy_score(y_true=y_test["Perpetrator Race"], y_pred=y_race_pred))

0.8605307145842039




2. Random Forest

In [30]:
# Random forest on perpetrator race; prints accuracy on the held-out set.
# random_state pins the forest's bootstrap/feature sampling so the score is
# reproducible on re-run (6 mirrors the seed used for train_test_split).
clfrace = RandomForestClassifier(n_estimators=100, random_state=6)
clfrace.fit(x_train, y_train["Perpetrator Race"])
y_race_pred = clfrace.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Race"], y_race_pred))

0.869384663602173


3. K-nearest neighbors

In [31]:
# 20-nearest-neighbours classifier on perpetrator race; prints held-out accuracy.
knnrace = KNeighborsClassifier(n_neighbors=20).fit(
    x_train, y_train["Perpetrator Race"]
)
y_race_pred = knnrace.predict(x_test)
print(metrics.accuracy_score(y_true=y_test["Perpetrator Race"], y_pred=y_race_pred))

0.6812055996656916


4. Multi-Linear Regression

In [32]:
# Ordinary least squares on the integer-coded race label. The printed value is
# R^2 (r2_score), not accuracy, so it is on a different scale from the
# classifier scores above.
modelrace = LinearRegression().fit(x_train, y_train["Perpetrator Race"])
y_race_pred = modelrace.predict(x_test)
print(metrics.r2_score(y_true=y_test["Perpetrator Race"], y_pred=y_race_pred))

0.49603926329148706


Once again, Random Forest is the best model.


#### Predicting Perpetrator's Ethnicity

1. Logistic Regression

In [33]:
# Logistic regression on perpetrator ethnicity; prints held-out accuracy.
loregethnicity = LogisticRegression(
    solver="lbfgs",
    multi_class="auto",
    dual=False,
    max_iter=1000,
).fit(x_train, y_train["Perpetrator Ethnicity"])
y_ethnicity_pred = loregethnicity.predict(x_test)
print(
    metrics.accuracy_score(
        y_true=y_test["Perpetrator Ethnicity"],
        y_pred=y_ethnicity_pred,
    )
)

0.9018230254910155


2. Random Forest

In [34]:
# Random forest on perpetrator ethnicity; prints accuracy on the held-out set.
# random_state pins the forest's bootstrap/feature sampling so the score is
# reproducible on re-run (6 mirrors the seed used for train_test_split).
clfethnicity = RandomForestClassifier(n_estimators=100, random_state=6)
clfethnicity.fit(x_train, y_train["Perpetrator Ethnicity"])
y_ethnicity_pred = clfethnicity.predict(x_test)
print(metrics.accuracy_score(y_test["Perpetrator Ethnicity"], y_ethnicity_pred))

0.8972262849979106


3. K-nearest neighbors

In [35]:
# 20-nearest-neighbours classifier on perpetrator ethnicity; prints held-out accuracy.
knnethnicity = KNeighborsClassifier(n_neighbors=20).fit(
    x_train, y_train["Perpetrator Ethnicity"]
)
y_ethnicity_pred = knnethnicity.predict(x_test)
print(
    metrics.accuracy_score(
        y_true=y_test["Perpetrator Ethnicity"],
        y_pred=y_ethnicity_pred,
    )
)

0.7983702465524446


4. Multi-linear Regression

In [36]:
# Ordinary least squares on the integer-coded ethnicity label. The printed
# value is R^2 (r2_score), not accuracy, so it is on a different scale from
# the classifier scores above.
modelethnicity = LinearRegression().fit(x_train, y_train["Perpetrator Ethnicity"])
y_ethnicity_pred = modelethnicity.predict(x_test)
print(
    metrics.r2_score(
        y_true=y_test["Perpetrator Ethnicity"],
        y_pred=y_ethnicity_pred,
    )
)

0.4985478647281222


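Here the accuracy of logistic regression (0.902) is slightly higher than that of the random forest (0.897). Note that `max_iter=1000` only caps the number of solver iterations; logistic regression is still trained on the whole training set, so the iteration limit does not explain the difference. The gap is small (about 0.005), so the two models perform about equally well on this target.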

#### Predicting perpetrator's Age

See the next part.