In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from imblearn.over_sampling import SMOTENC

In [8]:
# Load the cleaned hate-crime table from the parent directory.
df = pd.read_csv(filepath_or_buffer="../cleaned_hate_crime.csv")

In [9]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,region_name,offender_race,grouped_total_offender_count,grouped_victim_count,generalized_offense_name,generalized_location_name,generalized_bias_desc
0,Midwest,White,Few,Few,violent crimes,Residence,race
1,Northeast,White,Few,Few,violent crimes,Miscellaneous,race
2,Northeast,White,Few,Few,violent crimes,Miscellaneous,race
3,South,Black or African American,Several,Few,violent crimes,Public Place,race
4,Northeast,Black or African American,Few,Few,violent crimes,Miscellaneous,race


In [10]:
# Feature matrix: the six categorical descriptors of each incident.
X_xgb = df[
    [
        "region_name",
        "offender_race",
        "grouped_total_offender_count",
        "grouped_victim_count",
        "generalized_offense_name",
        "generalized_location_name",
    ]
]

# Target: the generalized bias category of each incident.
Y_xgb = df["generalized_bias_desc"]

In [11]:
# Categorical feature columns to one-hot encode.
cat_vars = [
    "region_name",
    "offender_race",
    "grouped_total_offender_count",
    "grouped_victim_count",
    "generalized_offense_name",
    "generalized_location_name",
]

# Encode every column in a single call, reading from ``df`` rather than
# mutating ``X_xgb`` in place. The previous loop dropped the source columns
# from ``X_xgb`` as it went, so running the cell a second time raised a
# KeyError. ``get_dummies`` names each indicator ``<column>_<value>`` and
# emits them in ``cat_vars`` order, matching the loop's output.
X_xgb = pd.get_dummies(df[cat_vars], columns=cat_vars)

In [12]:
# Preview the first five rows of the one-hot encoded feature matrix.
X_xgb.head(n=5)

Unnamed: 0,region_name_Midwest,region_name_Northeast,region_name_Other,region_name_South,region_name_U.S. Territories,region_name_West,offender_race_American Indian or Alaska Native,offender_race_Asian,offender_race_Black or African American,offender_race_Multiple,...,generalized_offense_name_sexual crimes,generalized_offense_name_violent crimes,generalized_location_name_Construction/Industrial,generalized_location_name_Education,generalized_location_name_Law Enforcement,generalized_location_name_Miscellaneous,generalized_location_name_Outdoor/Nature,generalized_location_name_Public Place,generalized_location_name_Residence,generalized_location_name_Retail
0,True,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,True,False
1,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,...,False,True,False,False,False,True,False,False,False,False
3,False,False,False,True,False,False,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False
4,False,True,False,False,False,False,False,False,True,False,...,False,True,False,False,False,True,False,False,False,False


In [13]:
# Encode the generalized bias description as integer class ids 0, 1, 2.
# An explicit mapping replaces the chained ``replace`` calls, which relied on
# pandas implicitly downcasting an object column to integers (deprecated in
# recent pandas releases) and silently passed unknown labels through as strings.
bias_to_class = {"race": 0, "religion": 1, "sexual orientation": 2}

# Re-derive from ``df`` so the cell yields the same result on every run.
Y_xgb = df["generalized_bias_desc"].map(bias_to_class)

# ``map`` turns any label missing from the mapping into NaN; fail loudly here
# instead of handing non-integer targets to the classifier.
unmapped = df.loc[Y_xgb.isna(), "generalized_bias_desc"].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected generalized_bias_desc values: {list(unmapped)}")

Y_xgb = Y_xgb.astype("int64")

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified although the classes are
# imbalanced -- consider passing stratify=Y_xgb (this would change the results).
X_train, X_test, y_train, y_test = train_test_split(
    X_xgb,
    Y_xgb,
    test_size=0.2,
    random_state=69,
)

In [21]:
# Gradient-boosted tree classifier with a soft-probability multi-class
# objective over three classes, trees up to depth 10, and a fixed seed.
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    max_depth=10,
    random_state=69,
)

# Fit on the training split; as the last expression, the fitted estimator is
# also displayed by the notebook.
xgb_model.fit(X_train, y_train)

In [25]:
# Predict class ids for the held-out rows and report overall accuracy.
# Accuracy is the fraction of exact matches across all classes, so it should
# be read alongside per-class metrics when the classes are imbalanced.
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accruacy: 0.7152793443116023


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      1.00      0.83     19004
           1       0.56      0.03      0.06      2310
           2       0.40      0.00      0.01      5284

    accuracy                           0.72     26598
   macro avg       0.56      0.34      0.30     26598
weighted avg       0.64      0.72      0.60     26598

