In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [12]:
# Load the raw incident records (path is relative to the notebook's working directory).
df = pd.read_csv('dataset/Hate_Crimes_2017-2025.csv')

# Widen the console display so the plain-text preview below is not wrapped.
pd.set_option('display.width', 1000)
print(df.head())

# Remove columns that are not used as model features.
df = df.drop(columns=[
    "Incident Number",
    "Date of Incident",
    "Zip Code",
    "Council District"
])


# Clean column names: trim whitespace, use underscores, and spell out "<" / ">".
# regex=False makes the literal (non-regex) replacement explicit, so the result
# does not depend on the pandas version's default for `regex`.
df.columns = (
    df.columns.str.strip()
    .str.replace(" ", "_", regex=False)
    .str.replace("<", "lt", regex=False)
    .str.replace(">", "gt", regex=False)
)

# Group the listed bias categories into two broader labels: "Racism" and "Transphobia".
# Labels that do not appear in this mapping are left unchanged and therefore remain
# as separate target classes.
# NOTE(review): religion-based biases are mapped to "Racism" and sexual-orientation
# biases to "Transphobia" — confirm this coarse grouping is intended.
df['Bias'] = df['Bias'].replace({
    'Anti-Black or African American': 'Racism',
    'Anti-Black': 'Racism',
    'Anti-Hispanic': 'Racism',
    'Anti-Gay': 'Transphobia',
    'Anti-Gay (Male)': 'Transphobia',
    'Anti-Lesbian/Gay/Bisexual/Transgender (Mixed Group)': 'Transphobia',
    'Anti-Transgender': 'Transphobia',
    'Anti-Hispanic or Latino': 'Racism',
    'Anti-Islamic (Muslim)': 'Racism',
    'Anti-Jewish': 'Racism',
    'Anti-Arab': 'Racism',
    'Anti-Asian': 'Racism',
    'Anti-Lesbian/Bisexual/Transgender (Mixed Group)': 'Transphobia',
    'Anti-Other Race/Ethnicity/Ancestry': 'Racism',
    'Anti-Religion (Other)': 'Racism',
    'Anti-White': 'Racism',
})

# Rows without a label cannot be used for supervised training or evaluation;
# drop them so a missing target does not make the fit fail.
df = df.dropna(subset=["Bias"])

# Define features and label
X = df.drop(columns=["Bias"])
y = df["Bias"]

# Identify the categorical (object-dtype) feature columns; every other column
# is passed through to the classifier unchanged.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Preprocessing pipeline: one-hot encode the categorical columns. Categories seen
# only at prediction time are ignored (encoded as all zeros) instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
    ],
    remainder="passthrough"
)

# Full model pipeline: preprocessing followed by a class-weighted random forest
# with a fixed seed for reproducibility.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42, class_weight="balanced"))
])

# Train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model.fit(X_train, y_train)

# Evaluate: predict once and reuse the predictions for both metrics.
y_pred = model.predict(X_test)

print("\nModel Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


  Month Incident Number        Date of Incident Day of Week Number of Victims under 18 Number of Victims over 18 Number of Offenders under 18 Number of Offenders over 18             Race/Ethnicity of Offenders          Offense(s)                    Offense Location                            Bias  Zip Code APD Sector  Council District
0   Jan     2017-241137  01/01/2017 12:00:00 AM         Sun                          0                         1                            0                           1                      White/Not Hispanic  Aggravated Assault                     Park/Playground  Anti-Black or African American   78704.0      Henry               9.0
1   Feb     2017-580344  02/01/2017 12:00:00 AM         Wed                          0                         1                            0                           1  Black or African American/Not Hispanic  Aggravated Assault  Highway/Road/Alley/Street/Sidewalk                      Anti-White   78702.0    Charlie        