In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns


# ---- Load the Chicago crime feature table ----
df = pd.read_csv("../dataset/chicago_crime_features.csv")

# ---- Keep only the 10 most frequent crime types ----
top_crimes = df['Crime_Type'].value_counts().nlargest(10).index
df = df[df['Crime_Type'].isin(top_crimes)]

# ---- Select features ----
features = [
    'Latitude', 'Longitude', 'Zone', 'Hour', 'DayOfWeek',
    'IsWeekend', 'Light_Condition', 'Temperature', 'Rain'
]

# ---- Balance classes ----
# Every class (including the largest one) is resampled with replacement to the
# size of the largest class. The target size does not change between classes,
# so it is computed once here instead of once per class inside the loop.
# NOTE(review): oversampling happens before any train/test split; if X / y are
# split later, duplicated rows can end up on both sides — confirm downstream.
max_class_size = df['Crime_Type'].value_counts().max()
df_balanced = pd.concat([
    resample(df[df['Crime_Type'] == c],
             replace=True,
             n_samples=max_class_size,
             random_state=42)
    for c in top_crimes
])

# X and y are taken from the balanced frame only. An earlier version also
# built and encoded X / y from the unbalanced frame first, but those values
# were overwritten here without ever being used, so that pass was removed.
X = df_balanced[features].copy()
y = df_balanced['Crime_Type']

# ---- Encode categorical feature columns (object dtype) ----
# Each fitted encoder is kept in feature_encoders so the string -> integer
# mapping of every column stays available after the loop; `le` alone only
# holds the encoder of the last object column.
feature_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        feature_encoders[col] = le

# ---- Encode target ----
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

In [3]:
# Show which integer code the target encoder assigned to each crime type:
# code i corresponds to target_le.classes_[i].
mapping = {code: label for code, label in enumerate(target_le.classes_)}
print(mapping)

{0: 'ASSAULT', 1: 'BATTERY', 2: 'BURGLARY', 3: 'CRIMINAL DAMAGE', 4: 'DECEPTIVE PRACTICE', 5: 'MOTOR VEHICLE THEFT', 6: 'OTHER OFFENSE', 7: 'ROBBERY', 8: 'THEFT', 9: 'WEAPONS VIOLATION'}


In [5]:
import joblib

# Write the fitted target LabelEncoder to disk so the integer codes can be
# mapped back to crime-type names outside this notebook.
encoder_path = "../models/label_encoder.pkl"
joblib.dump(target_le, encoder_path)
print("Label Encoder saved.")


Label Encoder saved.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Load the cleaned dataset stored in clean_sf.csv ----
# (The previous header said "Chicago", but this cell reads clean_sf.csv.)
df = pd.read_csv("../dataset/clean_sf.csv")  # adjust path


# ---- Keep top 10 most frequent crimes ----
top_crimes = df['Crime_Type'].value_counts().nlargest(10).index
df = df[df['Crime_Type'].isin(top_crimes)]

# ---- Select features ----
features = [
    'Latitude', 'Longitude', 'Zone', 'Hour', 'DayOfWeek',
    'IsWeekend', 'Light_Condition', 'Temperature', 'Rain'
]

# ---- Balance classes ----
# Every class (including the largest one) is resampled with replacement to the
# size of the largest class. The target size does not change between classes,
# so it is computed once here instead of once per class inside the loop.
# NOTE(review): oversampling happens before any train/test split; if X / y are
# split later, duplicated rows can end up on both sides — confirm downstream.
max_class_size = df['Crime_Type'].value_counts().max()
df_balanced = pd.concat([
    resample(df[df['Crime_Type'] == c],
             replace=True,
             n_samples=max_class_size,
             random_state=42)
    for c in top_crimes
])

# X and y are taken from the balanced frame only. An earlier version also
# built and encoded X / y from the unbalanced frame first, but those values
# were overwritten here without ever being used, so that pass was removed.
X = df_balanced[features].copy()
y = df_balanced['Crime_Type']

# ---- Encode categorical feature columns (object dtype) ----
# Each fitted encoder is kept in feature_encoders so the string -> integer
# mapping of every column stays available after the loop; `le` alone only
# holds the encoder of the last object column.
feature_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        feature_encoders[col] = le

# ---- Encode target ----
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

In [8]:
# Show which integer code the target encoder assigned to each crime type:
# code i corresponds to target_le.classes_[i].
mapping = {code: label for code, label in enumerate(target_le.classes_)}
print(mapping)

{0: 'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT', 1: 'BATTERY - SIMPLE ASSAULT', 2: 'BURGLARY', 3: 'BURGLARY FROM VEHICLE', 4: 'INTIMATE PARTNER - SIMPLE ASSAULT', 5: 'THEFT OF IDENTITY', 6: 'THEFT PLAIN - PETTY ($950 & UNDER)', 7: 'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)', 8: 'VANDALISM - MISDEAMEANOR ($399 OR UNDER)', 9: 'VEHICLE - STOLEN'}


In [9]:
import joblib

# Write the fitted target LabelEncoder for the clean_sf.csv data to disk so
# the integer codes can be mapped back to crime-type names elsewhere.
encoder_path = "../models/sf_label_encoder.pkl"
joblib.dump(target_le, encoder_path)
print("Label Encoder saved.")


Label Encoder saved.


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import resample
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Load the Boston crime feature table ----
# (The previous header said "Chicago", but this cell reads
# boston_crime_features.csv.)
df = pd.read_csv("../dataset/boston_crime_features.csv")  # adjust path


# ---- Keep top 10 most frequent crimes ----
top_crimes = df['Crime_Type'].value_counts().nlargest(10).index
df = df[df['Crime_Type'].isin(top_crimes)]

# ---- Select features ----
features = [
    'Latitude', 'Longitude', 'Zone', 'Hour', 'DayOfWeek',
    'IsWeekend', 'Light_Condition', 'Temperature', 'Rain'
]

# ---- Balance classes ----
# Every class (including the largest one) is resampled with replacement to the
# size of the largest class. The target size does not change between classes,
# so it is computed once here instead of once per class inside the loop.
# NOTE(review): oversampling happens before any train/test split; if X / y are
# split later, duplicated rows can end up on both sides — confirm downstream.
max_class_size = df['Crime_Type'].value_counts().max()
df_balanced = pd.concat([
    resample(df[df['Crime_Type'] == c],
             replace=True,
             n_samples=max_class_size,
             random_state=42)
    for c in top_crimes
])

# X and y are taken from the balanced frame only. An earlier version also
# built and encoded X / y from the unbalanced frame first, but those values
# were overwritten here without ever being used, so that pass was removed.
X = df_balanced[features].copy()
y = df_balanced['Crime_Type']

# ---- Encode categorical feature columns (object dtype) ----
# Each fitted encoder is kept in feature_encoders so the string -> integer
# mapping of every column stays available after the loop; `le` alone only
# holds the encoder of the last object column.
feature_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        feature_encoders[col] = le

In [11]:
# ---- Encode target ----
# Fit a LabelEncoder on y (the Crime_Type column of the balanced frame) and
# keep both the fitted encoder (target_le, saved in a later cell) and the
# integer-coded labels (y_encoded).
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)

In [12]:
import joblib

# Write the fitted Boston target LabelEncoder to disk so the integer codes
# can be mapped back to crime-type names elsewhere.
encoder_path = "../models/boston_label_encoder.pkl"
joblib.dump(target_le, encoder_path)
print("Label Encoder saved.")

Label Encoder saved.
