Steps:
1. Input:
- Area name
- Time: morning, afternoon, evening, night
- Day: weekday, weekend
- Address?

2. Risk level: Low, Medium, or High
- Number of crimes
- Number of Part 1 or Part 2 crimes


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


# Load the cleaned crime extract and derive the calendar columns used below:
# hour / day_of_week / month feed the model, year_month is the aggregation key.
crime_lapd = pd.read_csv("crime_20_24_clean.csv")
crime_lapd['date_occ'] = pd.to_datetime(crime_lapd['date_occ'])

# NOTE(review): 'hour' is only informative if date_occ carries a time-of-day
# component -- confirm against the CSV.
_occurred = crime_lapd['date_occ'].dt
crime_lapd = crime_lapd.assign(
    hour=_occurred.hour,
    day_of_week=_occurred.dayofweek,      # pandas convention: Monday=0 ... Sunday=6
    month=_occurred.month,
    year_month=_occurred.to_period('M'),  # monthly Period (year + month)
)

# Count incidents per (area, calendar month): all incidents, and the subset
# whose 'part_1-2' column equals 1.
_group_keys = ['area_name', 'year_month']
total_crimes = crime_lapd.groupby(_group_keys).size()
_is_part1 = crime_lapd['part_1-2'] == 1
part1_crimes = crime_lapd[_is_part1].groupby(_group_keys).size()

# Align both counts on the (area, month) index; a group with no part-1
# incidents is missing from part1_crimes, so its NaN is replaced by 0.
risk_df = pd.DataFrame({
    'total_crimes': total_crimes,
    'part1_crimes': part1_crimes,
})
risk_df = risk_df.fillna(0)

# Risk score = total incidents plus twice the part-1 incidents.
risk_df['risk_score'] = risk_df['part1_crimes'] * 2 + risk_df['total_crimes']

# Cut-offs between Low/Medium and Medium/High: the 33rd and 66th percentiles
# of the score across all (area, month) groups.
_scores = risk_df['risk_score']
low = _scores.quantile(0.33)
mid = _scores.quantile(0.66)
def risk_level(score):
    """Map a numeric risk score to 'Low', 'Medium' or 'High'.

    Uses the module-level cut-offs ``low`` and ``mid``: scores up to and
    including ``low`` are 'Low', scores up to and including ``mid`` are
    'Medium', and everything else is 'High'.
    """
    # Walk the cut-offs in ascending order; the first one the score does not
    # exceed decides the label.
    for threshold, label in ((low, 'Low'), (mid, 'Medium')):
        if score <= threshold:
            return label
    return 'High'

# Label every (area, month) group from its score, then turn the group keys
# back into ordinary columns so they can be used as merge keys.
risk_df['risk_level'] = risk_df['risk_score'].map(risk_level)
risk_df.reset_index(inplace=True)

# Attach the group label to each incident row. Every incident in the same
# area and month therefore receives the same risk_level.
crime_lapd = pd.merge(
    crime_lapd,
    risk_df.loc[:, ['area_name', 'year_month', 'risk_level']],
    how='left',
    on=['area_name', 'year_month'],
)


# Train on a reproducible random subset of at most 500,000 incidents.
# DataFrame.sample draws without replacement by default and raises ValueError
# when n exceeds the number of rows, so cap n at the size of the frame.
sample_size = min(500_000, len(crime_lapd))
crime_sample = crime_lapd.sample(n=sample_size, random_state=42)

# Features: area plus calendar fields; target: the per-(area, month) risk label.
X = crime_sample[['area_name', 'hour', 'day_of_week', 'month']]
y = crime_sample['risk_level']

# Hold out 20% of the sample for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# All four inputs are treated as categories and one-hot encoded.
# handle_unknown='ignore' keeps predict() from raising on a category that was
# not present when the encoder was fitted.
categorical_features = ['area_name', 'hour', 'day_of_week', 'month']
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Random forest with class_weight='balanced' to compensate for the unequal
# number of Low / Medium / High rows; fixed seed, all CPU cores.
_forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=20,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Encoding and classifier live in one pipeline so the fitted object accepts
# the raw feature columns directly.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', _forest),
])
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

        High       0.81      0.84      0.82     41502
         Low       0.79      0.71      0.75     26346
      Medium       0.63      0.65      0.64     32152

    accuracy                           0.75    100000
   macro avg       0.74      0.73      0.74    100000
weighted avg       0.75      0.75      0.75    100000



In [25]:
import joblib

# Persist the fitted pipeline (encoder + classifier) to disk.
joblib.dump(value=model, filename="crime_risk_model.pkl")

['crime_risk_model.pkl']