In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
import optuna
from sklearn.model_selection import cross_val_score
import xgboost as xgb
from imblearn.over_sampling import SMOTE


pd.options.display.max_columns = 400
pd.options.display.max_rows = 400

crimes = '/Users/oliverbohler/Desktop/Springboard/capstone project 2 violent crime in los angeles/Crime_LA_2010_2023.csv'
crime = pd.read_csv(crimes)
crime.head()

Unnamed: 0,Date Occured,Time Occured,Area Name,Zone,Crime Code,year,month,day,Weekday,Crime Code Description,Vict Age,Crime Category,Vict Sex,Vict Descent,Premis Code,Premis Group,Premis Desc,Weapon Category,Weapon Used Cd,Weapon Desc,Status,Status Desc,Location,Lat,Lon
0,2010-01-02,21:00,Central,Hollywood/Greater Downtown Area,122,2010,1,2,Saturday,"RAPE, ATTEMPTED",47.0,Sexual Offenses,F,H,103.0,Street/Outdoor,ALLEY,Strong-Arm,400,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,8TH ST,34.0387,-118.2488
1,2010-01-08,21:00,Central,Hollywood/Greater Downtown Area,230,2010,1,8,Friday,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",51.0,Assault & Battery,M,B,710.0,Other Premise,OTHER PREMISE,Other Object,500,UNKNOWN WEAPON/OTHER WEAPON,AA,Adult Arrest,500 CROCKER ST,34.0435,-118.2427
2,2010-01-09,02:30,Central,Hollywood/Greater Downtown Area,230,2010,1,9,Saturday,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",30.0,Assault & Battery,M,H,108.0,Street/Outdoor,PARKING LOT,Strong-Arm,400,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,800 W OLYMPIC BL,34.045,-118.264
3,2010-01-14,14:45,Central,Hollywood/Greater Downtown Area,624,2010,1,14,Thursday,BATTERY - SIMPLE ASSAULT,38.0,Assault & Battery,F,B,101.0,Street/Outdoor,STREET,Strong-Arm,400,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",IC,Invest Cont,900 N BROADWAY,34.064,-118.2375
4,2010-01-14,20:00,Central,Hollywood/Greater Downtown Area,210,2010,1,14,Thursday,ROBBERY,40.0,Robbery & Burglary,M,H,101.0,Street/Outdoor,STREET,Strong-Arm,400,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,ALAMEDA ST,34.035,-118.2386


In [20]:
print(type(crime))

<class 'pandas.core.frame.DataFrame'>


In [21]:
crime['Status Desc'].value_counts()

Status Desc
Invest Cont     447698
Adult Other     194436
Adult Arrest    139635
Juv Arrest        9367
Juv Other         3953
UNK                  7
Name: count, dtype: int64

In [22]:
main_label = 'Status Desc'
remove_status = ['Juv Arrest', 'Juv Other', 'UNK']

crime = crime[~crime[main_label].isin(remove_status)]

crime[main_label] = crime[main_label].apply(lambda x: 1 if x in ['Adult Arrest', 'Adult Other'] else 0).astype(int)

crime['Vict Age'] = crime['Vict Age'].apply(lambda x: 5 * round(1 / 5 * x))

cols2drop = ['Crime Code', 'Date Occured', 'Weekday', 'Area Name', 'Time Occured', 'Crime Code Description',
             'Premis Code', 'Premis Desc', 'Weapon Used Cd', "Weapon Desc", 'Status',
             'Location', 'Lat', 'Lon']

crime = crime.drop(cols2drop, axis=1)

y = crime[main_label].values.reshape(-1,)
X = crime.drop([main_label], axis=1)

cat_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0, stratify=y)


In [23]:
# add class weights to handle imbalanced classes
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
print(class_weights)

{0: 0.8730993025424534, 1: 1.1700628982589791}


In [24]:
model = RandomForestClassifier(n_estimators = 100, random_state=42, class_weight = class_weights)
model.fit(X_train, y_train)

accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)


y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC score: {roc_auc:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.65
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.74      0.71     89540
           1       0.60      0.53      0.56     66814

    accuracy                           0.65    156354
   macro avg       0.64      0.64      0.64    156354
weighted avg       0.65      0.65      0.65    156354

Confusion Matrix:
[[66410 23130]
 [31439 35375]]


In [25]:
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print(f"Original training set shape: {X_train.shape}, {y_train.shape}")
print(f"Resampled training set shape: {X_train_res.shape}, {y_train_res.shape}")

Original training set shape: (625415, 22), (625415,)
Resampled training set shape: (716316, 22), (716316,)


In [26]:
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42
)

model.fit(X_train_res, y_train_res)

y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC score: {roc_auc:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Accuracy: 0.67
ROC AUC score: 0.7289
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.71      0.71     89540
           1       0.61      0.62      0.62     66814

    accuracy                           0.67    156354
   macro avg       0.66      0.67      0.66    156354
weighted avg       0.67      0.67      0.67    156354

Confusion Matrix:
[[63593 25947]
 [25402 41412]]


In [27]:
from imblearn.pipeline import Pipeline
def objective(trial):
    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 5, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    class_weight = trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    # Create the RandomForest model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        class_weight=class_weight,
        bootstrap=bootstrap,
        random_state=0
    )
    
    # Create a pipeline with SMOTE and the model
    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('model', model)
    ])

    # Evaluate the model using cross-validation with SMOTE
    roc_auc = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc').mean()
    
    return roc_auc

# Create a study object
study = optuna.create_study(direction='maximize')

# Start the optimization
study.optimize(objective, n_trials=25)  # Increase the number of trials

# Print the best parameters found
print("Best parameters found: ", study.best_params)
print("Best ROC AUC score: ", study.best_value)


[I 2024-09-03 10:42:24,737] A new study created in memory with name: no-name-6eb401f6-9f9d-49cd-9604-b73a8d7163c0
[I 2024-09-03 10:50:19,595] Trial 0 finished with value: 0.7231776697895226 and parameters: {'n_estimators': 137, 'max_depth': 29, 'min_samples_split': 17, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'class_weight': None, 'bootstrap': False}. Best is trial 0 with value: 0.7231776697895226.
[I 2024-09-03 11:19:06,777] Trial 1 finished with value: 0.726820742207512 and parameters: {'n_estimators': 233, 'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 5, 'max_features': None, 'class_weight': 'balanced', 'bootstrap': True}. Best is trial 1 with value: 0.726820742207512.
[I 2024-09-03 11:34:32,202] Trial 2 finished with value: 0.7129814966852782 and parameters: {'n_estimators': 146, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3, 'max_features': None, 'class_weight': 'balanced_subsample', 'bootstrap': False}. Best is trial 1 with value: 0.7268207422

Best parameters found:  {'n_estimators': 498, 'max_depth': 21, 'min_samples_split': 12, 'min_samples_leaf': 8, 'max_features': 'sqrt', 'class_weight': None, 'bootstrap': True}
Best ROC AUC score:  0.7341695758517015


In [28]:
best_params = study.best_params

# Retrain the model with the best parameters on the resampled data
model = RandomForestClassifier(**best_params, random_state=0)
model.fit(X_train_res, y_train_res)

y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

roc_auc_test = roc_auc_score(y_test, y_test_prob)
accuracy_test = accuracy_score(y_test, y_test_pred)

print(f"Test ROC AUC score: {roc_auc_test:.4f}")
print(f"Accuracy: {accuracy_test:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Test ROC AUC score: 0.7367
Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.71      0.72     89540
           1       0.62      0.63      0.63     66814

    accuracy                           0.68    156354
   macro avg       0.67      0.67      0.67    156354
weighted avg       0.68      0.68      0.68    156354

Confusion Matrix:
[[63502 26038]
 [24391 42423]]


In [29]:
model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train_res, y_train_res)

y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC score: {roc_auc:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Accuracy: 0.68
ROC AUC score: 0.7341
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.75      0.73     89540
           1       0.63      0.59      0.61     66814

    accuracy                           0.68    156354
   macro avg       0.67      0.67      0.67    156354
weighted avg       0.68      0.68      0.68    156354

Confusion Matrix:
[[66796 22744]
 [27429 39385]]


In [30]:
train_pool = Pool(X_train_res, y_train_res)
test_pool = Pool(X_test, y_test)

# Define the CatBoost model
model = CatBoostClassifier(
    iterations=350,
    depth=5,
    border_count=22,
    l2_leaf_reg=0.3,
    verbose=100  
)

model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

y_test_pred = model.predict(test_pool)
y_test_prob = model.predict_proba(test_pool)[:, 1]

roc_auc_test = roc_auc_score(y_test, y_test_prob)
accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test ROC AUC score: {roc_auc_test:.4f}")
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))


0:	learn: 0.6890092	test: 0.6890348	best: 0.6890348 (0)	total: 50.7ms	remaining: 17.7s
100:	learn: 0.5958910	test: 0.6145881	best: 0.6145881 (100)	total: 3.82s	remaining: 9.41s
200:	learn: 0.5683794	test: 0.6053789	best: 0.6053789 (200)	total: 7.56s	remaining: 5.61s
300:	learn: 0.5545970	test: 0.6013192	best: 0.6013192 (300)	total: 11.3s	remaining: 1.83s
349:	learn: 0.5510708	test: 0.5999993	best: 0.5999993 (349)	total: 13s	remaining: 0us

bestTest = 0.5999992545
bestIteration = 349

Test ROC AUC score: 0.7309
Accuracy: 0.68
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.75      0.73     89540
           1       0.64      0.58      0.60     66814

    accuracy                           0.68    156354
   macro avg       0.67      0.67      0.67    156354
weighted avg       0.68      0.68      0.68    156354

Confusion Matrix:
[[67562 21978]
 [28351 38463]]


In [31]:
lr_model = LogisticRegression(random_state=42, max_iter=1000)

lr_cv_scores = cross_val_score(lr_model, X_train_res, y_train_res, cv=5)
print(f"Logistic Regression Cross-validation scores: {lr_cv_scores}")
print(f"Mean Logistic Regression cross-validation score: {lr_cv_scores.mean():.2f}")

lr_model.fit(X_train_res, y_train_res)

y_test_pred = lr_model.predict(X_test)
y_test_prob = lr_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC score: {roc_auc:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Logistic Regression Cross-validation scores: [0.65374414 0.65242247 0.6537836  0.65411865 0.65581483]
Mean Logistic Regression cross-validation score: 0.65
Accuracy: 0.66
ROC AUC score: 0.7057
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.66      0.69     89540
           1       0.59      0.65      0.62     66814

    accuracy                           0.66    156354
   macro avg       0.65      0.65      0.65    156354
weighted avg       0.66      0.66      0.66    156354

Confusion Matrix:
[[59303 30237]
 [23641 43173]]


In [32]:
gb_model = GradientBoostingClassifier(n_estimators=10, random_state=42)

gb_cv_scores = cross_val_score(gb_model, X_train_res, y_train_res, cv=5)
print(f"Gradient Boosting Cross-validation scores: {gb_cv_scores}")
print(f"Mean Gradient Boosting cross-validation score: {gb_cv_scores.mean():.2f}")

gb_model.fit(X_train_res, y_train_res)

y_test_pred = gb_model.predict(X_test)
y_test_prob = gb_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_test_pred)
roc_auc = roc_auc_score(y_test, y_test_prob)

print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC score: {roc_auc:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_test_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

Gradient Boosting Cross-validation scores: [0.65389072 0.65053782 0.65314841 0.65553562 0.65666641]
Mean Gradient Boosting cross-validation score: 0.65
Accuracy: 0.66
ROC AUC score: 0.7049
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69     89540
           1       0.59      0.64      0.61     66814

    accuracy                           0.66    156354
   macro avg       0.65      0.65      0.65    156354
weighted avg       0.66      0.66      0.66    156354

Confusion Matrix:
[[60070 29470]
 [24298 42516]]


In [33]:
knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train_res, y_train_res)

y_test_pred_knn = knn.predict(X_test)
y_test_prob_knn = knn.predict_proba(X_test)[:, 1]

accuracy_knn = accuracy_score(y_test, y_test_pred_knn)
roc_auc_knn = roc_auc_score(y_test, y_test_prob_knn)

print(f"k-NN Accuracy: {accuracy_knn:.2f}")
print(f"k-NN ROC AUC Score: {roc_auc_knn:.4f}")
print("Classification Report for k-NN:\n", classification_report(y_test, y_test_pred_knn))
print("Confusion Matrix for k-NN:\n", confusion_matrix(y_test, y_test_pred_knn))

k-NN Accuracy: 0.62
k-NN ROC AUC Score: 0.6613
Classification Report for k-NN:
               precision    recall  f1-score   support

           0       0.69      0.63      0.66     89540
           1       0.55      0.61      0.58     66814

    accuracy                           0.62    156354
   macro avg       0.62      0.62      0.62    156354
weighted avg       0.63      0.62      0.62    156354

Confusion Matrix for k-NN:
 [[56276 33264]
 [25817 40997]]
