In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    confusion_matrix, classification_report,
    f1_score, recall_score, roc_auc_score,
    precision_recall_curve
)
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

df = pd.read_csv('../data/filtered_events_country_code.csv')

In [2]:
df['violent'] = (df['event_type'] == 'Riots').astype(int)
df['event_date'] = pd.to_datetime(df['event_date'])

df['month'] = df['event_date'].dt.month
df['is_weekend'] = df['event_date'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

df = df.sort_values('event_date').reset_index(drop=True)
df["event_id_prefix"] = df["event_id_cnty"].astype(str).str[:3]

columns_for_violence_rates = ['event_id_prefix', 'assoc_actor_1']

for col in columns_for_violence_rates:
    df[f'{col}_total'] = df.groupby(col).cumcount()
    df[f'{col}_violent_sum'] = df.groupby(col)['violent'].cumsum().shift(1).fillna(0)
    df[f'{col}_violence_rate'] = df[f'{col}_violent_sum'] / df[f'{col}_total'].replace(0, np.nan)
    df[f'{col}_violence_rate'] = df[f'{col}_violence_rate'].fillna(0)

categorical_features = ['event_id_prefix', 'assoc_actor_1']
numerical_features = ['year', 'month', 'is_weekend']

violence_rate_features = [f'{col}_violence_rate' for col in columns_for_violence_rates]
for feature in violence_rate_features:
    numerical_features.append(feature)

features = categorical_features + numerical_features
missing_features = [f for f in features if f not in df.columns]

X = df[features]
y = df['violent']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

existing_categorical_features = [f for f in categorical_features if f in X.columns]
existing_numerical_features = [f for f in numerical_features if f in X.columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), existing_categorical_features),
        ('num', StandardScaler(), existing_numerical_features)
    ],
    remainder='drop'
)
"""
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weights))
"""
class_weights = {0: 1, 1: 10}
models = {
    'XGBoost_Optimized': XGBClassifier(
        n_estimators=2000,
        max_depth=10,
        learning_rate=0.04,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=class_weights[0] / class_weights[1],
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1,
    ),
    'Random_Forest_Optimized': RandomForestClassifier(
        n_estimators=2000,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight={0: 1, 1: 1000},
        random_state=42,
        n_jobs=-1,
    ),

    'Gradient_Boosting_Optimized': GradientBoostingClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.8,
        random_state=42,
    ),

    'Logistic_Regression': LogisticRegression(
        max_iter=3000,
        random_state=42,
        class_weight={0: 1, 1: 1000},
        C=0.15,
        solver='liblinear'
    )
}

def evaluate_model_with_threshold(model, model_name):
    print(f"\n{'='*60}")
    print(f"{model_name}:")
    print(f"{'='*60}")
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_proba = pipeline.predict_proba(X_val)[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_val, y_proba)
    f2_scores = 5 * (precision * recall) / (4 * precision + recall + 1e-8)
    best_idx = np.argmax(f2_scores)
    best_threshold = thresholds[best_idx]
    best_f2 = f2_scores[best_idx]
    
    print(f"Best threshold for F2 score: {best_threshold:.2f} with F2: {best_f2:.3f}")

    y_pred = (y_proba > best_threshold).astype(int)
    
    cm = confusion_matrix(y_val, y_pred)
    print("\nConfusion Matrix:")
    print(pd.DataFrame(
        cm,
        index=['Actual 0', 'Actual 1'],
        columns=['Pred 0', 'Pred 1']
    ))
    
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, digits=4, zero_division=0))
    
    f1_class_0 = f1_score(y_val, y_pred, pos_label=0, zero_division=0)
    f1_class_1 = f1_score(y_val, y_pred, pos_label=1, zero_division=0)
    print(f"F1 Score (Class 0): {f1_class_0:.4f}")
    print(f"F1 Score (Class 1): {f1_class_1:.4f}")
    
    recall_metric = recall_score(y_val, y_pred, zero_division=0)
    print(f"Recall (Class 1): {recall_metric:.4f}")
    
    roc_auc = roc_auc_score(y_val, y_proba)
    print(f"ROC-AUC Score: {roc_auc:.4f}")

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc', n_jobs=-1)
    print(f"Cross-validation ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    
    return pipeline, y_pred, y_proba, best_threshold, roc_auc, cv_scores.mean()

results = {}
for name, model in models.items():
    pipeline, y_pred, y_proba, threshold, roc_auc_val, cv_roc_auc = evaluate_model_with_threshold(model, name)
    results[name] = {
        'pipeline': pipeline,
        'y_pred': y_pred,
        'y_proba': y_proba,
        'best_threshold': threshold,
        'roc_auc_val': roc_auc_val,
        'cv_roc_auc': cv_roc_auc
        }


if results:
    valid_results = {k: v for k, v in results.items() if not np.isnan(v['roc_auc_val'])}
    best_model_name = max(valid_results.keys(), key=lambda n: valid_results[n]['roc_auc_val'])
    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    best_model = results[best_model_name]['pipeline']
    print(f"{'='*60}")

    clf = best_model.named_steps['classifier']
    if hasattr(clf, 'feature_importances_'):
        all_transformed_features = []
        if hasattr(preprocessor, 'named_transformers_'):
            for transformer_name, _, original_cols in preprocessor.transformers:
                if transformer_name == 'cat':
                    all_transformed_features.extend(preprocessor.named_transformers_['cat'].get_feature_names_out(original_cols))
                elif transformer_name == 'num':
                    all_transformed_features.extend(original_cols)

        importances = clf.feature_importances_
        
        if len(importances) == len(all_transformed_features):
            feat_imp_df = pd.DataFrame({
                'feature': all_transformed_features,
                'importance': importances
            }).sort_values('importance', ascending=False)
                
            print("\nFeature Importances:")
            print(feat_imp_df.head(15).to_string(index=False))



XGBoost_Optimized:


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best threshold for F2 score: 0.01 with F2: 0.344

Confusion Matrix:
          Pred 0  Pred 1
Actual 0   51331    2297
Actual 1     744     552

Classification Report:
              precision    recall  f1-score   support

           0     0.9857    0.9572    0.9712     53628
           1     0.1938    0.4259    0.2663      1296

    accuracy                         0.9446     54924
   macro avg     0.5897    0.6915    0.6188     54924
weighted avg     0.9670    0.9446    0.9546     54924

F1 Score (Class 0): 0.9712
F1 Score (Class 1): 0.2663
Recall (Class 1): 0.4259
ROC-AUC Score: 0.8076
Cross-validation ROC-AUC: 0.7855 (+/- 0.0085)

Random_Forest_Optimized:
Best threshold for F2 score: 0.96 with F2: 0.249

Confusion Matrix:
          Pred 0  Pred 1
Actual 0   50331    3297
Actual 1     853     443

Classification Report:
              precision    recall  f1-score   support

           0     0.9833    0.9385    0.9604     53628
           1     0.1184    0.3418    0.1759      1296

  