# Risk Model Score

In [126]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

df = pd.read_csv('california_risk_model_data_dec2022_mar2023.csv')

## Join with Risk Model Y

In [127]:
risk_y = pd.read_csv('risk_model_y.csv')

df['date'] = pd.to_datetime(df['date'])
risk_y['start_time'] = pd.to_datetime(risk_y['start_time'])

merged = df.merge(
    risk_y, 
    left_on=['county_name', 'date'], 
    right_on=['county', 'start_time'],
    how='left'
)

print(f"Merged data shape: {merged.shape}")
merged.head()

Merged data shape: (7018, 41)


Unnamed: 0,county_code,county_name,date,ivt_max,ar_intensity,gage_height_mean,streamflow_mean,wind_speed_max,wind_gust_max,precip_total,runoff_total,temp_anomaly,soil_moisture_mean,IVT_max,IVT_duration,AR_category,Precip_24h,Precip_72h,Wind_gust_max,Soil_moisture_pct,API_7d,API_14d,Snowpack_SWE,Temp_anomaly,Streamflow_pct,Streamflow_p95_exceed,Runoff_ratio,Flood_stage_exceed,AR_count_7d,AR_count_14d,Wet_days_10,Dry_gap,Population_exposed,Infrastructure_density,County_area,Agricultural_share,start_time,county,duration,max_customers,risk score
0,6001,Alameda,2022-12-01,517.87616,AR2,3.363378,8.844656,10.958761,28.427376,3.846169,0.067472,2.004796,0.180444,517.87616,12,AR2,3.846169,3.846169,28.427376,0.180444,3.846169,3.846169,80,2.004796,56.25,-0.990279,0.017543,0,1.0,1.0,1.0,0,100000,2.0,5000,0.45,NaT,,,,
1,6001,Alameda,2022-12-02,231.908783,,3.363423,9.136874,6.120897,11.851345,0.020504,0.000238,-2.666713,0.343442,231.908783,0,,0.020504,3.866673,11.851345,0.343442,3.482056,3.674364,80,-2.666713,59.375,-0.989958,0.011628,0,1.0,1.0,1.0,1,100000,2.0,5000,0.45,NaT,,,,
2,6001,Alameda,2022-12-03,456.337891,AR1,3.365482,7.29801,5.717101,9.30435,0.570774,0.009298,-2.150843,0.315793,456.337891,12,AR1,0.570774,4.437447,9.30435,0.315793,3.666162,4.051805,80,-2.150843,46.875,-0.991979,0.016291,0,2.0,2.0,1.0,2,100000,2.0,5000,0.45,NaT,,,,
3,6001,Alameda,2022-12-04,550.994995,AR2,3.800255,41.210267,6.26716,22.702106,1.450539,0.032067,0.536351,0.388857,550.994995,12,AR2,1.450539,2.041817,22.702106,0.388857,4.672956,5.280471,80,0.536351,78.125,-0.954708,0.022107,0,3.0,3.0,2.0,0,100000,2.0,5000,0.45,NaT,,,,
4,6001,Alameda,2022-12-05,330.216705,AR1,3.439359,8.097739,6.848634,18.281908,0.118732,0.001907,3.223302,0.352034,330.216705,12,AR1,0.118732,2.140045,18.281908,0.352034,4.20289,5.104804,80,3.223302,53.125,-0.9911,0.016064,0,4.0,4.0,2.0,1,100000,2.0,5000,0.45,NaT,,,,


## Clean Up Data

In [128]:
# Drop specified columns
columns_to_drop = ['max_customers', 'duration', 'start_time', 'county', 
                   'Population_exposed', 'Infrastructure_density', 
                   'County_area', 'Agricultural_share', 'ar_intensity']

merged = merged.drop(columns=columns_to_drop, errors='ignore')
merged['risk score'] = merged['risk score'].fillna(0)

# Fill AR_category null values with 'N/A'
merged['AR_category'] = merged['AR_category'].fillna('No AR')

cols = ['gage_height_mean', 'streamflow_mean', 'Streamflow_pct']

for col in cols:
    county_mean = merged.groupby('county_name')[col].transform('mean')
    global_mean = merged[col].mean()
    merged[col] = merged[col].fillna(county_mean.fillna(global_mean))


## Machine Learning Model Training

In [129]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

# Prepare data for ML
ml_data = merged.copy()

# Exclude county_code, county_name, date from features
exclude_cols = ['county_code', 'county_name', 'date', 'risk score']
feature_cols = [col for col in ml_data.columns if col not in exclude_cols]

# Encode categorical variable AR_category
le = LabelEncoder()
ml_data['AR_category_encoded'] = le.fit_transform(ml_data['AR_category'])

# Update feature columns
feature_cols = [col for col in feature_cols if col != 'AR_category'] + ['AR_category_encoded']

# Prepare X and y
X = ml_data[feature_cols]
y = ml_data['risk score'].astype(int)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:")
print(y.value_counts())
print(f"\nFeature columns: {feature_cols}")

Features shape: (7018, 28)
Target shape: (7018,)

Target distribution:
risk score
0    6794
1     224
Name: count, dtype: int64

Feature columns: ['ivt_max', 'gage_height_mean', 'streamflow_mean', 'wind_speed_max', 'wind_gust_max', 'precip_total', 'runoff_total', 'temp_anomaly', 'soil_moisture_mean', 'IVT_max', 'IVT_duration', 'Precip_24h', 'Precip_72h', 'Wind_gust_max', 'Soil_moisture_pct', 'API_7d', 'API_14d', 'Snowpack_SWE', 'Temp_anomaly', 'Streamflow_pct', 'Streamflow_p95_exceed', 'Runoff_ratio', 'Flood_stage_exceed', 'AR_count_7d', 'AR_count_14d', 'Wet_days_10', 'Dry_gap', 'AR_category_encoded']


In [130]:
# Split data into train and test (80/20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"\nTraining set target distribution:")
print(y_train.value_counts())
print(f"\nTest set target distribution:")
print(y_test.value_counts())

Training set size: 5614
Test set size: 1404

Training set target distribution:
risk score
0    5435
1     179
Name: count, dtype: int64

Test set target distribution:
risk score
0    1359
1      45
Name: count, dtype: int64


In [131]:
# Train Random Forest Classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training complete!")

Training Random Forest model...
Training complete!


In [132]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate model
print("Model Evaluation:")
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Evaluation:

Accuracy: 0.9687

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1359
           1       0.56      0.11      0.19        45

    accuracy                           0.97      1404
   macro avg       0.76      0.55      0.58      1404
weighted avg       0.96      0.97      0.96      1404


Confusion Matrix:
[[1355    4]
 [  40    5]]


## Improve Recall for Class 1

In [133]:
from sklearn.utils.class_weight import compute_class_weight

print("Class distribution:")
print(y_train.value_counts())
print(f"\nClass imbalance ratio: {y_train.value_counts()[0] / y_train.value_counts()[1]:.2f}:1")

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print(f"\nClass weights: {class_weight_dict}")

Class distribution:
risk score
0    5435
1     179
Name: count, dtype: int64

Class imbalance ratio: 30.36:1

Class weights: {0: np.float64(0.5164673413063477), 1: np.float64(15.681564245810057)}


In [134]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
import time
from xgboost import XGBClassifier

# Define models to compare
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(class_weight='balanced', max_depth=15, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=15, class_weight='balanced', random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, max_depth=5, scale_pos_weight=30, random_state=42, eval_metric='logloss')
}

results = []

for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    
    model.fit(X_train_smote, y_train_smote)
    y_pred_model = model.predict(X_test)
    
    train_time = time.time() - start_time
    
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred_model),
        'Precision_Class1': precision_score(y_test, y_pred_model),
        'Recall_Class1': recall_score(y_test, y_pred_model),
        'F1_Class1': f1_score(y_test, y_pred_model),
        'Train_Time': train_time
    })
    
    print(f"Completed in {train_time:.2f}s")

model_comparison = pd.DataFrame(results).sort_values('Recall_Class1', ascending=False)
print("\n" + "="*80)
print("MODEL COMPARISON (sorted by Recall for Class 1)")
print("="*80)
print(model_comparison.to_string(index=False))


Training Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Completed in 0.42s

Training Decision Tree...
Completed in 0.24s

Training Random Forest...
Completed in 0.72s

Training Gradient Boosting...
Completed in 9.94s

Training AdaBoost...




Completed in 2.31s

Training XGBoost...
Completed in 0.26s

MODEL COMPARISON (sorted by Recall for Class 1)
              Model  Accuracy  Precision_Class1  Recall_Class1  F1_Class1  Train_Time
Logistic Regression  0.813390          0.121951       0.777778   0.210843    0.417224
      Random Forest  0.960826          0.413793       0.533333   0.466019    0.717731
      Decision Tree  0.923789          0.207547       0.488889   0.291391    0.238053
            XGBoost  0.955840          0.355932       0.466667   0.403846    0.255770
           AdaBoost  0.926638          0.204082       0.444444   0.279720    2.306607
  Gradient Boosting  0.966524          0.473684       0.400000   0.433735    9.937970


In [136]:
# Detailed evaluation of top 3 models
top_models = model_comparison.head(3)['Model'].tolist()

for model_name in top_models:
    print(f"\n{'='*80}")
    print(f"{model_name} - Detailed Report")
    print(f"{'='*80}")
    
    model = models[model_name]
    model.fit(X_train_smote, y_train_smote)
    y_pred_model = model.predict(X_test)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_model))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred_model)
    print(cm)
    print(f"\nTrue Negatives: {cm[0,0]}, False Positives: {cm[0,1]}")
    print(f"False Negatives: {cm[1,0]}, True Positives: {cm[1,1]}")


Logistic Regression - Detailed Report


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.81      0.89      1359
           1       0.12      0.78      0.21        45

    accuracy                           0.81      1404
   macro avg       0.56      0.80      0.55      1404
weighted avg       0.96      0.81      0.87      1404


Confusion Matrix:
[[1107  252]
 [  10   35]]

True Negatives: 1107, False Positives: 252
False Negatives: 10, True Positives: 35

Random Forest - Detailed Report

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1359
           1       0.41      0.53      0.47        45

    accuracy                           0.96      1404
   macro avg       0.70      0.75      0.72      1404
weighted avg       0.97      0.96      0.96      1404


Confusion Matrix:
[[1325   34]
 [  21   24]]

True Negatives: 1325, False Positives: 34
False Negatives: 21, True Positives: 24

Decis

In [None]:
# Select best model based on recall
best_model_name = model_comparison.iloc[0]['Model']
best_model = models[best_model_name]

print(f"Best model for Class 1 Recall: {best_model_name}")
print(f"Recall: {model_comparison.iloc[0]['Recall_Class1']:.4f}")
print(f"Precision: {model_comparison.iloc[0]['Precision_Class1']:.4f}")
print(f"F1 Score: {model_comparison.iloc[0]['F1_Class1']:.4f}")

# Train on full SMOTE data and predict probability scores
best_model.fit(X_train_smote, y_train_smote)
ml_data['predicted_risk_score'] = best_model.predict_proba(X)[:, 1]

# Save final results
final_results = ml_data[['county_name', 'date', 'risk score', 'predicted_risk_score']].copy()
final_results.to_csv('risk_model_score_predictions.csv', index=False)

print(f"\nFinal predictions saved to risk_model_score_predictions.csv")
print(f"\nPredicted risk score statistics (0-1 scale):")
print(final_results['predicted_risk_score'].describe())
print(f"\nSample predictions:")
print(final_results[final_results['predicted_risk_score'] > 0.5].head(10))

Best model for Class 1 Recall: Logistic Regression
Recall: 0.7778
Precision: 0.1220
F1 Score: 0.2108

Final predictions saved to risk_model_predictions_best.csv

Predicted risk score statistics (0-1 scale):
count    7.018000e+03
mean     2.452568e-01
std      2.785584e-01
min      3.612995e-07
25%      2.842306e-02
50%      1.150691e-01
75%      3.902627e-01
max      9.993807e-01
Name: predicted_risk_score, dtype: float64

Sample predictions:
   county_name       date  risk score  predicted_risk_score
9      Alameda 2022-12-10         0.0              0.542350
10     Alameda 2022-12-11         0.0              0.545634
26     Alameda 2022-12-27         0.0              0.754458
30     Alameda 2022-12-31         0.0              0.860257
32      Alpine 2022-12-02         0.0              0.894106
33      Alpine 2022-12-03         0.0              0.678616
34      Alpine 2022-12-04         0.0              0.895378
35      Alpine 2022-12-05         0.0              0.735474
36      Alpin

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
