In [None]:
# 🚀 ML-Powered Risk Detection – Databricks

## 1. Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## 2. Load CSV from Unity Catalog Volume
# The first row supplies the column names. No schema is given or inferred, so
# Spark's CSV reader hands every column back as a string.
source_path = "dbfs:/Volumes/etl-pipeline/default/ops_data/sample_ops_data.csv"
csv_reader = spark.read.option("header", True)
df_spark = csv_reader.csv(source_path)
# Collect the full dataset onto the driver as a pandas DataFrame for scikit-learn.
df = df_spark.toPandas()

## 3. Create Binary Target
# risk_flag is 1 when incident_t holds any non-blank text and 0 when it is
# null, empty, or whitespace only.
# Nulls are blanked first and every value is cast to str, so a non-string cell
# (e.g. a number) is treated as "present" instead of raising on .strip().
incident_text = df['incident_t'].fillna('').astype(str).str.strip()
df['risk_flag'] = (incident_text != '').astype(int)

## 4. Feature Selection + Encoding
features = ['delay_min', 'crew_id', 'aircraft_id', 'location']
categorical_features = ['crew_id', 'aircraft_id', 'location']
feature_frame = df[features].copy()

# The CSV is read without a schema, so delay_min arrives as text. Left that way,
# get_dummies would one-hot encode every distinct delay value and throw away its
# ordering; parse it into a real numeric column instead.
delay_minutes = pd.to_numeric(feature_frame['delay_min'], errors='coerce')
# Missing or unparseable delays are imputed with the median (0 if nothing
# parses) so the estimator never receives NaN.
# NOTE(review): the median is computed over all rows, before the train/test split.
delay_fill = delay_minutes.median() if delay_minutes.notna().any() else 0.0
feature_frame['delay_min'] = delay_minutes.fillna(delay_fill)

# Name the categorical columns explicitly so they are one-hot encoded whatever
# dtype they were loaded with; drop_first removes one level per column.
df_encoded = pd.get_dummies(feature_frame, columns=categorical_features, drop_first=True)
X = df_encoded
y = df['risk_flag']

## 5. Train-Test Split
# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
# Stratify on the target so the minority class shows up in both partitions at
# the same rate. Stratification needs at least two rows of every class, so fall
# back to a plain random split when the data cannot support it.
class_counts = y.value_counts()
stratify_on = y if len(class_counts) > 1 and class_counts.min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

## 6. Train Model with Class Imbalance Handling
# Hyperparameters are collected in one place; class_weight='balanced' is the
# imbalance handling this section refers to, and random_state fixes the seed.
forest_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

## 7. Evaluate Model
# Predict on the held-out rows and print the per-class metrics report.
y_pred = model.predict(X_test)
holdout_report = classification_report(y_test, y_pred)
print(holdout_report)

## 8. Save Predictions to Delta Table
# Score every row (training and held-out alike) and attach the prediction to the
# pandas frame alongside the original columns.
df['predicted_risk'] = model.predict(X)
df_spark_out = spark.createDataFrame(df)
# Overwrite mode replaces whatever the table held from a previous run.
delta_writer = df_spark_out.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("`etl-pipeline`.default.predicted_risk_output")