In [2]:
# -*- coding: utf-8 -*-
# 🚀 Faulty Device Severity Prediction (RandomForest + Fixed Encoders + Export)

import pandas as pd
import numpy as np
import joblib
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --------------------------
# STEP 1: Load Data
# --------------------------
# The file is decoded as ISO-8859-1; low_memory=False makes pandas read each
# column in one pass instead of inferring dtypes chunk by chunk.
df = pd.read_csv(
    "events_cleaned_label_encoded.csv",
    encoding="ISO-8859-1",
    low_memory=False,
)

# --------------------------
# STEP 2: Preprocessing Dates
# --------------------------
# Parse the three date columns day-first; values that cannot be parsed are
# coerced to NaT rather than raising.
for date_col in ("date_initiated_by_firm", "date_terminated", "date_updated"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce", dayfirst=True)

# --------------------------
# STEP 3: Recall Duration Logic
# --------------------------
# Duration is measured from the firm's initiation date to:
#   * date_terminated for rows whose status is "terminated"
#   * date_updated    for rows whose status is "ongoing"
# (status comparison is case-insensitive). Every other row, and any row whose
# dates failed to parse, starts out as NaN.
status_lower = df["status"].str.lower()
terminated_mask = status_lower == "terminated"
ongoing_mask = status_lower == "ongoing"

initiated = df["date_initiated_by_firm"]
days_until_terminated = (df["date_terminated"] - initiated).dt.days
days_until_updated = (df["date_updated"] - initiated).dt.days

duration = pd.Series(np.nan, index=df.index, dtype="float64")
duration = duration.mask(terminated_mask, days_until_terminated)
duration = duration.mask(ongoing_mask, days_until_updated)

# Remaining gaps are filled with the median of the known durations.
# NOTE(review): the label in the next step is derived from this column, so
# imputed rows receive a label based on the median rather than on observed
# dates - confirm this is intended.
df["recall_duration_days"] = duration.fillna(duration.median())

# --------------------------
# STEP 4: Target Label
# --------------------------
# label = 1 when the recall duration is at most 50 days, otherwise 0.
df["label"] = (df["recall_duration_days"] <= 50).astype(int)
print("Label distribution:\n", df["label"].value_counts())

# --------------------------
# STEP 5: Feature Selection
# --------------------------
# Columns kept out of the feature matrix: the device identifier, the raw date
# columns, the derived duration and the target itself.
exclude_cols = [
    "device_id",
    "date_initiated_by_firm",
    "date_terminated",
    "date_updated",
    "recall_duration_days",
    "label",
]

# Every remaining column, in its original order, becomes a feature.
# NOTE(review): this keeps "status" (which decides how the duration behind the
# label was computed) and any other identifier-like columns - verify that they
# are meant to be model inputs.
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()
print("Using features:", feature_cols)

# X is copied so that the encoding step does not write back into df.
y = df["label"]
X = df[feature_cols].copy()

# --------------------------
# STEP 6: Encode Categoricals (Save Encoders)
# --------------------------
# One LabelEncoder per object-dtype column, fitted on the string form of the
# values. The fitted encoders are kept in `encoders` (keyed by column name) so
# the same mapping can be applied again at inference time.
categorical_cols = X.select_dtypes(include=["object"]).columns
encoders = {
    col: LabelEncoder().fit(X[col].astype(str)) for col in categorical_cols
}
for col, le in encoders.items():
    X[col] = le.transform(X[col].astype(str))

# --------------------------
# STEP 7: Train/Test Split
# --------------------------
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --------------------------
# STEP 8: Train Random Forest
# --------------------------
# 300 unrestricted-depth trees, balanced class weights, fixed seed, all cores.
forest_params = {
    "n_estimators": 300,
    "max_depth": None,
    "class_weight": "balanced",
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)

# --------------------------
# STEP 9: Evaluate
# --------------------------
# Score the held-out split and print overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("✅ Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

# --------------------------
# STEP 10: Save as Single Artifact
# --------------------------
# Bundle everything inference needs: the fitted model, the ordered feature
# list and the per-column encoders.
artifact = dict(
    model=model,
    feature_cols=feature_cols,
    encoders=encoders,
)

# Persisting the bundle to disk is currently disabled; uncomment to export.
# joblib.dump(artifact, "device_failure_pipeline.joblib", compress=3)
# print("✅ Single pipeline joblib saved successfully.")

# --------------------------
# STEP 11: Predict Function
# --------------------------
def predict_device_failure(device_id):
    """Predict whether a device's recall falls in the "within 50 days" class.

    The device is looked up in the module-level ``df``; its feature columns are
    encoded with the encoders stored in the module-level ``artifact`` and then
    scored by the stored model. When several rows share the same device id,
    only the first one is scored and reported.

    Args:
        device_id: Value compared against ``df["device_id"]``. It must be
            convertible with ``int()`` because the result echoes it as an int.

    Returns:
        dict: ``{"error": ...}`` when no row matches ``device_id``; otherwise a
        dict with ``device_id``, ``failure_prediction`` (predicted class as
        int), ``risk_percentage`` (probability of class 1, in percent, rounded
        to two decimals) and ``within_50_days`` (``"Yes"`` or ``"No"``).
    """
    device_rows = df[df["device_id"] == device_id]
    if device_rows.empty:
        return {"error": f"Device ID {device_id} not found"}

    # Only the first match is reported, so only that row is encoded and scored.
    X_new = device_rows.iloc[[0]][artifact["feature_cols"]].copy()

    # Apply the SAME encoders that were fitted during training.
    for col, le in artifact["encoders"].items():
        if col in X_new:
            X_new[col] = le.transform(X_new[col].astype(str))

    model = artifact["model"]
    pred_class = model.predict(X_new)[0]

    # predict_proba columns follow model.classes_, so locate class 1 by value
    # instead of assuming it sits at position 1. If the model never saw class
    # 1 during training, its probability is reported as 0.
    class_probs = model.predict_proba(X_new)[0]
    known_classes = list(model.classes_)
    if 1 in known_classes:
        pred_prob = class_probs[known_classes.index(1)]
    else:
        pred_prob = 0.0

    return {
        "device_id": int(device_id),
        "failure_prediction": int(pred_class),
        "risk_percentage": round(float(pred_prob) * 100, 2),
        "within_50_days": "Yes" if pred_class == 1 else "No",
    }

# --------------------------
# STEP 12: Example Run (Single Device)
# --------------------------
# Score one hard-coded device id and print the resulting dict.
example_device_id = 19717

print("\n🔍 Predictions for given Device:\n")
print(predict_device_failure(example_device_id))

Label distribution:
 label
0    57521
1    42479
Name: count, dtype: int64
Using features: ['id', 'action', 'action_classification', 'action_summary', 'reason', 'manufacturer_id', 'type', 'date_posted', 'status']
✅ Accuracy: 0.69865

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.84      0.76     11504
           1       0.70      0.51      0.59      8496

    accuracy                           0.70     20000
   macro avg       0.70      0.67      0.68     20000
weighted avg       0.70      0.70      0.69     20000


🔍 Predictions for given Device:

{'device_id': 19717, 'failure_prediction': 0, 'risk_percentage': 9.33, 'within_50_days': 'No'}
