In [19]:
# --------------------------
# STEP 1: Imports
# --------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# --------------------------
# STEP 2: Load Data
# --------------------------
df = pd.read_csv("events_cleaned_label_encoded.csv", encoding="ISO-8859-1", low_memory=False)

# --------------------------
# STEP 3: Preprocessing Dates
# --------------------------
df["date_initiated_by_firm"] = pd.to_datetime(df["date_initiated_by_firm"], errors="coerce", dayfirst=True)
df["date_terminated"] = pd.to_datetime(df["date_terminated"], errors="coerce", dayfirst=True)
df["date_updated"] = pd.to_datetime(df["date_updated"], errors="coerce", dayfirst=True)

# --------------------------
# STEP 4: Recall Duration Logic
# --------------------------
df["recall_duration_days"] = np.nan

# Case 1: Terminated → use terminated - initiated
terminated_mask = df["status"].str.lower() == "terminated"
df.loc[terminated_mask, "recall_duration_days"] = (
    df.loc[terminated_mask, "date_terminated"] - df.loc[terminated_mask, "date_initiated_by_firm"]
).dt.days

# Case 2: Ongoing → use updated - initiated
ongoing_mask = df["status"].str.lower() == "ongoing"
df.loc[ongoing_mask, "recall_duration_days"] = (
    df.loc[ongoing_mask, "date_updated"] - df.loc[ongoing_mask, "date_initiated_by_firm"]
).dt.days

# Handle missing durations
df["recall_duration_days"] = df["recall_duration_days"].fillna(df["recall_duration_days"].median())

# --------------------------
# STEP 5: Target Label
# --------------------------
df["label"] = np.where(df["recall_duration_days"] <= 50, 1, 0)
print("Label distribution:\n", df["label"].value_counts())

# --------------------------
# STEP 6: Feature Selection
# --------------------------
exclude_cols = ["device_id", "date_initiated_by_firm", "date_terminated", "date_updated",
                "recall_duration_days", "label"]

feature_cols = [col for col in df.columns if col not in exclude_cols]
print("Using features:", feature_cols)

X = df[feature_cols].copy()
y = df["label"]

# Encode categorical features
for col in X.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# --------------------------
# STEP 7: Train/Test Split
# --------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale numeric features (important for Logistic Regression convergence)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# --------------------------
# STEP 8: Train Model
# --------------------------
# Use class_weight="balanced" to handle imbalance
model = LogisticRegression(max_iter=5000, solver="saga", class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# --------------------------
# STEP 9: Evaluate
# --------------------------
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# --------------------------
# STEP 10: Predict for a Device
# --------------------------
def predict_device_failure(device_id):
    device_row = df[df["device_id"] == device_id].copy()

    if device_row.empty:
        return {"error": "Device ID not found"}

    X_new = device_row[feature_cols].copy()
    for col in X_new.select_dtypes(include=["object"]).columns:
        le = LabelEncoder()
        X_new[col] = le.fit_transform(X_new[col].astype(str))

    # Scale with same scaler as training
    X_new = scaler.transform(X_new)

    pred_class = model.predict(X_new)[0]
    pred_prob = model.predict_proba(X_new)[0][1]

    return {
        "device_id": device_id,
        "failure_prediction": int(pred_class),
        "risk_percentage": round(float(pred_prob) * 100, 2),
        "within_50_days": "Yes" if pred_class == 1 else "No"
    }

# Example run
print(predict_device_failure(12530))


Label distribution:
 label
0    57521
1    42479
Name: count, dtype: int64
Using features: ['id', 'action', 'action_classification', 'action_summary', 'reason', 'manufacturer_id', 'type', 'date_posted', 'status']
Accuracy: 0.7086

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.85      0.77     11504
           1       0.72      0.51      0.60      8496

    accuracy                           0.71     20000
   macro avg       0.71      0.68      0.68     20000
weighted avg       0.71      0.71      0.70     20000

{'device_id': 12530, 'failure_prediction': 1, 'risk_percentage': 77.27, 'within_50_days': 'Yes'}
