In [None]:
import os, random, numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns, joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             explained_variance_score, mean_absolute_percentage_error,
                             classification_report, confusion_matrix)

import lightgbm as lgb
import xgboost as xgb

# Plot theme used by every figure in this notebook.
sns.set(style="whitegrid")

# One fixed seed for both the stdlib and NumPy generators so runs are repeatable.
RND = 42
random.seed(RND)
np.random.seed(RND)

# Write artefacts to /kaggle/working when the KAGGLE_URL_BASE environment
# variable is present; otherwise fall back to the current working directory.
WORKDIR = os.getcwd() if 'KAGGLE_URL_BASE' not in os.environ else "/kaggle/working"
PLOT_DIR = os.path.join(WORKDIR, "plots")

# Make sure the output folders exist before anything is saved into them.
os.makedirs(os.path.join(WORKDIR, "models"), exist_ok=True)
os.makedirs(PLOT_DIR, exist_ok=True)

def savefig(name):
    """Write the current matplotlib figure to ``PLOT_DIR/<name>.png``.

    The image is saved at 250 dpi with a tight bounding box. ``name`` is the
    file stem only; the ``.png`` suffix is appended here.
    """
    target = os.path.join(PLOT_DIR, f"{name}.png")
    plt.savefig(target, bbox_inches='tight', dpi=250)

# Vocabularies the synthetic generator samples from.
damage_types = ["dent", "scratch", "crack", "glass shatter", "lamp broken", "tire flat"]
parts = ["back_bumper","back_door","back_glass","back_light","front_bumper","front_door","front_glass","front_light","hood"]
severity_levels = ["minor","moderate","severe"]

# Base cost per part (currency/unit is not stated in this file).
part_base_cost = {
    "back_bumper":4500,"back_door":6500,"back_glass":9000,"back_light":2500,
    "front_bumper":5000,"front_door":7000,"front_glass":12000,"front_light":3000,"hood":8500
}
# Multiplicative factors applied on top of the base cost.
severity_multiplier = {"minor":0.6,"moderate":1.0,"severe":1.6}
damage_type_multiplier = {"scratch":0.4,"dent":0.9,"crack":1.1,"lamp broken":1.6,"glass shatter":2.3,"tire flat":0.7}

def generate_stage5_dataset(n=8000, out=None, seed=None):
    """Generate a synthetic repair-cost dataset.

    Each row samples a part and a damage type uniformly, a severity with
    weights 0.55/0.35/0.10 (minor/moderate/severe) and a coverage percentage
    uniformly in [1, 25] (rounded to 2 decimals). The cost is::

        base * severity_mult * damage_mult * (1 + coverage/100 * 0.5)

    scaled by uniform noise in [0.88, 1.18], truncated to ``int`` and floored
    at 250.

    Args:
        n: Number of rows to generate.
        out: Optional CSV path; when truthy the frame is also written there
            (without the index).
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random``
            state is left untouched. When ``None`` (default) the global
            ``random`` module is used, exactly as before.

    Returns:
        ``pandas.DataFrame`` with columns ``part``, ``severity``,
        ``damage_type``, ``coverage_percent`` and ``cost``.
    """
    # The `random` module exposes the same choice/choices/uniform API as a
    # Random instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)
    severity_weights = [0.55, 0.35, 0.10]

    rows = []
    for _ in range(n):
        # NOTE: the order of the draws is kept stable so a given seed always
        # produces the same rows.
        part = rng.choice(parts)
        dmg = rng.choice(damage_types)
        sev = rng.choices(severity_levels, weights=severity_weights)[0]
        cov = round(rng.uniform(1, 25), 2)
        base = part_base_cost[part]
        cost = base * severity_multiplier[sev] * damage_type_multiplier[dmg] * (1 + cov / 100 * 0.5)
        cost = int(cost * rng.uniform(0.88, 1.18))
        rows.append([part, sev, dmg, cov, max(cost, 250)])

    df = pd.DataFrame(rows, columns=["part", "severity", "damage_type", "coverage_percent", "cost"])
    if out:
        df.to_csv(out, index=False)
    return df

# Generate the synthetic dataset and keep a CSV copy under WORKDIR.
csv_path = os.path.join(WORKDIR, "stage5_cost_dataset_kaggle.csv")
df = generate_stage5_dataset(8000, out=csv_path)

# Features and regression target.
X = df[["part", "severity", "damage_type", "coverage_percent"]]
y = df["cost"]

categorical = ["part", "severity", "damage_type"]
numeric = ["coverage_percent"]

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name (passing it raises TypeError).
# Prefer the new spelling and fall back only for older releases.
try:
    _one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    _one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# One-hot encode the categorical columns; pass the numeric column through as-is.
preprocess = ColumnTransformer([
    ("cat", _one_hot, categorical),
    ("num", "passthrough", numeric)
])

# 80/20 train/test split, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RND)

def regression_metrics(y_true,y_pred,prefix=""):
    """Compute a standard set of regression scores.

    Returns a dict whose keys are ``prefix`` followed by ``MAE``, ``RMSE``,
    ``R2``, ``MAPE%`` (MAPE multiplied by 100) and ``ExplainedVar``.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mape_pct = mean_absolute_percentage_error(y_true, y_pred) * 100
    explained = explained_variance_score(y_true, y_pred)

    scores = (
        ("MAE", mae),
        ("RMSE", rmse),
        ("R2", r2),
        ("MAPE%", mape_pct),
        ("ExplainedVar", explained),
    )
    return {f"{prefix}{metric}": value for metric, value in scores}

def plot_pred_vs_actual(y_true,y_pred,title,fname):
    """Scatter predictions against ground truth with a y = x reference line.

    Args:
        y_true: Actual target values; must expose ``.min()``/``.max()``.
        y_pred: Predicted values; must expose ``.min()``/``.max()``.
        title: Figure title.
        fname: File stem passed to ``savefig`` (the figure is saved, then shown).
    """
    plt.figure(figsize=(7, 6))
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.6)

    # Diagonal spanning the joint value range: points on it are perfect predictions.
    lo = min(y_true.min(), y_pred.min())
    hi = max(y_true.max(), y_pred.max())
    plt.plot([lo, hi], [lo, hi], 'r--')

    plt.xlabel("Actual")
    plt.ylabel("Predicted")
    plt.title(title)
    # A bare plt.grid() *toggles* the grid, which switches it off under the
    # seaborn "whitegrid" style; request it explicitly instead.
    plt.grid(True)
    savefig(fname)
    plt.show()

def plot_residuals(y_true,y_pred,title,fname):
    """Plot a 40-bin histogram (with KDE) of the residuals ``y_true - y_pred``.

    The figure is saved through ``savefig(fname)`` and then displayed.
    """
    plt.figure(figsize=(8, 4))
    residuals = y_true - y_pred
    sns.histplot(residuals, kde=True, bins=40)
    plt.title(title)
    savefig(fname)
    plt.show()

def plot_feature_importance(df_imp,title,fname):
    """Draw a horizontal bar chart of the first 20 rows of ``df_imp``.

    ``df_imp`` must have ``feature`` and ``importance`` columns; rows are
    plotted in the order given, so sort before calling. The figure is saved
    through ``savefig(fname)`` and then displayed.
    """
    top_rows = df_imp.iloc[:20]
    plt.figure(figsize=(10, 6))
    sns.barplot(y="feature", x="importance", data=top_rows)
    plt.title(title)
    savefig(fname)
    plt.show()

def classification_report_for_costs(y_true,y_pred,bins=None):
    """Print a confusion matrix and classification report over cost buckets.

    Both inputs are discretised into the half-open intervals defined by
    ``bins`` (labelled ``bin_0``, ``bin_1``, ...) and compared as if the
    regression were a classification problem.

    Args:
        y_true: Actual costs (array-like of numbers).
        y_pred: Predicted costs (array-like of numbers).
        bins: Increasing bucket edges. Defaults to
            ``[0, 2000, 5000, 10000, 20000, 1e9]``.

    Returns:
        None; the two reports are printed.
    """
    # Avoid a mutable default argument; copy so the caller's sequence is never shared.
    edges = [0, 2000, 5000, 10000, 20000, 1e9] if bins is None else list(bins)
    labels = [f"bin_{i}" for i in range(len(edges) - 1)]

    def _bucketize(values):
        # Values outside the edges (e.g. a negative prediction) would become
        # NaN in pd.cut and break the sklearn metrics, so clamp them into the
        # outermost buckets; include_lowest keeps the lowest edge in bin_0.
        clipped = np.clip(np.asarray(values, dtype=float), edges[0], edges[-1])
        binned = pd.cut(clipped, bins=edges, labels=labels, include_lowest=True)
        return np.asarray(binned.astype(str))

    t = _bucketize(y_true)
    p = _bucketize(y_pred)
    # Passing labels explicitly keeps the row/column order and shape stable
    # even when some buckets do not occur in the data.
    print(confusion_matrix(t, p, labels=labels))
    print(classification_report(t, p, labels=labels, zero_division=0))

# LightGBM regressor. NOTE: LightGBM only applies row subsampling
# (`subsample`, a.k.a. bagging_fraction) when `subsample_freq` > 0, so it is
# set explicitly; without it `subsample=0.9` is silently ignored.
lgb_model = lgb.LGBMRegressor(n_estimators=650, learning_rate=0.045, num_leaves=60,
                              subsample=0.9, subsample_freq=1, colsample_bytree=0.9,
                              reg_alpha=0.2, reg_lambda=0.4, random_state=RND)

# Preprocessing + model in one pipeline so the saved artefact accepts raw rows.
pipeline_lgb = Pipeline([("prep", preprocess), ("model", lgb_model)])
pipeline_lgb.fit(X_train, y_train)
y_pred_lgb = pipeline_lgb.predict(X_test)

metrics_lgb = regression_metrics(y_test, y_pred_lgb, "LGB_")
# A bare expression is only displayed when it is the last statement of a
# notebook cell; print so the scores are visible wherever this code runs.
print(metrics_lgb)

plot_pred_vs_actual(y_test, y_pred_lgb, "LightGBM: Pred vs Actual", "lgb_pred_vs_actual")
plot_residuals(y_test, y_pred_lgb, "LightGBM Residuals", "lgb_residuals")
classification_report_for_costs(y_test, y_pred_lgb)

# Rebuild the post-encoding feature names (one-hot columns first, then the
# passthrough numeric column) to label the importances. The encoder is looked
# up by its step name rather than by position.
ohe_l = pipeline_lgb.named_steps["prep"].named_transformers_["cat"]
features = list(ohe_l.get_feature_names_out(categorical)) + numeric
imp = pipeline_lgb.named_steps["model"].feature_importances_
fi = pd.DataFrame({"feature": features, "importance": imp}).sort_values("importance", ascending=False)
plot_feature_importance(fi, "LightGBM Feature Importance", "lgb_feature_importance")

# Persist the whole fitted pipeline (preprocessing + model).
joblib.dump(pipeline_lgb, os.path.join(WORKDIR, "models", "stage5_cost_model_lgbm.pkl"))

from sklearn.base import clone

# XGBoost regressor with histogram-based tree construction.
xgb_model = xgb.XGBRegressor(n_estimators=600, learning_rate=0.05, max_depth=7,
                             subsample=0.9, colsample_bytree=0.9, reg_lambda=1.0,
                             reg_alpha=0.3, objective="reg:squarederror",
                             tree_method="hist", random_state=RND)

# Use an unfitted copy of the preprocessor: Pipeline.fit refits its steps in
# place, so sharing the `preprocess` instance would mean both pipelines hold
# (and pickle) the very same transformer object.
pipeline_xgb = Pipeline([("prep", clone(preprocess)), ("model", xgb_model)])
pipeline_xgb.fit(X_train, y_train)
y_pred_xgb = pipeline_xgb.predict(X_test)

metrics_xgb = regression_metrics(y_test, y_pred_xgb, "XGB_")
# A bare expression is only displayed when it is the last statement of a
# notebook cell; print so the scores are visible wherever this code runs.
print(metrics_xgb)

plot_pred_vs_actual(y_test, y_pred_xgb, "XGBoost: Pred vs Actual", "xgb_pred_vs_actual")
plot_residuals(y_test, y_pred_xgb, "XGBoost Residuals", "xgb_residuals")
classification_report_for_costs(y_test, y_pred_xgb)

# Rebuild the post-encoding feature names (one-hot columns first, then the
# passthrough numeric column) to label the importances. The encoder is looked
# up by its step name rather than by position.
ohe_x = pipeline_xgb.named_steps["prep"].named_transformers_["cat"]
features_x = list(ohe_x.get_feature_names_out(categorical)) + numeric
imp_x = pipeline_xgb.named_steps["model"].feature_importances_
fi_x = pd.DataFrame({"feature": features_x, "importance": imp_x}).sort_values("importance", ascending=False)
plot_feature_importance(fi_x, "XGBoost Feature Importance", "xgb_feature_importance")

# Persist the whole fitted pipeline (preprocessing + model).
joblib.dump(pipeline_xgb, os.path.join(WORKDIR, "models", "stage5_cost_model_xgb.pkl"))

# Side-by-side summary of the two models: one row per model, one column per
# metric, read back from the prefixed metric dictionaries.
compare = pd.DataFrame([
    {
        "model": label,
        "MAE": scores[f"{tag}MAE"],
        "RMSE": scores[f"{tag}RMSE"],
        "R2": scores[f"{tag}R2"],
        "MAPE%": scores[f"{tag}MAPE%"],
    }
    for label, tag, scores in (
        ("LightGBM", "LGB_", metrics_lgb),
        ("XGBoost", "XGB_", metrics_xgb),
    )
])
compare


Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m28.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0
Note: you may need to restart the kernel to use updated packages.


OSError: dlopen(/Users/harshilarora/Documents/Car-Damage-Detection-Model/detectron2-env/lib/python3.10/site-packages/lightgbm/lib/lib_lightgbm.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib
  Referenced from: <D44045CD-B874-3A27-9A61-F131D99AACE4> /Users/harshilarora/Documents/Car-Damage-Detection-Model/detectron2-env/lib/python3.10/site-packages/lightgbm/lib/lib_lightgbm.dylib
  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/local/lib/libomp/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/local/lib/libomp/libomp.dylib' (no such file), '/opt/homebrew/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/lib/libomp.dylib' (no such file)