In [8]:
import os

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# 🔹 Load dataset
# NOTE(review): absolute, Colab-style path — adjust if the notebook runs elsewhere.
df = pd.read_csv("/content/cleaned_data.csv")

# 🔹 Define features and target
# 'item_outlet_sales' is the regression target; every other column is a feature.
X = df.drop(columns=['item_outlet_sales'])
y = df['item_outlet_sales']

# 🔹 Train-test split (80/20; fixed random_state keeps the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🔹 Apply standard scaling
# The scaler is fitted on the training split only and then applied to the test
# split, so test-set statistics never influence the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 🔹 Save the scaler for Flask app
# joblib.dump does not create missing parent directories, so make sure the
# output directory exists first. exist_ok=True keeps this cell safe to re-run
# and lets it work on a fresh kernel without any separate setup step.
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/sc.sav")

# ----------------------------------------
# 🔹 XGBoost: grid-searched regressor
# ----------------------------------------
# Search space: 2 x 2 x 2 x 2 = 16 candidate settings.
param_grid_xgb = dict(
    n_estimators=[100, 300],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5],
    subsample=[0.8, 1],
)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# 3-fold cross-validation on the scaled training data. The scorer is negated
# MSE because GridSearchCV maximises its score.
grid_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_xgb.fit(X_train_scaled, y_train)

# Best candidate, refitted on the whole training split by GridSearchCV.
best_xgb = grid_xgb.best_estimator_
print("✅ Best XGBoost Params:", grid_xgb.best_params_)

# 🔹 Held-out evaluation: RMSE on the scaled test split
y_pred_xgb = best_xgb.predict(X_test_scaled)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
print("XGBoost RMSE:", np.sqrt(xgb_mse))

# 🔹 Persist the tuned model
joblib.dump(best_xgb, "models/xgb_model.pkl")

# ----------------------------------------
# 🔹 LightGBM: grid-searched regressor
# ----------------------------------------
# Search space: 2 x 2 x 2 = 8 candidate settings.
param_grid_lgb = dict(
    num_leaves=[31, 50],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 300],
)
lgb_model = lgb.LGBMRegressor()

# 3-fold cross-validation on the scaled training data. The scorer is negated
# MSE because GridSearchCV maximises its score.
grid_lgb = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid_lgb,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_lgb.fit(X_train_scaled, y_train)

# Best candidate, refitted on the whole training split by GridSearchCV.
best_lgb = grid_lgb.best_estimator_
print("✅ Best LightGBM Params:", grid_lgb.best_params_)

# 🔹 Held-out evaluation: RMSE on the scaled test split
y_pred_lgb = best_lgb.predict(X_test_scaled)
lgb_mse = mean_squared_error(y_test, y_pred_lgb)
print("LightGBM RMSE:", np.sqrt(lgb_mse))

# 🔹 Persist the tuned model
joblib.dump(best_lgb, "models/lgb_model.pkl")

print("🎉 Training complete! Models and scaler saved.")


Fitting 3 folds for each of 16 candidates, totalling 48 fits
✅ Best XGBoost Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
XGBoost RMSE: 1030.4515288043328
Fitting 3 folds for each of 8 candidates, totalling 24 fits




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 783
[LightGBM] [Info] Number of data points in the train set: 6818, number of used features: 9
[LightGBM] [Info] Start training from score 2202.365232
✅ Best LightGBM Params: {'learning_rate': 0.01, 'n_estimators': 300, 'num_leaves': 31}
LightGBM RMSE: 1029.0536208296912
🎉 Training complete! Models and scaler saved.




In [7]:
!mkdir models