# Modeling

In [5]:
import joblib

In [9]:
data=joblib.load(r"C:\Users\sohaibkhan\Project1\Data\EDA.pkl")

In [2]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score
# from xgboost import XGBRegressor

# # ==============================
# # 1. Load Data
# # ==============================
# data = pd.read_csv("final_ready_for_modeling.csv")

# # Drop rows where price is NaN or negative
# data = data[(data["price"].notnull()) & (data["price"] >= 0)]

# # Convert all columns to numeric
# data = data.apply(pd.to_numeric, errors="coerce")
# data.dropna(inplace=True)

# # Features & Target
# X = data.drop("price", axis=1)
# y = np.log1p(data["price"])  # Safe log-transform now

# # Reduce memory usage
# X = X.astype(np.float32)
# y = y.astype(np.float32)

# # ==============================
# # 2. Train-Test Split
# # ==============================
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # ==============================
# # 3. XGBoost Model
# # ==============================
# xgb = XGBRegressor(
#     objective='reg:squarederror',
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     min_child_weight=1,
#     gamma=0,
#     tree_method="hist",
#     n_jobs=-1,
#     random_state=42
# )

# # ==============================
# # 4. Train Model
# # ==============================
# xgb.fit(X_train, y_train, verbose=50)  # No early stopping for compatibility

# # ==============================
# # 5. Predictions & Evaluation
# # ==============================
# y_train_pred = np.expm1(xgb.predict(X_train))
# y_test_pred = np.expm1(xgb.predict(X_test))

# train_r2 = r2_score(np.expm1(y_train), y_train_pred)
# test_r2 = r2_score(np.expm1(y_test), y_test_pred)
# rmse = np.sqrt(mean_squared_error(np.expm1(y_test), y_test_pred))

# print(f"\n📊 Train R² Score: {train_r2:.4f}")
# print(f"📊 Test R² Score : {test_r2:.4f}")
# print(f"📉 Test RMSE     : {rmse:.4f}")


In [13]:
import re

# ==============================
# 1. Clean column names
# ==============================
X = data.drop('price', axis=1)
y = data['price']

# Replace special JSON characters with underscores
X.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in X.columns]

# X → all features except price.

# y → the target column price.

# re.sub(...) → replaces special characters (like spaces, -, @, #) in column names with _.
# This prevents LightGBM from throwing errors when column names contain invalid characters.






from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from lightgbm import LGBMRegressor

# ==============================
# 2. Train-test split
# ==============================



# Clean special characters
X.columns = [re.sub(r'[^A-Za-z0-9_]+', '_', col) for col in X.columns]

# Remove duplicate column names
X = X.loc[:, ~X.columns.duplicated()]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==============================
# 3. Initialize LightGBM
# ==============================
lgbm = LGBMRegressor(
    objective='regression',
    boosting_type='gbdt',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
# objective='regression' → predicts continuous values (price).

# boosting_type='gbdt' → standard gradient boosting.

# n_estimators=500 → number of trees.

# learning_rate=0.05 → smaller rate means slower learning but often better accuracy.

# max_depth=-1 → no depth limit (trees grow until stopping rules kick in).

# num_leaves=31 → controls complexity of trees.

# subsample=0.8 → uses 80% of rows per tree (reduces overfitting).

# colsample_bytree=0.8 → uses 80% of features per tree.

# n_jobs=-1 → uses all CPU cores for speed.









# ==============================
# 4. Train Model with callbacks
# ==============================
lgbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=[
        lgb.early_stopping(50),
        lgb.log_evaluation(50)
    ]
)
# eval_set → monitors performance on test data while training.

# eval_metric='rmse' → uses Root Mean Squared Error for evaluation.

# early_stopping(50) → stops training if no improvement after 50 rounds (prevents overfitting).

# log_evaluation(50) → prints progress every 50 rounds.










# ==============================
# 5. Predictions & Evaluation
# ==============================
y_train_pred = lgbm.predict(X_train)
y_test_pred = lgbm.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print(f"✅ Train R² Score: {train_r2:.4f}")
print(f"✅ Test R² Score : {test_r2:.4f}")
print(f"📉 Test RMSE     : {test_rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3826
[LightGBM] [Info] Number of data points in the train set: 269624, number of used features: 1627
[LightGBM] [Info] Start training from score 0.000253
Training until validation scores don't improve for 50 rounds
[50]	valid_0's rmse: 0.652437	valid_0's l2: 0.425674
[100]	valid_0's rmse: 0.627406	valid_0's l2: 0.393639
[150]	valid_0's rmse: 0.619814	valid_0's l2: 0.38417
[200]	valid_0's rmse: 0.615897	valid_0's l2: 0.379329
[250]	valid_0's rmse: 0.612928	valid_0's l2: 0.375681
[300]	valid_0's rmse: 0.610765	valid_0's l2: 0.373034
[350]	valid_0's rmse: 0.608932	valid_0's l2: 0.370798
[400]	valid_0's rmse: 0.607363	valid_0's l2: 0.36889
[450]	valid_0's rmse: 0.60595	valid_0's l2: 0.367175
[500]	valid_0's rmse: 0.60481	valid_0's l2: 0.365795
Did not meet early stopping. Best iteration is:
[500]	valid



# Cross Validation For lightGBM

In [15]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

# Define K-Fold (5 folds here, but you can try 10 for more robustness)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Use R² as the scoring metric
cv_scores = cross_val_score(
    lgbm,
    X, y,
    cv=kf,
    scoring='r2',
    n_jobs=-1
)

print("Cross-Validation R² Scores:", cv_scores)
print("Mean R²:", np.mean(cv_scores))
print("Std Dev R²:", np.std(cv_scores))


Cross-Validation R² Scores: [0.62952413        nan        nan        nan 0.62873614]
Mean R²: nan
Std Dev R²: nan


3 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "D:\6 Semester\anacondala\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\6 Semester\anacondala\Lib\site-packages\lightgbm\sklearn.py", line 1398, in fit
    super().fit(
  File "D:\6 Semester\anacondala\Lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
                    ^^^^^^
  File "D:\6 Semester\anacondala\Lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
              ^^^

In [19]:


# Save the trained model
joblib.dump(lgbm, "price_predictor.pkl")
print("✅ Model saved as price_predictor.pkl")



✅ Model saved as price_predictor.pkl


In [25]:
print("Training features:", list(data.columns))

Training features: ['price', 'qty_ordered', 'discount_amount', 'year', 'month', 'year', 'month', 'dayofweek', 'is_weekend', 'quarter', 'discount_per_unit', 'is_discounted', 'status_canceled_True', 'status_closed_True', 'status_cod_True', 'status_complete_True', 'status_exchange_True', 'status_fraud_True', 'status_holded_True', 'status_order_refunded_True', 'status_paid_True', 'status_payment_review_True', 'status_pending_True', 'status_pending_paypal_True', 'status_processing_True', 'status_received_True', 'status_refund_True', 'created_at_2016-07-02_00:00:00_True', 'created_at_2016-07-03_00:00:00_True', 'created_at_2016-07-04_00:00:00_True', 'created_at_2016-07-05_00:00:00_True', 'created_at_2016-07-06_00:00:00_True', 'created_at_2016-07-07_00:00:00_True', 'created_at_2016-07-08_00:00:00_True', 'created_at_2016-07-09_00:00:00_True', 'created_at_2016-07-10_00:00:00_True', 'created_at_2016-07-11_00:00:00_True', 'created_at_2016-07-12_00:00:00_True', 'created_at_2016-07-13_00:00:00_True'