In [74]:
import warnings
warnings.filterwarnings("ignore")

from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import f_regression, chi2
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMRegressor


In [75]:
# Paths (Notebook-safe)
BASE_DIR = Path.cwd()
DATA_DIR = BASE_DIR / "data"
MODEL_DIR = BASE_DIR / "models"
MODEL_DIR.mkdir(exist_ok=True)

MODEL_OUT = MODEL_DIR / "lgbm_pipeline.joblib"

RANDOM_SEED = 42
TEST_SIZE = 0.2
VAL_SIZE = 0.25


In [76]:
df = pd.read_csv("data.csv")
df.head()


Unnamed: 0,user_id,age,monthly_income,account_age_days,gender,default_payment_method,city,total_sessions,num_product_page_visits,num_cart_visits,...,total_spent,avg_purchase_value,avg_discount_used,impulse_purchase_ratio,past_impulse_purchases,avg_minutes_to_purchase,stress_level,mood_last_week,saving_habit_score,impulse_buy_score
0,1,26,34371.207904,842,Female,UPI,Mumbai,101,36,11,...,8205.970287,2735.323429,13.333333,0.0,0.0,105.333333,5.205191,Sad,3,31.29
1,2,53,15657.516054,985,Male,UPI,Delhi,89,41,9,...,0.0,0.0,0.0,0.0,0.0,0.0,5.892735,Neutral,5,26.97
2,3,23,29061.244738,556,Female,Card,Bengaluru,93,54,9,...,774.538968,774.538968,50.0,0.0,0.0,256.0,4.491991,Neutral,5,28.48
3,4,41,66152.011773,1879,Male,Card,Bengaluru,103,43,6,...,0.0,0.0,0.0,0.0,0.0,0.0,3.486099,Neutral,3,18.49
4,5,43,52571.78723,172,Male,UPI,Kolkata,99,40,12,...,0.0,0.0,0.0,0.0,0.0,0.0,6.550978,Anxious,5,26.74


In [77]:
MOOD_MAPPING = {
    "Happy": 1,
    "Neutral": 2,
    "Anxious": 3,
    "Sad": 4
}

df["mood_last_week"] = df["mood_last_week"].map(MOOD_MAPPING)


In [78]:
target = "impulse_buy_score"

X = df.drop(columns=[target, "user_id"], errors="ignore")
y = df[target]

X_num = X.select_dtypes(include="number")

f_scores, p_values = f_regression(X_num, y)

anova_df = pd.DataFrame({
    "Feature": X_num.columns,
    "F_score": f_scores,
    "p_value": p_values
}).sort_values("p_value")

anova_df


Unnamed: 0,Feature,F_score,p_value
12,avg_discount_used,16475.258367,0.0
13,impulse_purchase_ratio,220381.212946,0.0
15,avg_minutes_to_purchase,3808.867974,0.0
14,past_impulse_purchases,50290.642851,0.0
9,total_purchases,3334.503447,0.0
8,late_night_session_ratio,5366.121735,0.0
16,stress_level,5169.239306,0.0
1,monthly_income,724.024386,2.410859e-158
11,avg_purchase_value,287.31772,2.8895739999999997e-64
10,total_spent,188.202038,9.382849000000001e-43


In [79]:
significant_numeric_features = anova_df[anova_df["p_value"] < 0.05]["Feature"].tolist()
significant_numeric_features


['avg_discount_used',
 'impulse_purchase_ratio',
 'avg_minutes_to_purchase',
 'past_impulse_purchases',
 'total_purchases',
 'late_night_session_ratio',
 'stress_level',
 'monthly_income',
 'avg_purchase_value',
 'total_spent',
 'saving_habit_score',
 'mood_last_week',
 'num_product_page_visits',
 'num_checkout_visits']

In [80]:
df["impulse_bin"] = pd.qcut(df[target], q=3, labels=[0, 1, 2])

le = LabelEncoder()
city_encoded = le.fit_transform(df["city"])

chi2_score, p_val = chi2(city_encoded.reshape(-1, 1), df["impulse_bin"])

print("Chi2:", chi2_score[0])
print("p-value:", p_val[0])


Chi2: 2.764252266119196
p-value: 0.251044231801337


In [81]:
numeric_features = significant_numeric_features

categorical_features = [
    "gender",
    "default_payment_method",
    "device_preference"
]


In [82]:
def build_pipeline(numeric_features, categorical_features):

    num_transformer = Pipeline([
        ("scaler", StandardScaler())
    ])

    cat_transformer = Pipeline([
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

    preprocessor = ColumnTransformer(
        [
            ("num", num_transformer, numeric_features),
            ("cat", cat_transformer, categorical_features)
        ],
        remainder="drop"
    )

    return preprocessor


In [83]:
X = df[numeric_features + categorical_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=VAL_SIZE, random_state=RANDOM_SEED
)

print("Train:", len(X_train))
print("Val  :", len(X_val))
print("Test :", len(X_test))


Train: 30000
Val  : 10000
Test : 10000


In [84]:
lgbm = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=20,
    reg_alpha=0.01,
    reg_lambda=0.01,
    random_state=RANDOM_SEED,
    verbosity=-1
)

pipeline = Pipeline([
    ("preprocessor", build_pipeline(numeric_features, categorical_features)),
    ("model", lgbm)
])


In [85]:
pipeline.fit(X_train, y_train)


In [86]:
def evaluate(y_true, y_pred, name="Set"):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true != 0, y_true, 1))) * 100

    print(f"\n{name} Performance")
    print(f"R2   : {r2:.4f}")
    print(f"RMSE : {rmse:.4f}")
    print(f"MAE  : {mae:.4f}")
    print(f"MAPE : {mape:.2f}%")

    return r2, rmse, mae, mape


In [87]:
evaluate(y_train, pipeline.predict(X_train), "Train")
evaluate(y_val, pipeline.predict(X_val), "Validation")
evaluate(y_test, pipeline.predict(X_test), "Test")



Train Performance
R2   : 0.9685
RMSE : 2.9005
MAE  : 2.3137
MAPE : 8.63%

Validation Performance
R2   : 0.9637
RMSE : 3.1293
MAE  : 2.4978
MAPE : 9.29%

Test Performance
R2   : 0.9621
RMSE : 3.1560
MAE  : 2.5091
MAPE : 9.37%


(0.9620803880003388,
 np.float64(3.1559815070337485),
 2.509109957359595,
 np.float64(9.370953291688489))

In [None]:
# joblib.dump(pipeline, MODEL_OUT)
# print("✅ Model saved at:", MODEL_OUT.resolve())


✅ Model saved at: C:\Users\Pegasus\Desktop\model_prac\models\lgbm_pipeline.joblib
