In [2]:
import pandas as pd

# Load original dataset
df = pd.read_csv("retail_sales.csv")

# Filter out invalid rows (Units Sold must be > 0)
df = df[df["Units_Sold"] > 0]

# Feature Engineering
df["Price_Gap"] = df["Price_INR"] - df["Competitor_Pricing_INR"]
df["Price_Per_Unit"] = df["Revenue_INR"] / df["Units_Sold"]
df["Discount_Effectiveness"] = df["Units_Sold"] / (df["Discount_%"] + 1)

# Select only relevant columns for pricing model
pricing_df = df[[
    "Product_ID", "Brand", "Category", "Region",
    "Price_INR", "Discount_%", "Competitor_Pricing_INR",
    "Price_Gap", "Price_Per_Unit", "Discount_Effectiveness",
    "Holiday_Promotion", "Weather_Condition", "Customer_Type",
    "Loyalty_Score", "Inventory_Level", "Units_Sold"  # Target
]]

# Save to CSV
pricing_df.to_csv("cleaned_pricing_dataset.csv", index=False)
print("✅ Saved as 'cleaned_pricing_dataset.csv'")


✅ Saved as 'cleaned_pricing_dataset.csv'


In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load cleaned dataset
df = pd.read_csv("cleaned_pricing_dataset.csv")

# Encode categorical variables
categorical_cols = ["Product_ID", "Brand", "Category", "Region",
                    "Holiday_Promotion", "Weather_Condition", "Customer_Type"]

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Features & Target
X = df.drop("Units_Sold", axis=1)
y = df["Units_Sold"]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to DMatrix for older XGBoost support
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Parameters
params = {
    "objective": "reg:squarederror",
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "rmse"
}

# Train model
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=[(dtest, "eval")],
    early_stopping_rounds=20,
    verbose_eval=50
)

# Predict
y_pred = model.predict(dtest)

# Evaluation
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("📊 MAE:", mae)
print("📉 RMSE:", rmse)


[0]	eval-rmse:8.94878
[50]	eval-rmse:1.21113
[100]	eval-rmse:0.57962
[150]	eval-rmse:0.52859
[200]	eval-rmse:0.51638
[250]	eval-rmse:0.50931
[289]	eval-rmse:0.50627
📊 MAE: 0.24222524464130402
📉 RMSE: 0.5062692520813676


In [4]:
def simulate_optimal_price_discount(row, model, price_range, discount_range, feature_names):
    best_revenue = -np.inf
    best_combo = (row["Price_INR"], row["Discount_%"])

    for price in price_range:
        for discount in discount_range:
            temp = row.copy()
            temp["Price_INR"] = price
            temp["Discount_%"] = discount
            temp["Price_Gap"] = price - temp["Competitor_Pricing_INR"]
            temp["Price_Per_Unit"] = price
            temp["Discount_Effectiveness"] = 1 / (discount + 1)

            temp = temp[feature_names].values.reshape(1, -1)
            temp_dmatrix = xgb.DMatrix(temp, feature_names=feature_names)
            units_pred = model.predict(temp_dmatrix)[0]
            revenue = price * units_pred

            if revenue > best_revenue:
                best_revenue = revenue
                best_combo = (price, discount)

    return best_combo[0], best_combo[1], best_revenue


In [5]:
row = df.iloc[0]  # Pick any row
feature_names = list(X.columns)  # Same as model training

best_price, best_discount, max_revenue = simulate_optimal_price_discount(
    row, model,
    price_range=np.arange(10, 200, 5),
    discount_range=np.arange(0, 30, 2),
    feature_names=feature_names
)

print("✅ Optimal Price:", best_price)
print("💸 Optimal Discount:", best_discount)
print("📈 Expected Max Revenue:", max_revenue)


✅ Optimal Price: 185
💸 Optimal Discount: 16
📈 Expected Max Revenue: 540.3726065158844


In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
from joblib import dump

# 📥 Load dataset
df = pd.read_csv("cleaned_pricing_dataset.csv")

# 🧠 Encode categorical columns
cat_cols = ["Product_ID", "Brand", "Category", "Region", "Holiday_Promotion", "Weather_Condition", "Customer_Type"]
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# ⚙️ Feature engineering
df["Price_Gap"] = df["Price_INR"] - df["Competitor_Pricing_INR"]
df["Price_Per_Unit"] = df["Price_INR"]
df["Discount_Effectiveness"] = 1 / (df["Discount_%"] + 1)
df["Price_Discount_Interaction"] = df["Price_INR"] * (1 - df["Discount_%"] / 100)

# 🔍 Optional: Keep top-selling products only
top_products = df.groupby("Product_ID")["Units_Sold"].sum().sort_values(ascending=False).head(20).index
df = df[df["Product_ID"].isin(top_products)]

# 🧾 Feature selection
features = [
    'Product_ID', 'Brand', 'Category', 'Region', 'Price_INR', 'Discount_%',
    'Competitor_Pricing_INR', 'Price_Gap', 'Price_Per_Unit',
    'Discount_Effectiveness', 'Holiday_Promotion', 'Weather_Condition',
    'Customer_Type', 'Loyalty_Score', 'Inventory_Level', 'Price_Discount_Interaction'
]
X = df[features]
y = np.log1p(df["Units_Sold"])  # log-transform target for stability

# 🧪 Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 🧠 Define model
model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=500
)

# 🚀 Train model (without early_stopping)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=50)

# 📈 Evaluate
y_pred = np.expm1(model.predict(X_test))  # reverse log1p
y_true = np.expm1(y_test)

print("\n📊 MAE:", mean_absolute_error(y_true, y_pred))
print("📉 RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))

# 💾 Save model and encoders
dump(model, "xgb_pricing_model.joblib")
dump(label_encoders, "label_encoders.pkl")
print("✅ Model and encoders saved.")


[0]	validation_0-rmse:0.65573
[50]	validation_0-rmse:0.65957
[100]	validation_0-rmse:0.66398
[150]	validation_0-rmse:0.66647
[200]	validation_0-rmse:0.66940
[250]	validation_0-rmse:0.67346
[300]	validation_0-rmse:0.67682
[350]	validation_0-rmse:0.67906
[400]	validation_0-rmse:0.68244
[450]	validation_0-rmse:0.68480
[499]	validation_0-rmse:0.68694

📊 MAE: 8.596517083965814
📉 RMSE: 10.760037191782862
✅ Model and encoders saved.


In [None]:
import pandas as pd
import numpy as np
from joblib import load
from tqdm import tqdm

# Load model and encoders
model = load("xgb_pricing_model.joblib")
label_encoders = load("label_encoders.pkl")

# Load and encode
df = pd.read_csv("cleaned_pricing_dataset.csv")
cat_cols = ["Product_ID", "Brand", "Category", "Region", "Holiday_Promotion", "Weather_Condition", "Customer_Type"]
for col in cat_cols:
    df[col] = label_encoders[col].transform(df[col])

# Feature engineering
df["Price_Gap"] = df["Price_INR"] - df["Competitor_Pricing_INR"]
df["Price_Per_Unit"] = df["Price_INR"]
df["Discount_Effectiveness"] = 1 / (df["Discount_%"] + 1)
df["Price_Discount_Interaction"] = df["Price_INR"] * (1 - df["Discount_%"] / 100)

features = [
    'Product_ID', 'Brand', 'Category', 'Region', 'Price_INR', 'Discount_%',
    'Competitor_Pricing_INR', 'Price_Gap', 'Price_Per_Unit',
    'Discount_Effectiveness', 'Holiday_Promotion', 'Weather_Condition',
    'Customer_Type', 'Loyalty_Score', 'Inventory_Level', 'Price_Discount_Interaction'
]

# Price & discount grid
price_grid = np.arange(10, 200, 5)
discount_grid = np.arange(0, 30, 2)

results = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    grid = pd.DataFrame([(p, d) for p in price_grid for d in discount_grid], columns=["Price_INR", "Discount_%"])
    base = pd.DataFrame([row] * len(grid)).reset_index(drop=True)
    base["Price_INR"] = grid["Price_INR"]
    base["Discount_%"] = grid["Discount_%"]
    base["Price_Gap"] = base["Price_INR"] - base["Competitor_Pricing_INR"]
    base["Price_Per_Unit"] = base["Price_INR"]
    base["Discount_Effectiveness"] = 1 / (base["Discount_%"] + 1)
    base["Price_Discount_Interaction"] = base["Price_INR"] * (1 - base["Discount_%"] / 100)

    preds = model.predict(base[features])
    units_pred = np.expm1(preds)
    revenue = base["Price_INR"] * units_pred

    best = revenue.idxmax()
    results.append({
        "Product_ID": int(row["Product_ID"]),
        "Brand": int(row["Brand"]),
        "Category": int(row["Category"]),
        "Region": int(row["Region"]),
        "Optimal_Price": round(base.loc[best, "Price_INR"], 2),
        "Optimal_Discount": round(base.loc[best, "Discount_%"], 2),
        "Predicted_Revenue": round(revenue[best], 2)
    })

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("optimized_dynamic_pricing.csv", index=False)
print("✅ Optimization saved to optimized_dynamic_pricing.csv")


100%|██████████| 100000/100000 [1:03:02<00:00, 26.44it/s] 


✅ Optimization saved to optimized_dynamic_pricing.csv


: 

In [9]:
joblib.dump(label_encoders, "label_encoders.pkl")
print("✅ Label encoders saved as: label_encoders.pkl")
import joblib

✅ Label encoders saved as: label_encoders.pkl
