In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output short, but it also hides deprecation notices raised by the libraries
# used below.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [None]:
# Location of the raw sales export (Colab's default upload directory).
DATA_PATH = r'/content/Ecommerce_Sales_Data_2024_2025.csv'

# Load the raw data; every later cell works on `df`.
df = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first rows to check that the CSV parsed as expected.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# List the column labels.
df.columns

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the columns.
df.describe()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Remove the two identifier columns; they are not used as model features.
# Rebinding `df` (instead of inplace=True) avoids mutating a frame that earlier
# cells have already displayed, and `columns=` states the axis explicitly.
df = df.drop(columns=['Customer Name', 'Order ID'])

In [None]:
# Confirm that 'Customer Name' and 'Order ID' are no longer in the frame.
df.head()

In [None]:
# Continuous columns that get distribution and box plots below.
numeric_columns = [
    'Unit Price',
    'Discount',
    'Sales',
    'Profit',
]

In [None]:
# Distribution of each numeric column: a 30-bin histogram with a KDE overlay,
# drawn on its own 8x6 figure.
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)


In [None]:
# Number of orders per category; labels are rotated so they do not overlap.
sns.countplot(data=df, x='Category')
plt.xticks(rotation=90)
plt.tight_layout()

In [None]:
# Number of orders per sub-category.
sns.countplot(x=df['Sub-Category'])
plt.xticks(rotation=90)
# tight_layout() has to run after the labels are rotated so that the layout
# reserves room for them (same order as the Category plot). Ending the cell on
# it also stops the notebook from echoing the return value of plt.xticks().
plt.tight_layout()

In [None]:
# Number of orders per region.
sns.countplot(data=df, x='Region')


In [None]:
# One horizontal box plot per numeric column, to spot outliers.
for col in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=df, x=col, ax=ax)

In [None]:
# Pairwise correlation between the numeric columns, annotated with the values.
corr_matrix = df.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)



In [None]:
# Look at the frame again before the categorical columns are encoded.
df.head()

In [None]:
# Number of rows for each distinct value of 'Region'.
df['Region'].value_counts()

In [None]:
# Columns holding category labels; each is replaced by indicator columns.
categorical_cols = [
    'Region',
    'City',
    'Category',
    'Sub-Category',
    'Payment Mode',
]

# drop_first=True omits the first level of every column, so k levels become
# k-1 indicator columns.
df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


In [None]:
# Inspect the frame after one-hot encoding.
df.head()

In [None]:
# Parse the order date once; all calendar features below are derived from it.
order_dates = pd.to_datetime(df['Order Date'])

# Plain calendar components.
df['Year'] = order_dates.dt.year
df['Month'] = order_dates.dt.month
df['Day'] = order_dates.dt.day
df['Weekday'] = order_dates.dt.day_name()
df['Quarter'] = order_dates.dt.quarter
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
df['Is_Weekend'] = order_dates.dt.dayofweek >= 5

# Weekday name and weekend flag become indicator columns (first level dropped).
df = pd.get_dummies(df, columns=['Weekday', 'Is_Weekend'], drop_first=True)

# Cyclical encoding: month is mapped onto a 12-step circle and day-of-month
# onto a 31-step circle, so the end of a cycle sits next to its start.
month_angle = 2 * np.pi * df['Month'] / 12
day_angle = 2 * np.pi * df['Day'] / 31
df['Month_sin'] = np.sin(month_angle)
df['Month_cos'] = np.cos(month_angle)
df['Day_sin'] = np.sin(day_angle)
df['Day_cos'] = np.cos(day_angle)

# The raw date column is no longer needed once the features exist.
df = df.drop(columns=['Order Date'])


In [None]:
# Inspect the frame after adding the date-derived columns.
df.head()

In [None]:

# Convert every boolean column to integers (True -> 1, False -> 0).
bool_cols = df.select_dtypes(include='bool').columns
df = df.astype({name: int for name in bool_cols})


In [None]:
# Per-product aggregate features, computed from TRAINING rows only.
#
# 'Sales' is the prediction target. Averaging it over the whole frame would
# put information from the future test rows into a feature (target leakage)
# and make the test score look better than it really is.
#
# train_test_split derives its shuffle from the number of rows and
# random_state only, so splitting row positions here with the same test_size
# and random_state as the later X / y split selects exactly the same training
# rows. Keep the two calls in sync.
train_pos, _ = train_test_split(
    np.arange(len(df)), test_size=0.20, random_state=42)
train_rows = df.iloc[train_pos]

train_by_product = train_rows.groupby('Product Name')
avg_sales = train_by_product['Sales'].mean()
avg_profit = train_by_product['Profit'].mean()
order_count = train_by_product['Quantity'].count()

# Map the training-set statistics onto every row. A product that never occurs
# in the training rows has no statistic, so it falls back to the overall
# training mean (and an order count of 0).
# NOTE(review): each training row's own value still contributes to its
# product mean; use out-of-fold encoding if that in-sample optimism matters.
product_names = df['Product Name']
df['Product_Avg_Sales'] = (
    product_names.map(avg_sales).fillna(train_rows['Sales'].mean()))
df['Product_Avg_Profit'] = (
    product_names.map(avg_profit).fillna(train_rows['Profit'].mean()))
df['Product_Order_Count'] = (
    product_names.map(order_count).fillna(0).astype(int))


In [None]:
# The product name has been summarised by the aggregate features above, so the
# raw text column is removed.
df = df.drop(columns='Product Name')


In [None]:
# Last look at the feature table before it is split into X and y.
df.head()

In [None]:
# Target is 'Sales'; every other column is a feature.
y = df['Sales']
X = df.drop(columns='Sales')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
split_parts = train_test_split(X, y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [None]:
!pip install optuna

In [None]:
!pip install catboost

In [None]:
# -------------------------------

import optuna
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Optional CatBoost
# CatBoost is optional: the search simply leaves it out when it is missing.
# Only ImportError is caught, so an unrelated failure (or a KeyboardInterrupt)
# is not silently reported as "not installed".
try:
    import catboost as cb
    CATBOOST_INSTALLED = True
except ImportError:
    CATBOOST_INSTALLED = False
    print("CatBoost not installed, skipping it.")

# -------------------------------

def objective(trial):
    """Optuna objective: sample a regressor and its settings, return validation RMSE.

    Each trial draws one model family (DecisionTree, RandomForest, XGBoost,
    LightGBM, plus CatBoost when CATBOOST_INSTALLED is true) together with that
    family's hyperparameters. The candidate is fitted on one part of
    X_train / y_train and scored on a validation part split off from the same
    training data. X_test / y_test are deliberately not touched here: selecting
    the model on the test set would make the final test score optimistic.

    Args:
        trial: the Optuna trial used to draw the suggestions.

    Returns:
        float: RMSE of the candidate on the validation part (lower is better).
    """
    # Available models
    models = ["DecisionTree", "RandomForest", "XGBoost", "LightGBM"]
    if CATBOOST_INSTALLED:
        models.append("CatBoost")

    model_name = trial.suggest_categorical("model", models)

    # Parameter names carry a per-model prefix so each family has its own
    # search space.
    if model_name == "DecisionTree":
        model = DecisionTreeRegressor(
            max_depth=trial.suggest_int("dt_max_depth", 3, 20),
            min_samples_split=trial.suggest_int("dt_min_samples_split", 2, 20),
            random_state=42)

    elif model_name == "RandomForest":
        model = RandomForestRegressor(
            n_estimators=trial.suggest_int("rf_n_estimators", 50, 500),
            max_depth=trial.suggest_int("rf_max_depth", 3, 20),
            min_samples_split=trial.suggest_int("rf_min_samples_split", 2, 20),
            n_jobs=-1,
            random_state=42)

    elif model_name == "XGBoost":
        model = xgb.XGBRegressor(
            n_estimators=trial.suggest_int("xgb_n_estimators", 50, 500),
            max_depth=trial.suggest_int("xgb_max_depth", 3, 15),
            learning_rate=trial.suggest_float("xgb_lr", 0.01, 0.3),
            subsample=trial.suggest_float("xgb_subsample", 0.5, 1.0),
            n_jobs=-1,
            random_state=42,
            tree_method='hist')

    elif model_name == "LightGBM":
        model = lgb.LGBMRegressor(
            n_estimators=trial.suggest_int("lgb_n_estimators", 50, 500),
            max_depth=trial.suggest_int("lgb_max_depth", 3, 20),
            learning_rate=trial.suggest_float("lgb_lr", 0.01, 0.3),
            num_leaves=trial.suggest_int("lgb_num_leaves", 20, 150),
            n_jobs=-1,
            random_state=42)

    else:  # CatBoost
        model = cb.CatBoostRegressor(
            iterations=trial.suggest_int("cb_n_estimators", 50, 500),
            depth=trial.suggest_int("cb_max_depth", 3, 10),
            learning_rate=trial.suggest_float("cb_lr", 0.01, 0.3),
            silent=True,
            random_state=42)

    # -------------------------------
    # Train & evaluate
    # The fixed random_state gives every trial the same fit/validation rows,
    # so trial scores are comparable with each other.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.20, random_state=42)
    model.fit(X_fit, y_fit)
    preds = model.predict(X_val)
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    return rmse

# -------------------------------

# Seed the sampler so the search proposes the same sequence of trials on every
# run; without a seed the "best" model and parameters change between runs.
# NOTE(review): models trained with n_jobs=-1 may still show small run-to-run
# differences in their scores.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=300, show_progress_bar=True)

# -------------------------------

print("Best RMSE:", study.best_value)
print("Best model & hyperparameters:", study.best_trial.params)


In [None]:
from catboost import CatBoostRegressor

# -------------------------------
# Best hyperparameters
# Hyperparameters for the final CatBoost model (hard-coded values).
# NOTE(review): presumably copied from an earlier search run - confirm they
# still match the study output.
best_params = dict(
    iterations=496,
    depth=6,
    learning_rate=0.1327407810411538,
    random_state=42,
)

# Fit on the training split and report the error on the held-out test split.
model = CatBoostRegressor(**best_params)
model.fit(X_train, y_train)
preds = model.predict(X_test)
test_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(test_mse)
print("RMSE:", rmse)

In [None]:
# Metrics used to evaluate the fitted model.
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on the held-out test split (swap in another test set if one is
# available).
preds = model.predict(X_test)

# RMSE is in the units of the target; R2 is the share of variance explained.
test_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_test, preds)

print("RMSE:", rmse)
print("R2 Score:", r2)


In [None]:
# Clear this cell's output when running on Google Colab (removes widget
# metadata traces). google.colab only exists on Colab, so the import is
# guarded: elsewhere the cell is a no-op instead of ending "Run All" with an
# ImportError.
try:
    from google.colab import output
except ImportError:
    output = None

if output is not None:
    output.clear()  # removes widget metadata traces
