In [1]:
import os

os.chdir("../")

%pwd

'c:\\Arjun_Works\\SalesNexus'

In [2]:
import pandas as pd

train_df = pd.read_csv("train_final.csv")
test_df = pd.read_csv("test_final.csv")

In [3]:
train_df["transactions"] = train_df["transactions"].fillna(0)  
test_df["transactions"] = test_df["transactions"].fillna(0)

In [4]:
train_df.columns, test_df.columns

(Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion', 'city',
        'state', 'type_x', 'cluster', 'transactions', 'dcoilwtico', 'type_y'],
       dtype='object'),
 Index(['id', 'date', 'store_nbr', 'family', 'onpromotion', 'city', 'state',
        'type_x', 'cluster', 'transactions', 'dcoilwtico', 'type_y'],
       dtype='object'))

In [5]:
train_df.shape, test_df.shape

((3013362, 13), (28512, 12))

In [6]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor


# Fill missing values
train_df["transactions"] = train_df["transactions"].fillna(0)
test_df["transactions"] = test_df["transactions"].fillna(0)
train_df["type_y"] = train_df["type_y"].fillna("Regular Day")
test_df["type_y"] = test_df["type_y"].fillna("Regular Day")
train_df["dcoilwtico"] = train_df["dcoilwtico"].bfill()
test_df["dcoilwtico"] = test_df["dcoilwtico"].bfill()

# Parse date and extract features
for df in [train_df, test_df]:
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["day_of_week"] = df["date"].dt.dayofweek
    df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
    df["day_of_year"] = df["date"].dt.dayofyear
    df["is_month_start"] = df["date"].dt.is_month_start.astype(int)
    df["is_month_end"] = df["date"].dt.is_month_end.astype(int)

# Interactions
train_df["onpromotion_trend"] = train_df["onpromotion"] * train_df["day_of_year"]
test_df["onpromotion_trend"] = test_df["onpromotion"] * test_df["day_of_year"]
train_df["month_sales_interaction"] = train_df["month"] * train_df["sales"]

# One-hot encode
cat_cols = ["family", "state", "city", "type_x", "type_y"]
train_df = pd.get_dummies(train_df, columns=cat_cols, drop_first=True, dtype=int)
test_df = pd.get_dummies(test_df, columns=cat_cols, drop_first=True, dtype=int)

# Align columns
common_cols = set(train_df.columns) & set(test_df.columns)
train_features = train_df[list(common_cols) + ["sales"]]
test_features = test_df[list(common_cols)]

# Scale selected features
scale_cols = ["onpromotion", "transactions", "dcoilwtico", "onpromotion_trend"]
scaler = MinMaxScaler()
train_features[scale_cols] = scaler.fit_transform(train_features[scale_cols])
test_features[scale_cols] = scaler.transform(test_features[scale_cols])

# Drop non-numeric columns
train_features = train_features.select_dtypes(include=[np.number])
test_features = test_features.select_dtypes(include=[np.number])

# Train model
X_train = train_features.drop("sales", axis=1)
y_train = train_features["sales"]
# {'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.6}

model = XGBRegressor(n_estimators=500, 
                     learning_rate=0.05, 
                     max_depth=7, 
                     subsample=0.8, 
                     colsample_bytree=0.6, 
                     reg_alpha=0.1, 
                     reg_lambda=2, 
                     gamma=0, 
                     random_state=42)

model.fit(X_train, y_train)

# Evaluate on train (no val split here)
y_pred = model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred))
mae = mean_absolute_error(y_train, y_pred)
r2 = r2_score(y_train, y_pred)

# 👇 RMSLE Evaluation
rmsle = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred)) ** 2))

print("✅ Training Evaluation:")
print(f"RMSE: {rmse:.4f}")
print(f"MAE : {mae:.4f}")
print(f"R2  : {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_features[scale_cols] = scaler.fit_transform(train_features[scale_cols])


✅ Training Evaluation:
RMSE: 209.7993
MAE : 60.9279
R2  : 0.9640
RMSLE: 1.5052


  rmsle = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred)) ** 2))


In [11]:
from catboost import CatBoostRegressor

# Train CatBoost model with similar hyperparameters
catboost_model = CatBoostRegressor(
  iterations=1000,
  learning_rate=0.05,
  depth=7,
  subsample=0.8,
  colsample_bylevel=0.6,
  reg_lambda=2,
  random_seed=42,
  verbose=0
)

catboost_model.fit(X_train, y_train)

# Evaluate on train set
y_pred_cb = catboost_model.predict(X_train)
rmse_cb = np.sqrt(mean_squared_error(y_train, y_pred_cb))
mae_cb = mean_absolute_error(y_train, y_pred_cb)
r2_cb = r2_score(y_train, y_pred_cb)
rmsle_cb = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred_cb)) ** 2))

print("✅ CatBoost Training Evaluation:")
print(f"RMSE: {rmse_cb:.4f}")
print(f"MAE : {mae_cb:.4f}")
print(f"R2  : {r2_cb:.4f}")
print(f"RMSLE: {rmsle_cb:.4f}")

✅ CatBoost Training Evaluation:
RMSE: 253.4910
MAE : 72.7220
R2  : 0.9474
RMSLE: 1.6468


  rmsle_cb = np.sqrt(np.mean((np.log1p(y_train) - np.log1p(y_pred_cb)) ** 2))


In [12]:
# Predict on test set
test_df["sales"] = catboost_model.predict(test_features)

# Ensure no negative predictions
test_df["sales"] = test_df["sales"].clip(lower=0)

# Prepare submission
submission = test_df[["id", "sales"]].copy()
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved successfully!")


✅ submission.csv saved successfully!


In [8]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

def get_best_hyperparameters(X, y, n_iter=30, cv=3, random_state=42, verbose=1):
    """Performs hyperparameter tuning using RandomizedSearchCV for XGBRegressor."""
    
    param_dist = {
        "n_estimators": [100, 300, 500, 700],
        "learning_rate": [0.01, 0.05, 0.1, 0.2],
        "max_depth": [3, 5, 7, 10],
        "subsample": [0.6, 0.8, 1.0],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "gamma": [0, 0.1, 0.3, 0.5],
        "reg_alpha": [0, 0.1, 0.5],
        "reg_lambda": [1, 1.5, 2]
    }

    xgb = XGBRegressor(objective="reg:squarederror", random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        n_jobs=-1,
        scoring='neg_root_mean_squared_error'
    )

    print("🔍 Searching for best hyperparameters...")
    random_search.fit(X, y)
    print("✅ Best Hyperparameters Found!")
    print(random_search.best_params_)

    return random_search.best_estimator_, random_search.best_params_


In [9]:
# Call hyperparameter tuning
best_model, best_params = get_best_hyperparameters(X_train, y_train)

# Predict on test features
test_df["sales"] = best_model.predict(test_features).clip(lower=0)
submission = test_df[["id", "sales"]]
submission.to_csv("submission.csv", index=False)


🔍 Searching for best hyperparameters...
Fitting 3 folds for each of 30 candidates, totalling 90 fits


8 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\phoen\anaconda3\envs\chest-cancer-detection\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\phoen\anaconda3\envs\chest-cancer-detection\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\Users\phoen\anaconda3\envs\chest-cancer-detection\lib\site-packages\xgboost\sklearn.py", line 1222, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\phoen\anaconda3\envs\chest-cancer-detection\lib\site-pack

✅ Best Hyperparameters Found!
{'subsample': 0.8, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.05, 'gamma': 0, 'colsample_bytree': 0.6}


ValueError: One of max or min must be given

In [7]:
# Predict on test set
test_df["sales"] = model.predict(test_features)

# Ensure no negative predictions
test_df["sales"] = test_df["sales"].clip(lower=0)

# Prepare submission
submission = test_df[["id", "sales"]].copy()
submission.to_csv("submission.csv", index=False)
print("✅ submission.csv saved successfully!")


✅ submission.csv saved successfully!
