<a href="https://colab.research.google.com/github/RayAKaan/FUTURE_ML_01/blob/main/AI-Sales-Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xgboost scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor



In [None]:
# Load datasets.
# StateHoliday is read explicitly as text. The saved output of this cell shows
# a warning raised from the train.csv read (presumably a mixed-type
# DtypeWarning -- confirm); letting pandas infer the dtype can leave a column
# that mixes the integer 0 with the string '0', which breaks later string
# comparisons such as `StateHoliday != '0'`.
csv_dtypes = {"StateHoliday": str}

train = pd.read_csv("train.csv", dtype=csv_dtypes)
test = pd.read_csv("test.csv", dtype=csv_dtypes)
store = pd.read_csv("store.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Store shape:", store.shape)

# Merge store info: a left join keeps every train/test row even when a store
# has no matching row in store.csv (its store columns are then NaN).
train = train.merge(store, on="Store", how="left")
test = test.merge(store, on="Store", how="left")


  train = pd.read_csv("train.csv")


Train shape: (1017209, 9)
Test shape: (41088, 8)
Store shape: (1115, 10)


In [None]:
# ---------------------------
# 1. Feature engineering
# ---------------------------
# The same calendar / competition / promo features are added in place to both
# frames so that train and test end up with identical columns.
for df in [train, test]:
    df['Date'] = pd.to_datetime(df['Date'])
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    # isocalendar() yields the nullable UInt32 dtype; cast to a plain integer so
    # the feature matrix stays a homogeneous numeric array downstream.
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)
    # Overwrites any existing DayOfWeek column with pandas' 0=Monday..6=Sunday.
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['IsWeekend'] = df['DayOfWeek'].isin([5,6]).astype(int)
    # Compare StateHoliday as text so an integer 0 (possible when the CSV column
    # was parsed with mixed types) is not mistaken for a holiday.
    df['IsHoliday'] = df['SchoolHoliday'] | (df['StateHoliday'].astype(str) != '0')

    # Unknown competition start -> use the row's own date, which makes
    # CompetitionOpen evaluate to 1 for that row.
    df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(df['Year'])
    df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(df['Month'])
    df['Promo2SinceYear'] = df['Promo2SinceYear'].fillna(0)
    df['Promo2SinceWeek'] = df['Promo2SinceWeek'].fillna(0)

    df['CompetitionOpen'] = ((df['Year'] > df['CompetitionOpenSinceYear']) |
                             ((df['Year'] == df['CompetitionOpenSinceYear']) &
                              (df['Month'] >= df['CompetitionOpenSinceMonth']))).astype(int)

    # Promo2 has started when the row is in a later year than the start year,
    # or in the start year at/after the start week. (Requiring the week
    # comparison in every year wrongly marked e.g. week 10 of a later year as
    # inactive for a promo that began in week 40.)
    promo2_started = ((df['Year'] > df['Promo2SinceYear']) |
                      ((df['Year'] == df['Promo2SinceYear']) &
                       (df['WeekOfYear'] >= df['Promo2SinceWeek'])))
    df['Promo2Active'] = ((df['Promo2'] == 1) & promo2_started).astype(int)

# Any remaining missing values (in every column) are replaced with 0.
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

# ---------------------------
# 2. Prepare features
# ---------------------------
# Column order matters: the model is trained and scored on positional arrays,
# so the same list is used to slice both train and test.
features = [
    "Store", "DayOfWeek", "Promo", "SchoolHoliday", "StateHoliday",
    "Year", "Month", "Day", "WeekOfYear", "IsWeekend",
    "StoreType", "Assortment", "CompetitionDistance", "CompetitionOpen",
    "Promo2Active",
]

# Text-valued columns that must be turned into integer codes.
categorical_cols = ["StateHoliday", "StoreType", "Assortment"]

y = train["Sales"]
X = train.loc[:, features].copy()
X_test = test.loc[:, features].copy()

# Label encode categorical columns: each encoder is fitted on the training
# values only and then applied to both frames, so codes agree between them.
# A category that appears only in the test set makes transform() raise.
for col in categorical_cols:
    train_labels = X[col].astype(str)
    test_labels = X_test[col].astype(str)
    le = LabelEncoder().fit(train_labels)
    X[col] = le.transform(train_labels).astype(int)
    X_test[col] = le.transform(test_labels).astype(int)

# ---------------------------
# 3. Train XGBoost
# ---------------------------
# Plain NumPy arrays are used (no feature names), so train and test matrices
# are matched purely by column position.
dtrain = xgb.DMatrix(X.values, label=y.values)
dtest = xgb.DMatrix(X_test.values)

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.03,
    "max_depth": 8,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "min_child_weight": 3,
    "gamma": 1,
    "lambda": 2,
    "seed": 42
}

# verbose_eval only prints when an evaluation set is supplied; without `evals`
# it is silently ignored. The training matrix is passed here, so the logged
# RMSE is a training error, not a held-out validation score.
model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train")],
    verbose_eval=50,
)

# ---------------------------
# 4. Predict on test set
# ---------------------------
# A squared-error regressor is unbounded and can return slightly negative
# values; sales cannot be negative, so predictions are floored at 0.
test_preds = np.clip(model.predict(dtest), 0, None)

# ---------------------------
# 5. Create minimal Power BI CSV
# ---------------------------
# Only the columns needed for the dashboard are exported, plus the prediction.
powerbi_cols = [
    "Id",
    "Store",
    "Date",
    "DayOfWeek",
    "IsWeekend",
    "IsHoliday",
    "Promo",
    "CompetitionDistance",
    "Promo2Active",
]

# assign() returns a new frame, so `test` itself is left untouched.
powerbi_df = test.loc[:, powerbi_cols].assign(PredictedSales=test_preds)

powerbi_df.to_csv("sales_predictions_powerbi_minimal.csv", index=False)
print("✅ Minimal Power BI CSV saved as sales_predictions_powerbi_minimal.csv")


✅ Minimal Power BI CSV saved as sales_predictions_powerbi_minimal.csv


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
import xgboost as xgb

# Score the fitted booster on the same rows it was trained on.
# Note: this is an in-sample error, not an estimate of out-of-sample accuracy.
dtrain_full = xgb.DMatrix(X.values, label=y.values)
train_preds = model.predict(dtrain_full)

# RMSE = square root of the mean squared difference between actual and predicted
train_mse = mean_squared_error(y, train_preds)
train_rmse = np.sqrt(train_mse)
print(f"✅ Train RMSE: {train_rmse:.2f}")


✅ Train RMSE: 1006.03


In [None]:
# Offer the CSV as a browser download when running in Google Colab.
# google.colab only exists inside Colab, so the import is guarded to keep
# "Run All" working in a local Jupyter kernel, where the file is simply left
# in the working directory.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab; sales_predictions_powerbi_minimal.csv is in the working directory.")
else:
    files.download("sales_predictions_powerbi_minimal.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>