In [75]:
import pandas as pd

file = r'C:\Users\girip\Downloads\TRAIN.csv'   # <- put your real path here
dataset = pd.read_csv(file)

dataset.head()

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


##Feature Engineering

In [76]:
# ensure Date is datetime, sort by Store_id & Date, then create features and rolling mean
dataset['Date'] = pd.to_datetime(dataset['Date'])
dataset = dataset.sort_values(['Store_id', 'Date']).reset_index(drop=True)

dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['DayOfWeek'] = dataset['Date'].dt.dayofweek

dataset['Sales_Rolling_Mean_7D'] = dataset.groupby('Store_id')['Sales'] \
    .transform(lambda x: x.rolling(window=7, min_periods=1).mean())

#Data Transformation

In [77]:
from sklearn.preprocessing import StandardScaler

# Use numeric columns that the API expects (and that will be model inputs)
numerical_features = ['Store_id', 'Year', 'Month', 'DayOfWeek']

scaler = StandardScaler()
dataset[numerical_features] = scaler.fit_transform(dataset[numerical_features])

print("Numerical features scaled successfully.")

Numerical features scaled successfully.


In [78]:
import pandas as pd

# Identify categorical features to be one-hot encoded
categorical_features = ['Store_Type', 'Location_Type', 'Region_Code', 'Discount']

# Apply one-hot encoding (do not drop first to keep consistent column set with the Flask API)
dataset = pd.get_dummies(dataset, columns=categorical_features, drop_first=False)

print("Categorical features one-hot encoded successfully.")

Categorical features one-hot encoded successfully.


In [79]:
X_train = dataset.drop(['Sales', 'ID', 'Date'], axis=1)
y_train = dataset['Sales']

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
final_model_lr = LinearRegression()
final_model_lr.fit(X_train, y_train)

print("Linear Regression model re-trained on the entire X_train and y_train datasets.")

Linear Regression model re-trained on the entire X_train and y_train datasets.


In [81]:
import joblib

# Save the trained Linear Regression model to a file
joblib.dump(final_model_lr, 'linear_regression_model.joblib')

print("Linear Regression model saved to 'linear_regression_model.joblib'.")

Linear Regression model saved to 'linear_regression_model.joblib'.


In [82]:
import joblib

# Save the fitted StandardScaler instance to a file
joblib.dump(scaler, 'scaler.joblib')
print("StandardScaler saved to 'scaler.joblib'.")

StandardScaler saved to 'scaler.joblib'.


In [83]:
joblib.dump(final_model_lr, 'linear_regression_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
# Save the final X_train columns in order
joblib.dump(X_train.columns.tolist(), 'X_train_columns.joblib')
# And save encoded column list if your Flask loads it:
encoded_columns = [c for c in X_train.columns if c not in ['Store_id','Year','Month','DayOfWeek','Holiday']]
joblib.dump(encoded_columns, 'encoded_columns.joblib')

['encoded_columns.joblib']

In [84]:
import pandas as pd

# 1. Group the original dataset by the 'Date' column and sum the 'Sales' to re-create daily_sales
daily_sales = dataset.groupby('Date')['Sales'].sum().reset_index()

# 2. Convert the 'Date' column to datetime objects and then set it as the DataFrame's index
daily_sales['Date'] = pd.to_datetime(daily_sales['Date'])
daily_sales.set_index('Date', inplace=True)

# 3. Define the split_date
split_date = '2019-02-16'

# 4. Filter daily_sales to create train_data
train_data = daily_sales[daily_sales.index <= split_date]

# 5. Filter daily_sales to create validation_data
validation_data = daily_sales[daily_sales.index > split_date]

print("daily_sales DataFrame re-created and indexed by Date successfully.")
print(f"Training data contains data up to {train_data.index.max().strftime('%Y-%m-%d')}")
print(f"Validation data contains data from {validation_data.index.min().strftime('%Y-%m-%d')} to {validation_data.index.max().strftime('%Y-%m-%d')}")
print(f"Shape of training data: {train_data.shape}")
print(f"Shape of validation data: {validation_data.shape}")
print("Head of re-indexed daily_sales:")
print(daily_sales.head())

daily_sales DataFrame re-created and indexed by Date successfully.
Training data contains data up to 2019-02-16
Validation data contains data from 2019-02-17 to 2019-05-31
Shape of training data: (412, 1)
Shape of validation data: (104, 1)
Head of re-indexed daily_sales:
                 Sales
Date                  
2018-01-01  15345484.5
2018-01-02  19592415.0
2018-01-03  18652527.0
2018-01-04  19956267.0
2018-01-05  22902651.0


In [85]:
from statsmodels.tsa.arima.model import ARIMA

arima_order = (5, 1, 0)


model_arima = ARIMA(train_data['Sales'], order=arima_order)
model_arima_fit = model_arima.fit()

print("ARIMA model fitted successfully.")



  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


ARIMA model fitted successfully.


In [86]:
import joblib

# Save the trained ARIMA model to a file
joblib.dump(model_arima_fit, 'arima_model.joblib')

print("ARIMA model saved to 'arima_model.joblib'.")

ARIMA model saved to 'arima_model.joblib'.


In [87]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Train XGBoost if available, otherwise fall back to sklearn's HistGradientBoostingRegressor
# Use only features available at inference time (exclude engineered/time-series features)
# such as '#Order' and 'Sales_Rolling_Mean_7D' which cannot be computed from a single request.
xgb_features = [c for c in X_train.columns if c not in ['#Order', 'Sales_Rolling_Mean_7D']]
X_train_xgb = X_train[xgb_features]
try:
    from xgboost import XGBRegressor  # may raise ImportError
    model_xgb_final = XGBRegressor(random_state=42)
    model_xgb_final.fit(X_train_xgb, y_train)
    print("Trained XGBRegressor (xgboost) on inference-ready features.")
except Exception as e:
    print("xgboost not available, falling back to sklearn HistGradientBoostingRegressor:", e)
    model_xgb_final = HistGradientBoostingRegressor(random_state=42)
    model_xgb_final.fit(X_train_xgb, y_train)
    print("Trained HistGradientBoostingRegressor (sklearn fallback) on inference-ready features.")

# Ensure scaler_current exists for downstream saving
if 'scaler_current' not in globals():
    scaler_current = scaler
    print("Set scaler_current = scaler")

xgboost not available, falling back to sklearn HistGradientBoostingRegressor: No module named 'xgboost'
Trained HistGradientBoostingRegressor (sklearn fallback) on inference-ready features.


In [88]:
import joblib

# Define paths for saving artifacts
model_path = 'model_xgb_final.joblib'
scaler_path = 'scaler_current.joblib'
columns_path = 'X_train_columns.joblib'

# Save the trained XGBoostRegressor model
joblib.dump(model_xgb_final, model_path)
print(f"XGBoostRegressor model saved to {model_path}")

# Save the scaler used for numerical features
joblib.dump(scaler_current, scaler_path)
print(f"Scaler saved to {scaler_path}")

# Save the column names used for XGBoost (the inference-ready feature order)
# If xgb_features was defined during training, save that order; otherwise save X_train.columns
try:
    cols_to_save = xgb_features
except NameError:
    cols_to_save = X_train.columns.tolist()
joblib.dump(cols_to_save, columns_path)
print(f"XGBoost feature column names saved to {columns_path}")

XGBoostRegressor model saved to model_xgb_final.joblib
Scaler saved to scaler_current.joblib
XGBoost feature column names saved to X_train_columns.joblib
