# Store Sales

In [1]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import  cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error


from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
import lightgbm as lgb
import optuna
import os

# Silence the spurious "unhandled error" messages coming from Ray.
os.environ["RAY_IGNORE_UNHANDLED_ERRORS"] = "1"

# Read every input table from the working directory in one pass; the tuple of
# file names is in the same order as the variables being assigned.
train, test, holidays_events, oil, stores, transactions = map(
    pd.read_csv,
    (
        'train.csv',
        'test.csv',
        'holidays_events.csv',
        'oil.csv',
        'stores.csv',
        'transactions.csv',
    ),
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Optional step: write one HTML profiling report per loaded CSV DataFrame.
# The guard is hard-coded to False, so nothing inside it is executed.
if False:
    for frame, report_title, report_path in (
        (train, "Profiling Report - train", "train_profile.html"),
        (test, "Profiling Report - test", "test_profile.html"),
        (holidays_events, "Profiling Report - holidays_events", "holidays_events_profile.html"),
        (oil, "Profiling Report - oil", "oil_profile.html"),
        (stores, "Profiling Report - stores", "stores_profile.html"),
        (transactions, "Profiling Report - transactions", "transactions_profile.html"),
    ):
        ProfileReport(frame, title=report_title).to_file(report_path)

In [3]:
# Fill missing oil prices by linear interpolation between the neighbouring
# observations; gaps at either end take the nearest available value.
oil['dcoilwtico'] = oil['dcoilwtico'].interpolate(method='linear', limit_direction='both')
print(oil['dcoilwtico'].isnull().sum())  # sanity check: no missing value should remain

# Parse the dates once; they are reused for the oil lookup, the weekday and the
# integer day index below.
train_dates = pd.to_datetime(train['date'])
test_dates = pd.to_datetime(test['date'])

# Build one oil price per calendar day covering train, test and oil.
# Interpolating on this daily series (instead of interpolating the merged
# train/test rows) guarantees that every row of a given date receives the same
# price, whatever its position in the frame.
oil_by_day = oil.set_index(pd.to_datetime(oil['date']))['dcoilwtico']
calendar = pd.date_range(
    start=min(oil_by_day.index.min(), train_dates.min(), test_dates.min()),
    end=max(oil_by_day.index.max(), train_dates.max(), test_dates.max()),
    freq='D',
)
oil_by_day = oil_by_day.reindex(calendar).interpolate(method='linear', limit_direction='both')

# Attach the oil price to train and test by date.
train['dcoilwtico'] = train_dates.map(oil_by_day)
test['dcoilwtico'] = test_dates.map(oil_by_day)

# Day of week (0 = Monday, 6 = Sunday). The fixed category list makes
# get_dummies emit dow_0..dow_6 in both frames even if a weekday is absent
# from one of them.
weekdays = list(range(7))
train['day_of_week'] = pd.Categorical(train_dates.dt.dayofweek, categories=weekdays)
test['day_of_week'] = pd.Categorical(test_dates.dt.dayofweek, categories=weekdays)

# One-hot encode 'day_of_week' into dow_0..dow_6.
train = pd.get_dummies(train, columns=['day_of_week'], prefix='dow')
test = pd.get_dummies(test, columns=['day_of_week'], prefix='dow')

# Replace the date by the number of days elapsed since 2013-01-01.
origin = pd.Timestamp('2013-01-01')
train['date'] = (train_dates - origin).dt.days
test['date'] = (test_dates - origin).dt.days


# One-step lag: sales of the previous row of the same (store_nbr, family)
# series. NOTE(review): this is the previous day only if train rows are in
# chronological order within each series — confirm against the raw file.
train["lag1_sales"] = train.groupby(["store_nbr", "family"])["sales"].shift(1)
# The first row of each series has no predecessor, hence no lag: drop it.
train_clean = train.dropna(subset=["lag1_sales"])



0


In [4]:
# Split the cleaned training frame into target and predictors; 'id' and the
# target itself are excluded from the features.
drop_cols = ['id', 'sales']
y = train_clean['sales']
X = train_clean.drop(columns=drop_cols)

print(X.head())

# Column groups routed to each branch of the preprocessor.
cat_cols = ['store_nbr', 'family', *(f'dow_{day}' for day in range(7))]
num_cols = ['date','dcoilwtico', 'lag1_sales', 'onpromotion']
print("Colonnes catégorielles :", cat_cols)
print("Colonnes numériques :", num_cols)

# Categorical columns are one-hot encoded; categories unseen during fit are
# ignored at transform time instead of raising.
# NOTE(review): the dow_* columns are already boolean indicators, so encoding
# them again yields two columns per weekday — consider passing them through.
encode_categoricals = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
# Numerical columns are standardised.
scale_numericals = ("num", StandardScaler(), num_cols)

preprocessor = ColumnTransformer(
    transformers=[encode_categoricals, scale_numericals],
    remainder="drop",  # columns not listed in either group are discarded
)

      date  store_nbr      family  onpromotion  dcoilwtico  dow_0  dow_1  \
1782     1          1  AUTOMOTIVE            0       93.14  False  False   
1783     1          1   BABY CARE            0       93.14  False  False   
1784     1          1      BEAUTY            0       93.14  False  False   
1785     1          1   BEVERAGES            0       93.14  False  False   
1786     1          1       BOOKS            0       93.14  False  False   

      dow_2  dow_3  dow_4  dow_5  dow_6  lag1_sales  
1782   True  False  False  False  False         0.0  
1783   True  False  False  False  False         0.0  
1784   True  False  False  False  False         0.0  
1785   True  False  False  False  False         0.0  
1786   True  False  False  False  False         0.0  
Colonnes catégorielles : ['store_nbr', 'family', 'dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'dow_5', 'dow_6']
Colonnes numériques : ['date', 'dcoilwtico', 'lag1_sales', 'onpromotion']


In [None]:
# Hold-out split without shuffling: the last 20% of the rows form the
# validation set (the most recent observations, provided X is in date order).
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


# Sanity check: the predictors should contain no missing values.
print(X.isnull().sum())


# A fixed seed makes the forest, and therefore the reported score,
# reproducible from one run to the next; n_jobs=-1 uses all cores and does
# not change the fitted model.
model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Preprocessing and model are chained so that the exact same transformations
# are applied at fit and predict time.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_valid)
# The log-based metric is undefined for negative values: clip predictions at 0.
y_pred = np.maximum(0, y_pred)
# Root mean squared logarithmic error on the validation set.
score = np.sqrt(mean_squared_log_error(y_valid, y_pred))
print("RMSLE :", score)

date           0
store_nbr      0
family         0
onpromotion    0
dcoilwtico     0
dow_0          0
dow_1          0
dow_2          0
dow_3          0
dow_4          0
dow_5          0
dow_6          0
lag1_sales     0
dtype: int64


In [None]:
# Recursive one-day-ahead forecast over the test period.
#
# The lag feature of a test row is the previous sales value of the SAME
# (store_nbr, family) series: the last known sales from train for the first
# test day, then the model's own prediction for every following day. Each day
# is predicted in a single batch through the fitted pipeline, so the test rows
# receive the same preprocessing as the training rows.
series_keys = ["store_nbr", "family"]
feature_cols = list(X.columns)  # same predictors, in the same order, as at fit time

# Most recent known sales of each series (last row of each group in train).
last_sales = train.groupby(series_keys)["sales"].last()

# (Re)create the columns filled by the loop, so the cell can be re-run safely.
test["lag1_sales"] = np.nan
test["sales"] = np.nan

for day in np.sort(test["date"].unique()):
    day_mask = test["date"] == day
    day_keys = pd.MultiIndex.from_frame(test.loc[day_mask, series_keys])

    # Lag for this day = latest value known for each series.
    # NOTE(review): a series with no history at all falls back to a lag of 0 —
    # confirm this default is acceptable.
    test.loc[day_mask, "lag1_sales"] = (
        last_sales.reindex(day_keys).fillna(0.0).to_numpy()
    )

    # Predict the whole day at once; clip at 0 as done for the validation score.
    day_pred = np.maximum(0, pipeline.predict(test.loc[day_mask, feature_cols]))
    test.loc[day_mask, "sales"] = day_pred

    # Today's predictions become tomorrow's lag; series absent today keep
    # their previous value.
    last_sales = pd.Series(day_pred, index=day_keys).combine_first(last_sales)
