In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# --- Load -------------------------------------------------------------
# Read the raw export and remove the columns that are excluded from the
# feature set built below.
df = pd.read_csv('supermarket-sales.csv')
df = df.drop(columns=["State", "Customer Name", "Discount", "Order ID", "Profit"])

# --- Calendar features --------------------------------------------------
# Dates are parsed day-first with per-value format inference; values that
# cannot be parsed become NaT (errors='coerce') and are removed by the final
# dropna() because their derived calendar fields are NaN.
df['Orderdate'] = pd.to_datetime(df['Orderdate'], format='mixed', dayfirst=True, errors='coerce')
df['year'] = df['Orderdate'].dt.year
df['month'] = df['Orderdate'].dt.month
df['day'] = df['Orderdate'].dt.day
df["weekday"] = df["Orderdate"].dt.weekday  # Monday=0 ... Sunday=6
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)

# --- Chronological order ------------------------------------------------
# A stable sort keeps rows that share the same Orderdate in their original
# file order. The lag/rolling features below are positional, so the order of
# tied rows directly changes their values; the default (non-stable) sort
# gives no guarantee about it.
df = df.sort_values('Orderdate', kind='stable')

# --- Categorical encoding -----------------------------------------------
# One-hot encode, dropping the first level of each column to avoid a
# redundant (perfectly collinear) indicator.
df = pd.get_dummies(df, columns=["Category", "Subcategory", "City", "Region"], drop_first=True)

# --- Lag / rolling features ---------------------------------------------
df = df.set_index('Orderdate')

# NOTE(review): shift() moves by row position, so "lag 7" means seven rows
# earlier in the sorted frame, which is seven days only if there is exactly
# one row per day - confirm that matches the intent.
for lag in (1, 7, 30):
    df[f'sales_lag_{lag}'] = df['Sales'].shift(lag)

# shift(1) before rolling so each window contains only earlier rows and never
# the current row's own Sales value (which is the prediction target).
previous_sales = df['Sales'].shift(1)
df['sales_roll_7'] = previous_sales.rolling(window=7).mean()
df['sales_roll_30'] = previous_sales.rolling(window=30).mean()

# Removes every row with a missing value in ANY column: at least the first 30
# rows (incomplete lag/rolling history), rows whose date failed to parse, and
# rows with gaps in any other remaining column.
df = df.dropna()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

# --- Features / target --------------------------------------------------
# The target is log1p(Sales), so everything the model predicts is on the
# log1p scale as well.
X = df.drop(["Sales"], axis=1)
y = np.log1p(df["Sales"])

# Force every feature to numeric; values that cannot be converted become NaN
# and the affected rows are filtered out together with invalid targets.
X = X.apply(pd.to_numeric, errors='coerce')
mask = ~X.isna().any(axis=1) & ~y.isna() & ~np.isinf(y)
X, y = X[mask], y[mask]

# A leftover non-numeric column turns into all-NaN above and silently wipes
# out every row; fail here with an actionable message instead of deep inside
# the estimator.
if X.empty:
    raise ValueError(
        "No rows left after numeric coercion and NaN/inf filtering; "
        "check for non-numeric feature columns: "
        f"{list(df.drop(['Sales'], axis=1).select_dtypes(exclude=['number', 'bool']).columns)}"
    )

# --- Chronological 80/20 split ------------------------------------------
# The split point must be derived from the filtered X (not from df): rows
# removed by the mask would otherwise shift the boundary and shrink the
# test set below 20 %.
split_point = int(len(X) * 0.8)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

# --- Hyper-parameter search ---------------------------------------------
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# The rows are in chronological order and the features contain lagged Sales,
# so validation folds must always lie after their training data. Plain k-fold
# (cv=3) would train on later rows and validate on earlier ones, producing
# optimistic scores; TimeSeriesSplit uses expanding, forward-only windows.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)

# Predictions are on the log1p scale (same as y_test); apply np.expm1 to both
# before reporting errors in original Sales units.
y_pred = best_model.predict(X_test)