In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, PowerTransformer, StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor

In [None]:
# Load datasets
def load_data(data_dir='/kaggle/input/store-sales-time-series-forecasting'):
    """Read the six competition CSV files from ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory holding the CSV files. Defaults to the Kaggle input path the
        notebook was written against, so ``load_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, holidays, oil, stores, transactions)`` in that order.
        No parsing beyond ``pd.read_csv`` defaults is applied.
    """
    base = Path(data_dir)
    # File stems in the order the frames are returned.
    file_stems = ('train', 'test', 'holidays_events', 'oil', 'stores', 'transactions')
    return tuple(pd.read_csv(base / f'{stem}.csv') for stem in file_stems)

# Bind the six raw tables to the module-level names used by every later cell.
train, test, holidays, oil, stores, transactions = load_data()

In [None]:
# Convert date columns to datetime
def convert_dates(df_list):
    """Parse the ``date`` column of each frame in ``df_list`` to datetime.

    Every frame is modified in place; nothing is returned.
    """
    for frame in df_list:
        parsed_dates = pd.to_datetime(frame['date'])
        frame['date'] = parsed_dates

# Converts in place. `stores` is not passed here — presumably it has no 'date' column.
convert_dates([train, test, holidays, oil, transactions])

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary statistics of the training frame (pandas `describe` defaults)
train.describe()

In [None]:
# Data types of every training column; `date` should show as datetime after convert_dates()
train.dtypes

In [None]:
# Time Series Analysis
# `date` was already parsed for train and test by convert_dates(), so it is not
# converted again here.
# Select `sales` before aggregating: summing the whole frame would aggregate
# every other column as well, only for the result to be thrown away.
daily_sales = train.groupby('date')['sales'].sum()

plt.figure(figsize=(14, 7))
plt.plot(daily_sales)
plt.title('Total Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.show()

In [None]:
# Distribution of Sales
# Histogram (50 bins) with a KDE overlay, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(train['sales'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Sales')
ax.set_xlabel('Sales')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Sales by Store Number
# One box per store, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=train, x='store_nbr', y='sales', ax=ax)
ax.set_title('Sales by Store Number')
ax.set_xlabel('Store Number')
ax.set_ylabel('Sales')
plt.show()

In [None]:

# Sales by Family
# One box per product family; tick labels are turned vertical so they do not overlap.
fig, ax = plt.subplots(figsize=(14, 7))
sns.boxplot(data=train, x='family', y='sales', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title('Sales by Product Family')
ax.set_xlabel('Product Family')
ax.set_ylabel('Sales')
plt.show()

In [None]:
# Check for missing values
# Per-column null counts, printed in the order: train, test, oil, holidays, stores, transactions.
for frame in (train, test, oil, holidays, stores, transactions):
    print(frame.isnull().sum())

In [None]:
# Check for duplicates
# Number of fully duplicated rows in train, then in test.
for frame in (train, test):
    print(frame.duplicated().sum())

In [None]:
# Handle missing values
def handle_missing_values(oil, transactions):
    """Mean-impute ``oil['dcoilwtico']`` and ``transactions['transactions']``.

    Both frames are modified in place and also returned as ``(oil, transactions)``.
    """
    mean_imputer = SimpleImputer(strategy='mean')
    targets = ((oil, 'dcoilwtico'), (transactions, 'transactions'))
    for frame, column in targets:
        # fit_transform re-fits the shared imputer, so each column uses its own mean.
        frame[column] = mean_imputer.fit_transform(frame[[column]])
    return oil, transactions

# Rebind to the returned frames (handle_missing_values also fills the columns in place).
oil, transactions = handle_missing_values(oil, transactions)

In [None]:
# Merge datasets
def merge_datasets(train, test, stores, transactions, oil, holidays):
    """Left-join the lookup tables onto train and test and return the results.

    The previous implementation rebound its loop variable on every merge, so
    the merged frames were discarded and the inputs came back un-merged. This
    version returns the merged frames and leaves the inputs untouched.

    Steps, applied identically to both frames:

    1. Join ``stores`` on ``store_nbr``, ``transactions`` on
       ``date`` + ``store_nbr``, and ``oil`` and ``holidays`` on ``date``.
       ``holidays`` is first reduced to one row per date so the join cannot
       duplicate sales rows.
    2. Replace every non-numeric, non-datetime column (``family`` and any text
       columns brought in by the joins) with integer codes. The mapping is
       learned from the sorted distinct values in train and reused for test;
       missing or unseen values get ``-1``.
    3. Fill remaining missing numeric values with 0.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with ``date``, ``store_nbr`` and ``family`` columns.
    stores, transactions, oil, holidays : pandas.DataFrame
        Lookup tables joined as described above.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_merged, test_merged)`` with the same row count and row order
        as the inputs.

    Raises
    ------
    ValueError
        If a lookup table has duplicate join keys and a merge would change the
        number of rows.
    """
    # Keep the first holiday row per date; several rows on one date would
    # otherwise multiply the matching sales rows.
    holidays_by_date = holidays.drop_duplicates(subset='date')

    def _join_lookups(frame):
        # Chain the four left joins and verify that no row was duplicated.
        joined = (
            frame.merge(stores, on='store_nbr', how='left')
            .merge(transactions, on=['date', 'store_nbr'], how='left')
            .merge(oil, on='date', how='left')
            .merge(holidays_by_date, on='date', how='left')
        )
        if len(joined) != len(frame):
            raise ValueError('lookup tables contain duplicate join keys; merge changed the row count')
        return joined

    def _is_text(series):
        # Anything that is neither numeric/bool nor datetime needs encoding.
        dtype = series.dtype
        return not (
            pd.api.types.is_numeric_dtype(dtype)
            or pd.api.types.is_datetime64_any_dtype(dtype)
        )

    train_merged = _join_lookups(train)
    test_merged = _join_lookups(test)

    text_cols = [
        col for col in train_merged.columns
        if col != 'date' and _is_text(train_merged[col])
    ]
    for col in text_cols:
        # Fit the mapping on train only so train and test share one encoding.
        categories = sorted(train_merged[col].dropna().unique())
        for frame in (train_merged, test_merged):
            if col in frame.columns:
                frame[col] = pd.Categorical(frame[col], categories=categories).codes

    for frame in (train_merged, test_merged):
        numeric_cols = frame.select_dtypes(include='number').columns
        frame[numeric_cols] = frame[numeric_cols].fillna(0)

    return train_merged, test_merged

# Rebind train/test to whatever merge_datasets returns; later cells use these names.
train, test = merge_datasets(train, test, stores, transactions, oil, holidays)

In [None]:
# Feature engineering
def feature_engineering(df):
    """Add ``day``, ``month``, ``year`` and ``dayofweek`` columns from ``date``.

    ``df['date']`` must already be datetime. The columns are added to ``df``
    in place and the same frame is returned.
    """
    date_parts = df['date'].dt
    for part in ('day', 'month', 'year', 'dayofweek'):
        df[part] = getattr(date_parts, part)
    return df

# Apply the same calendar features to both frames so their feature columns stay aligned.
train = feature_engineering(train)
test = feature_engineering(test)

In [None]:
# Remove outliers
def remove_outliers(df):
    """Drop rows whose ``sales`` lies outside the 1.5 * IQR fences.

    Rows are removed only when ``sales`` is strictly below ``Q1 - 1.5 * IQR``
    or strictly above ``Q3 + 1.5 * IQR``; rows with missing ``sales`` compare
    false on both sides and are therefore kept. A filtered frame is returned.
    """
    sales = df['sales']
    lower_quartile = sales.quantile(0.25)
    upper_quartile = sales.quantile(0.75)
    spread = upper_quartile - lower_quartile
    low_fence = lower_quartile - 1.5 * spread
    high_fence = upper_quartile + 1.5 * spread
    is_outlier = (sales < low_fence) | (sales > high_fence)
    return df[~is_outlier]

# Only train is filtered: the rule is defined on `sales`, which this notebook never reads from test.
train = remove_outliers(train)

In [None]:
# Lag features and rolling mean
def add_lag_features(df, group_cols=('store_nbr', 'family')):
    """Add per-series lag and rolling-mean features of ``sales``.

    The previous version shifted ``sales`` over the whole frame, so a row's
    "lag" usually came from a different store/family, and its rolling means
    included the row's own ``sales`` (the prediction target). Here:

    * ``lag_1`` .. ``lag_7`` are the 1st..7th previous ``sales`` values of the
      same series, where a series is one combination of ``group_cols``;
    * ``rolling_mean_7`` / ``rolling_mean_14`` average the 7 / 14 previous
      values of the same series and exclude the current row.

    Row order is not changed; rows are assumed to already be in chronological
    order within each series (the original code relied on row order too).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sales`` column. It is not modified.
    group_cols : sequence of str, optional
        Columns identifying one series. Columns missing from ``df`` are
        ignored; if none are present the whole frame is treated as one series.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the nine feature columns added and every row
        containing a missing value dropped (this includes the first 14 rows
        of each series, which lack a full window).
    """
    keys = [col for col in group_cols if col in df.columns]
    out = df.copy()
    sales = out['sales']
    grouped = sales.groupby([out[key] for key in keys], sort=False) if keys else None

    def _per_series(func):
        # Apply func to each series separately, aligned back to the frame's index.
        if grouped is None:
            return func(sales)
        return grouped.transform(func)

    for lag in range(1, 8):
        out[f'lag_{lag}'] = _per_series(lambda s, k=lag: s.shift(k))
    for window in (7, 14):
        # shift(1) first so the window ends at the previous row, not the target.
        out[f'rolling_mean_{window}'] = _per_series(
            lambda s, w=window: s.shift(1).rolling(window=w).mean()
        )
    return out.dropna()

# Rebind train to the returned frame, from which rows with missing values have been dropped.
train = add_lag_features(train)

In [None]:
# Add lag features to test data
def add_lag_features_to_test(train, test, group_cols=('store_nbr', 'family')):
    """Give each test row the last known lag features of its own series.

    The previous version copied the final ``len(test)`` values of globally
    shifted train sales by position, so a test row received values from
    unrelated store/family rows, and it failed with a length mismatch when
    train had fewer rows than test. Here, for every series (one combination
    of ``group_cols``) the features are taken from the end of that series in
    ``train``:

    * ``lag_k`` is the k-th most recent ``sales`` value (k = 1..7);
    * ``rolling_mean_7`` / ``rolling_mean_14`` average the 7 / 14 most recent
      values.

    Test has no ``sales`` of its own, so every test row of a series receives
    the same values. Series that are absent from train, or too short for a
    feature, get 0 for that feature.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with ``sales``, in chronological row order within each series
        (assumed, as in the original). Not modified.
    test : pandas.DataFrame
        Frame to receive the features. Not modified.
    group_cols : sequence of str, optional
        Columns identifying one series. Only columns present in both frames
        are used; if none are, all of train is treated as one series.

    Returns
    -------
    pandas.DataFrame
        Copy of ``test`` in the same row order with ``lag_1`` .. ``lag_7``,
        ``rolling_mean_7`` and ``rolling_mean_14`` appended in that order.
    """
    feature_cols = [f'lag_{lag}' for lag in range(1, 8)] + ['rolling_mean_7', 'rolling_mean_14']
    keys = [col for col in group_cols if col in train.columns and col in test.columns]
    # Drop stale copies so re-running the cell does not create suffixed duplicates.
    out = test.drop(columns=[col for col in feature_cols if col in test.columns])

    if keys:
        # Only the 14 most recent rows of a series can influence its features.
        history = train[keys + ['sales']].groupby(keys, sort=False).tail(14)
        per_series = history.groupby(keys, sort=False)['sales'].transform
    else:
        history = train[['sales']].tail(14)

        def per_series(func):
            return func(history['sales'])

    feats = history[keys].copy()
    for lag in range(1, 8):
        # At a series' last row, shift(k - 1) is its k-th most recent value.
        feats[f'lag_{lag}'] = per_series(lambda s, k=lag: s.shift(k - 1))
    for window in (7, 14):
        feats[f'rolling_mean_{window}'] = per_series(
            lambda s, w=window: s.rolling(window=w).mean()
        )

    if keys:
        latest = feats.groupby(keys, sort=False).tail(1)
        out = out.merge(latest, on=keys, how='left')
    else:
        for col in feature_cols:
            out[col] = feats[col].iloc[-1] if len(feats) else np.nan

    out[feature_cols] = out[feature_cols].fillna(0)
    return out

# Runs before the power transform below, so test features use the same raw sales scale as train's lags.
test = add_lag_features_to_test(train, test)

In [None]:
# Power Transformer
# standardize=False is required here. With the default standardize=True the
# transformed target is centred on zero, so the clip below would overwrite
# every below-average value with 0 and discard that part of the target.
# Without standardisation, Yeo-Johnson maps non-negative sales to non-negative
# values, so the clip only guards against stray negative inputs.
pt = PowerTransformer(standardize=False)
train['sales'] = pt.fit_transform(train[['sales']]).ravel()
# Kept as a guard: log-error metrics used further down reject negative targets.
train['sales'] = np.maximum(train['sales'], 0)

In [None]:
# Feature Scaling
# Fit the scaler on the training features only, then apply it unchanged to test.
scaler = StandardScaler()
y = train['sales']
X = train.drop(columns=['id', 'sales', 'date'])
X_scaled = scaler.fit(X).transform(X)
X_test = scaler.transform(test.drop(columns=['id', 'date']))

In [None]:
# Cross-validation
# Shared splitter: used by both randomized searches and read as a global by
# evaluate_arima / evaluate_model. Folds follow row order, so rows are
# expected to be in chronological order.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Models with Hyperparameter Tuning
# Base estimators with a fixed seed; their hyperparameters are chosen by the
# randomized searches defined below.
xgb_model = XGBRegressor(random_state=42)
lgbm_model = LGBMRegressor(random_state=42)

In [None]:
# Candidate values sampled by the randomized searches.
xgb_params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

lgbm_params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    # LightGBM only applies `subsample` when bagging is switched on with a
    # positive subsample_freq; at its default of 0 the values above would be ignored.
    'subsample_freq': [1],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

In [None]:
def _rmsle_on_sales_scale(y_true, y_pred):
    """RMSLE between true and predicted sales after undoing the power transform.

    Mirrors evaluate_model: both arrays are inverse-transformed with the
    fitted ``pt`` and clipped at 0 before the log error is taken. The built-in
    'neg_mean_squared_log_error' scorer used previously measured error on the
    transformed target, skipped the square root, and raises as soon as a
    model predicts a negative value.
    """
    true_sales = np.maximum(pt.inverse_transform(np.asarray(y_true).reshape(-1, 1)).ravel(), 0)
    pred_sales = np.maximum(pt.inverse_transform(np.asarray(y_pred).reshape(-1, 1)).ravel(), 0)
    return np.sqrt(mean_squared_log_error(true_sales, pred_sales))


# greater_is_better=False negates the metric, so -best_score_ is the mean CV RMSLE.
rmsle_scorer = make_scorer(_rmsle_on_sales_scale, greater_is_better=False)

xgb_search = RandomizedSearchCV(xgb_model, xgb_params, n_iter=10, scoring=rmsle_scorer, cv=tscv, random_state=42)
lgbm_search = RandomizedSearchCV(lgbm_model, lgbm_params, n_iter=10, scoring=rmsle_scorer, cv=tscv, random_state=42)

In [None]:
# Evaluation function for ARIMA
def evaluate_arima(y, pt):
    """Mean RMSLE of an ARIMA(5, 1, 0) model over the folds of the global ``tscv``.

    For each fold the model is fitted on the training slice of ``y`` and
    forecasts as many steps as the validation slice has rows. Forecast and
    truth are mapped back through ``pt.inverse_transform`` and clipped at 0
    before the RMSLE is computed.

    Parameters
    ----------
    y : pandas.Series
        Transformed target in chronological order.
    pt : fitted transformer
        Object whose ``inverse_transform`` undoes the target transform.

    Returns
    -------
    float
        Average RMSLE across folds.
    """
    def _to_sales(values):
        # Undo the target transform and floor the result at zero.
        restored = pt.inverse_transform(values.reshape(-1, 1)).flatten()
        return np.maximum(restored, 0)

    fold_scores = []
    for fit_idx, holdout_idx in tscv.split(y):
        history = y.iloc[fit_idx]
        holdout = y.iloc[holdout_idx]
        fitted = ARIMA(history, order=(5, 1, 0)).fit()
        predicted_sales = _to_sales(fitted.forecast(steps=len(holdout)).values)
        actual_sales = _to_sales(holdout.values)
        fold_scores.append(np.sqrt(mean_squared_log_error(actual_sales, predicted_sales)))
    return np.mean(fold_scores)

In [None]:
# Evaluation function for other models
def evaluate_model(model, X, y, pt):
    """Mean RMSLE of ``model`` over the folds of the global ``tscv``.

    ``model`` is re-fitted in place on each fold's training slice.
    Predictions and truth are mapped back through ``pt.inverse_transform``
    and clipped at 0 before the RMSLE is computed.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``.
    X : numpy.ndarray
        Feature matrix, indexed positionally by row.
    y : pandas.Series
        Transformed target aligned with ``X``.
    pt : fitted transformer
        Object whose ``inverse_transform`` undoes the target transform.

    Returns
    -------
    float
        Average RMSLE across folds.
    """
    def _to_sales(values):
        # Undo the target transform and floor the result at zero.
        restored = pt.inverse_transform(values.reshape(-1, 1)).flatten()
        return np.maximum(restored, 0)

    fold_scores = []
    for fit_idx, holdout_idx in tscv.split(X):
        model.fit(X[fit_idx], y.iloc[fit_idx])
        predicted_sales = _to_sales(model.predict(X[holdout_idx]))
        actual_sales = _to_sales(y.iloc[holdout_idx].values)
        fold_scores.append(np.sqrt(mean_squared_log_error(actual_sales, predicted_sales)))
    return np.mean(fold_scores)

In [None]:
# Evaluate all models
# Run the XGBoost search first, then the LightGBM search, on the scaled training data.
for search in (xgb_search, lgbm_search):
    search.fit(X_scaled, y)

In [None]:
# Score every candidate with the same metric: RMSLE on the original sales scale.
# -best_score_ was used here before, but it is the negated value of whatever
# `scoring` the search ran with, which is not guaranteed to match the RMSLE
# that evaluate_arima reports. evaluate_model computes it explicitly.
# Note: evaluate_model re-fits the estimator on each fold, so the tuned models
# are left fitted on a subset; the retraining cell below fits the chosen model
# on the full training data again.
model_rmsle = {
    'ARIMA': evaluate_arima(y, pt),
    'XGBoost': evaluate_model(xgb_search.best_estimator_, X_scaled, y, pt),
    'LightGBM': evaluate_model(lgbm_search.best_estimator_, X_scaled, y, pt)
}

print(f"RMSLE for ARIMA: {model_rmsle['ARIMA']}")
print(f"RMSLE for XGBoost: {model_rmsle['XGBoost']}")
print(f"RMSLE for LightGBM: {model_rmsle['LightGBM']}")

In [None]:
# Choose the best model
# Lowest RMSLE wins; on a tie the entry listed first in model_rmsle is kept.
best_model_name = min(model_rmsle, key=lambda name: model_rmsle[name])
print(f'Best model: {best_model_name} with RMSLE: {model_rmsle[best_model_name]}')

In [None]:
# Retrain the best model on the entire training data
if best_model_name == 'ARIMA':
    best_model_fit = ARIMA(y, order=(5, 1, 0)).fit()
    # evaluate_arima reads the ARIMA forecast through `.values`, i.e. it is a
    # pandas object. Convert it to an ndarray so `forecast` has the same type
    # in both branches and supports the `.reshape` call in the submission cell.
    forecast = np.asarray(best_model_fit.forecast(steps=len(test)))
else:
    best_model = xgb_search.best_estimator_ if best_model_name == 'XGBoost' else lgbm_search.best_estimator_
    # Fit on the full training data before predicting on test.
    best_model.fit(X_scaled, y)
    forecast = best_model.predict(X_test)

In [None]:
# `forecast` may be a pandas object when ARIMA was selected (evaluate_arima
# reads its forecasts through `.values`), and pandas Series have no `.reshape`.
# np.asarray accepts that as well as the ndarray returned by predict().
forecast_column = np.asarray(forecast, dtype=float).reshape(-1, 1)

# Undo the power transform and floor at zero, since sales cannot be negative.
test['sales'] = np.maximum(pt.inverse_transform(forecast_column).flatten(), 0)
submission = test[['id', 'sales']]
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")