# In [3]:
# Import necessary libraries
import gc
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Read the three raw competition tables and tag each frame with a `name` attribute
sales = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_evaluation.csv')
calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
prices = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
sales.name = 'sales'
calendar.name = 'calendar'
prices.name = 'prices'

# Append int16 zero-filled placeholder columns for days 1942-1969 so these
# days exist as rows once the table is melted to long format
for day in range(1942, 1970):
    sales['d_{}'.format(day)] = np.zeros(len(sales), dtype=np.int16)

# Downcast function to optimize memory usage
def _smallest_fitting_dtype(column, candidates, limits_of, fallback):
    """Return the first dtype in ``candidates`` whose range strictly contains ``column``."""
    lowest, highest = column.min(), column.max()
    for candidate in candidates:
        limits = limits_of(candidate)
        # Strict comparisons: a value sitting exactly on a limit moves to the next size up.
        if lowest > limits.min and highest < limits.max:
            return candidate
    return fallback


def downcast(df):
    """Shrink the column dtypes of ``df`` to reduce memory and return ``df``.

    The frame is modified in place, column by column:

    * integer columns become the smallest of int8/int16/int32 whose range
      strictly contains the column's min and max, otherwise int64;
    * float columns become float16 or float32 by the same range test,
      otherwise float64 (only the range is checked, so values are rounded
      to the precision of the chosen type);
    * object/string columns become ``category``, except a column named
      ``'date'``, which is parsed as ``%Y-%m-%d`` datetimes;
    * every other dtype (category, datetime, bool, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compact.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)
        if 'int' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.int8, np.int16, np.int32), np.iinfo, np.int64)
            df[col] = df[col].astype(target)
        elif 'float' in dtype_name:
            target = _smallest_fitting_dtype(
                df[col], (np.float16, np.float32), np.finfo, np.float64)
            df[col] = df[col].astype(target)
        # `np.object` was removed in NumPy 1.24; compare against the builtin
        # and also accept pandas' dedicated string dtype.
        elif dtype == object or isinstance(dtype, pd.StringDtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

# Compact each raw table before the expensive reshape
sales, prices, calendar = downcast(sales), downcast(prices), downcast(calendar)

# Wide -> long: one row per (series, day) with the unit sales in `sold`,
# then attach the calendar attributes and the weekly sell prices
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
df = sales.melt(id_vars=id_columns, var_name='d', value_name='sold').dropna()
df = df.merge(calendar, on='d', how='left')
df = df.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')

# Store the categories along with their codes.
# A categorical's code `k` always refers to `categories[k]`, so the
# code -> label mapping is read straight from the category index instead of
# zipping over every row of the melted frame.
# NOTE: unlike a row-wise zip, this never produces a -1 -> NaN entry and it
# also lists categories that happen to have no rows.
d_id = dict(enumerate(df['id'].cat.categories))
d_item_id = dict(enumerate(df['item_id'].cat.categories))
d_dept_id = dict(enumerate(df['dept_id'].cat.categories))
d_cat_id = dict(enumerate(df['cat_id'].cat.categories))
d_store_id = dict(enumerate(df['store_id'].cat.categories))
d_state_id = dict(enumerate(df['state_id'].cat.categories))

# Convert day labels such as 'd_1913' to the integer 1913
df['d'] = df['d'].apply(lambda label: label.split('_')[1]).astype(np.int16)

# Replace every categorical column with its integer codes (pandas encodes
# missing values as -1). The loop variable is deliberately not called
# `type`, which would shadow the builtin for the rest of the script.
for col in df.select_dtypes(include='category').columns:
    df[col] = df[col].cat.codes
df.drop('date', axis=1, inplace=True)

# Lagged copies of `sold`, shifted within each individual series
# NOTE(review): shifting by row assumes rows are ordered by day inside each group — confirm
lags = [1, 2, 3, 6, 12, 24, 36]
series_keys = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
for lag in lags:
    shifted = df.groupby(series_keys, as_index=False)['sold'].shift(lag)
    df['sold_lag_' + str(lag)] = shifted.astype(np.float16)

# Mean of `sold` per grouping, broadcast back to every row of the group.
# Each entry is (new column name, grouping key(s)); list order fixes column order.
# NOTE(review): the means are taken over every row currently in `df`, which
# appears to include the validation days and the zero-filled placeholder days — confirm intended
average_features = [
    ('iteam_sold_avg', 'item_id'),
    ('state_sold_avg', 'state_id'),
    ('store_sold_avg', 'store_id'),
    ('cat_sold_avg', 'cat_id'),
    ('dept_sold_avg', 'dept_id'),
    ('cat_dept_sold_avg', ['cat_id', 'dept_id']),
    ('store_item_sold_avg', ['store_id', 'item_id']),
    ('cat_item_sold_avg', ['cat_id', 'item_id']),
    ('dept_item_sold_avg', ['dept_id', 'item_id']),
    ('state_store_sold_avg', ['state_id', 'store_id']),
    ('state_store_cat_sold_avg', ['state_id', 'store_id', 'cat_id']),
    ('store_cat_dept_sold_avg', ['store_id', 'cat_id', 'dept_id']),
]
for feature_name, group_keys in average_features:
    group_mean = df.groupby(group_keys)['sold'].transform('mean')
    df[feature_name] = group_mean.astype(np.float16)

# Per-series rolling (7 rows) and expanding (at least 2 rows) means of `sold`
# NOTE(review): both windows include the current row's `sold`, i.e. the
# value the model is later asked to predict — confirm this is intended
series_keys = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
df['rolling_sold_mean'] = (
    df.groupby(series_keys)['sold']
    .transform(lambda s: s.rolling(window=7).mean())
    .astype(np.float16)
)
df['expanding_sold_mean'] = (
    df.groupby(series_keys)['sold']
    .transform(lambda s: s.expanding(2).mean())
    .astype(np.float16)
)

# Trend = mean of `sold` per (series, day) minus the series' overall mean.
# The two intermediate means are kept as temporaries rather than columns.
# NOTE(review): presumably there is one row per (series, day), in which case
# the first term is just `sold` itself — verify
daily_avg_sold = df.groupby(series_keys + ['d'])['sold'].transform('mean').astype(np.float16)
avg_sold = df.groupby(series_keys)['sold'].transform('mean').astype(np.float16)
df['selling_trend'] = (daily_avg_sold - avg_sold).astype(np.float16)

# Keep only days 36 and later (36 is also the largest lag used by the lag features)
df = df[df['d'] >= 36]

# Compact the engineered frame once more, then round-trip it through a
# pickle so the in-memory copy can be released before modelling
df = downcast(df)
pd.to_pickle(df, 'data.pkl')

# Drop the working frame and reclaim its memory
del df
gc.collect()

# Reload the features under the name used by the modelling code
data = pd.read_pickle('data.pkl')

# Split the data for validation (days 1914-1941) and test (days 1942+).
# Explicit copies keep later writes from targeting a slice of `data`.
in_validation = (data['d'] >= 1914) & (data['d'] < 1942)
valid = data.loc[in_validation, ['id', 'd', 'sold']].copy()
test = data.loc[data['d'] >= 1942, ['id', 'd', 'sold']].copy()

# Buffers that receive the model predictions. They are independent float
# Series: `sold` is stored as an integer column, and writing float
# predictions into an integer-typed view is at best deprecated in recent
# pandas and would otherwise depend on silent upcasting.
eval_preds = test['sold'].astype(np.float64)
valid_preds = valid['sold'].astype(np.float64)

# Train one LightGBM model per store and collect its predictions
stores = sales.store_id.cat.codes.unique().tolist()
for store in stores:
    store_df = data[data['store_id'] == store]

    # Day-based split: train on days before 1914, validate on 1914-1941,
    # predict days 1942 and later
    is_train = store_df['d'] < 1914
    is_valid = (store_df['d'] >= 1914) & (store_df['d'] < 1942)
    is_test = store_df['d'] >= 1942
    X_train, y_train = store_df[is_train].drop('sold', axis=1), store_df.loc[is_train, 'sold']
    X_valid, y_valid = store_df[is_valid].drop('sold', axis=1), store_df.loc[is_valid, 'sold']
    X_test = store_df[is_test].drop('sold', axis=1)

    # Train and validate the model
    model = LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=50,
        min_child_weight=300
    )
    print('*****Prediction for Store: {}*****'.format(d_store_id[store]))
    # LightGBM 4.0 removed the `verbose=` and `early_stopping_rounds=`
    # arguments of `fit`; the equivalent callbacks are used instead.
    model.fit(
        X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=20), log_evaluation(period=20)],
    )

    # Write the predictions into the shared buffers by row label
    valid_preds.loc[X_valid.index] = model.predict(X_valid)
    eval_preds.loc[X_test.index] = model.predict(X_test)

    # Save the model
    filename = 'model' + str(d_store_id[store]) + '.pkl'
    joblib.dump(model, filename)

    # Cleanup: release everything that belongs to this store before the next one
    del model, X_train, y_train, X_valid, y_valid, X_test, store_df
    gc.collect()

# Feature importance: one (feature, importance, store) table per saved model
features = [f for f in data.columns if f != 'sold']
model_dir = '/kaggle/working/'
model_prefix, model_suffix = 'model', '.pkl'
store_frames = []
for filename in sorted(os.listdir(model_dir)):
    # Only pick up files following the 'model<store>.pkl' naming pattern
    if not (filename.startswith(model_prefix) and filename.endswith(model_suffix)):
        continue
    # Load from the directory that was listed, not from the current working directory.
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    model = joblib.load(os.path.join(model_dir, filename))
    store_frames.append(pd.DataFrame({
        "feature": features,
        "importance": model.feature_importances_,
        # Store label is whatever sits between the prefix and the extension
        "store": filename[len(model_prefix):-len(model_suffix)],
    }))

# Concatenate once at the end; fall back to an empty, correctly-shaped frame
if store_frames:
    feature_importance_df = pd.concat(store_frames, axis=0)
else:
    feature_importance_df = pd.DataFrame(columns=["feature", "importance", "store"])

# Display the feature importances
def display_importances(feature_importance_df_):
    """Draw a horizontal bar plot of the 20 features with the highest mean importance.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Frame with at least the columns ``feature`` and ``importance``; a
        feature may appear on several rows.

    The figure is created on the current matplotlib state; nothing is returned.
    """
    # Rank features by their mean importance and keep the top 20 names
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    top_features = ranked[:20].index

    # Keep every row that belongs to one of those features
    is_top = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top]
    plot_rows = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (averaged over store predictions)')
    plt.tight_layout()

# Plot the top features from the importances collected in feature_importance_df
display_importances(feature_importance_df)

# Prepare the submission file
# `actual` selects what fills the validation half of the submission:
#   False -> the recorded sales for days 1914-1941 taken from `sales`
#   True  -> the model predictions held in `valid_preds`
actual = False
if not actual:
    # Get the validation results; copy so the id column can be replaced
    # without writing into a slice of `sales`
    validation = sales[['id'] + ['d_' + str(i) for i in range(1914, 1942)]].copy()
    # Only the id column of the validation file is needed
    validation['id'] = pd.read_csv(
        '/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv',
        usecols=['id'],
    ).id
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
else:
    # Build a new frame instead of assigning into `valid`, which may be a slice
    valid = valid.assign(sold=valid_preds)
    validation = valid[['id', 'd', 'sold']]
    validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
    validation.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
    # Map id codes back to labels and rename them to the validation naming
    validation.id = validation.id.map(d_id).str.replace('evaluation', 'validation')

# Evaluation half: put the predictions back on the test rows and pivot to
# one row per id with one column per forecast day
test['sold'] = eval_preds
evaluation = (
    test[['id', 'd', 'sold']]
    .pivot(index='id', columns='d', values='sold')
    .reset_index()
)
forecast_columns = ['F' + str(step) for step in range(1, 29)]
evaluation.columns = ['id'] + forecast_columns
# Translate the integer id codes back to the original id labels
evaluation['id'] = evaluation['id'].map(d_id)

# Final submission: validation rows followed by evaluation rows
submit = pd.concat([validation, evaluation]).reset_index(drop=True)
submit.to_csv('submission.csv', index=False)


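# Notebook cell output: ModuleNotFoundError: No module named 'lightgbm'
# (the lightgbm package must be installed in the environment before this cell can run)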