In [37]:
import pandas as pd
import numpy as np
import mlflow.lightgbm
import dagshub

# Step 8: batch inference on the test split.
# Loads the test / features / stores tables, rebuilds the model's input
# columns, scores them with the registered LightGBM model and writes
# submission.csv.
dagshub.init(repo_owner='TomC333', repo_name='ml-walmart-recruiting', mlflow=True)

test_df = pd.read_csv('data/test.csv', parse_dates=['Date'])
features_df = pd.read_csv('data/features.csv', parse_dates=['Date'])
stores_df = pd.read_csv('data/stores.csv')

# validate='many_to_one' makes pandas raise MergeError when the right-hand
# keys are not unique. Without it, duplicate keys would silently multiply test
# rows and break the positional alignment with sampleSubmission.csv below.
test_merged = test_df.merge(
    features_df, on=['Store', 'Date'], how='left', validate='many_to_one'
)
test_merged = test_merged.merge(
    stores_df, on='Store', how='left', validate='many_to_one'
)

# When both sides of the first merge carry IsHoliday, pandas suffixes them
# with _x (left) / _y (right); keep the left-hand copy under the plain name.
if 'IsHoliday_y' in test_merged.columns:
    test_merged = (
        test_merged
        .drop(columns=['IsHoliday_y'])
        .rename(columns={'IsHoliday_x': 'IsHoliday'})
    )

# Calendar features derived from the Date column.
test_merged['Month'] = test_merged['Date'].dt.month
test_merged['Year'] = test_merged['Date'].dt.year
test_merged['Week'] = test_merged['Date'].dt.isocalendar().week.astype(int)
test_merged['Quarter'] = test_merged['Date'].dt.quarter

# Missing markdown values are replaced with 0.
for col in ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']:
    if col in test_merged.columns:
        test_merged[col] = test_merged[col].fillna(0)

if test_merged['IsHoliday'].isnull().any():
    test_merged['IsHoliday'] = test_merged['IsHoliday'].fillna(False)

test_merged['Type'] = test_merged['Type'].astype('category')

# Columns passed to the model, in this order.
features = [
    'Store', 'Dept', 'IsHoliday', 'Month', 'Year', 'Week', 'Quarter',
    'Temperature', 'Fuel_Price', 'Size',
    'Type',
    'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas',
    'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5',
]

# Any feature that is absent is filled with a constant (False for flag-like
# columns, 0 otherwise) so prediction can still run. The defaulted columns are
# reported instead of being filled silently.
# NOTE(review): nothing in this cell derives the Is_* holiday flags from Date,
# so unless the input files provide them they end up constant False here.
# Confirm this matches how the model was trained; otherwise port the training
# notebook's holiday logic into this cell.
defaulted_cols = []
for col in features:
    if col not in test_merged.columns:
        if 'Is_' in col or col == 'IsHoliday':
            test_merged[col] = False
        else:
            test_merged[col] = 0
        defaulted_cols.append(col)
if defaulted_cols:
    print(f"WARNING: features missing from the test data were filled with defaults: {defaulted_cols}")

X_test = test_merged[features].copy()

# 'Type' is already categorical (cast above); only the boolean flags need to
# be converted to integers.
flag_cols = ['IsHoliday', 'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas']
X_test[flag_cols] = X_test[flag_cols].astype(int)

# Load the latest registered version of the model from the MLflow registry.
model_name = "Best_LightGBM_Model"
model_uri = f"models:/{model_name}/latest"
model = mlflow.lightgbm.load_model(model_uri)

# Negative predictions are clipped to 0.
test_merged['Weekly_Sales'] = model.predict(X_test).clip(min=0)

submission = pd.read_csv('data/sampleSubmission.csv')
# Predictions are written positionally, so the row counts must match.
# NOTE(review): this also assumes sampleSubmission.csv lists its rows in the
# same order as test.csv - confirm against the data files.
if len(submission) != len(test_merged):
    raise ValueError(
        f"Row count mismatch: sampleSubmission.csv has {len(submission)} rows "
        f"but {len(test_merged)} predictions were produced."
    )
submission['Weekly_Sales'] = test_merged['Weekly_Sales'].values
submission.to_csv('submission.csv', index=False)

print("Step 8 Inference complete. submission.csv saved.")


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Step 8 Inference complete. submission.csv saved.


In [31]:
def explore_test_merged(df):
    """Print a quick data-quality report for a merged DataFrame.

    The report is written to standard output and covers: column info,
    missing-value counts, duplicate rows, Store/Dept combinations, the Date
    range, the distinct values of a few key columns, summary statistics,
    lag-column checks and min/max of selected numeric columns.

    Sections whose columns are not present in ``df`` print only their header.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        None. All results are printed.
    """
    print("=== Basic Info ===")
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()
    print("\n=== Missing Values ===")
    print(df.isnull().sum().sort_values(ascending=False).head(20))

    print("\n=== Duplicates ===")
    print(f"Duplicate rows count: {df.duplicated().sum()}")

    print("\n=== Unique Store and Dept combinations ===")
    # Guarded like the Store/Dept checks further down, so a frame without
    # these columns does not abort the whole report with a KeyError.
    if 'Store' in df.columns and 'Dept' in df.columns:
        print(df.groupby(['Store', 'Dept']).size().reset_index(name='counts').head())

    print("\n=== Date range ===")
    if 'Date' in df.columns:
        print(f"Min date: {df['Date'].min()}, Max date: {df['Date'].max()}")

    print("\n=== Check for unexpected categories in categorical cols ===")
    if 'IsHoliday' in df.columns:
        print(f"IsHoliday unique values: {df['IsHoliday'].unique()}")
    if 'Store' in df.columns:
        print(f"Stores: {sorted(df['Store'].unique())}")
    if 'Dept' in df.columns:
        print(f"Departments: {sorted(df['Dept'].unique())}")

    print("\n=== Summary statistics for numeric columns ===")
    print(df.describe().transpose())

    print("\n=== Check lagged columns for NaNs or zeros ===")
    # str() keeps the substring test safe for non-string column labels.
    lag_cols = [col for col in df.columns if 'lag' in str(col)]
    if lag_cols:
        print(df[lag_cols].isnull().sum())
        print(df[lag_cols].describe())

    print("\n=== Check for negative or zero values in important features ===")
    for col in ['Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']:
        if col in df.columns:
            print(f"{col}: min={df[col].min()}, max={df[col].max()}")

# Print the data-quality report for the merged test frame built in the
# inference cell (`test_merged` must already exist in the kernel).
# NOTE(review): this cell's recorded execution count (31) is lower than the
# inference cell's (37), so the saved output may describe an older
# `test_merged` - re-run with Restart Kernel -> Run All to refresh it.
explore_test_merged(test_merged)


=== Basic Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115064 entries, 0 to 115063
Data columns (total 21 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Store                115064 non-null  int64         
 1   Dept                 115064 non-null  int64         
 2   Date                 115064 non-null  datetime64[ns]
 3   IsHoliday            115064 non-null  bool          
 4   Temperature          115064 non-null  float64       
 5   Fuel_Price           115064 non-null  float64       
 6   MarkDown1            114915 non-null  float64       
 7   MarkDown2            86437 non-null   float64       
 8   MarkDown3            105235 non-null  float64       
 9   MarkDown4            102176 non-null  float64       
 10  MarkDown5            115064 non-null  float64       
 11  CPI                  76902 non-null   float64       
 12  Unemployment         76902 non-null   float64       
