In [5]:
import pandas as pd
import numpy as np
import mlflow
import dagshub
from sklearn.preprocessing import LabelEncoder
from mlflow.tracking import MlflowClient

# Registry coordinates of the model to load; the URI uses the "latest" selector.
model_name = "Best_RandomForest_Model"
model_uri = f"models:/{model_name}/latest"

# dagshub.init is called with mlflow=True before anything is fetched through MLflow.
dagshub.init(
    mlflow=True,
    repo_owner='TomC333',
    repo_name='ml-walmart-recruiting',
)

# Fetch the registered scikit-learn model that will produce the predictions.
model = mlflow.sklearn.load_model(model_uri=model_uri)

# Processed training data (loaded with its Date column parsed).
train_df = pd.read_csv('train_merged_full_rf.csv', parse_dates=['Date'])

# Raw input tables for inference.
test_df = pd.read_csv('data/test.csv', parse_dates=['Date'])
features_df = pd.read_csv('data/features.csv', parse_dates=['Date'])
stores_df = pd.read_csv('data/stores.csv')

# how='left' keeps every row of test_df, whether or not a matching
# features/stores record exists.
test_merged = (
    test_df
    .merge(features_df, on=['Store', 'Date'], how='left')
    .merge(stores_df, on='Store', how='left')
)

# If the merge produced suffixed IsHoliday columns, keep the left-hand (_x)
# copy under the plain name and discard the right-hand (_y) one.
if 'IsHoliday_y' in test_merged.columns:
    test_merged = (
        test_merged
        .drop(columns=['IsHoliday_y'])
        .rename(columns={'IsHoliday_x': 'IsHoliday'})
    )

# Calendar components derived from the Date column.
test_merged = test_merged.assign(
    Year=test_merged["Date"].dt.year,
    Month=test_merged["Date"].dt.month,
    Week=test_merged["Date"].dt.isocalendar().week.astype(int),
    Quarter=test_merged["Date"].dt.quarter,
)

# Dates that mark each named holiday, 2010 through 2013.
superbowl_dates = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'])
laborday_dates = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'])
thanksgiving_dates = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'])
christmas_dates = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'])

# One 0/1 indicator column per holiday: 1 where the row's Date is one of the
# listed dates, 0 everywhere else. Columns are added in this order.
holiday_calendar = {
    'Is_SuperBowl': superbowl_dates,
    'Is_LaborDay': laborday_dates,
    'Is_Thanksgiving': thanksgiving_dates,
    'Is_Christmas': christmas_dates,
}
for flag_column, holiday_dates in holiday_calendar.items():
    test_merged[flag_column] = test_merged['Date'].isin(holiday_dates).astype('int64')

# Columns handed to the model, in this exact order.
features_step2 = [
    'Store', 'Dept', 'IsHoliday',
    'Month', 'Year', 'Week', 'Quarter',
    'Is_SuperBowl', 'Is_LaborDay', 'Is_Thanksgiving', 'Is_Christmas',
]

# Work on an independent copy so the encoding below leaves test_merged untouched.
X_pred = test_merged.loc[:, features_step2].copy()

# Label-encode the identifier columns. Each encoder is fitted on the union of
# the values seen in train_df and test_merged, so every test value has a code.
# NOTE(review): presumably this mirrors the encoding used at training time — confirm.
for id_col in ('Store', 'Dept'):
    seen_values = pd.concat([train_df[id_col], test_merged[id_col]]).unique()
    le = LabelEncoder().fit(seen_values)
    X_pred[id_col] = le.transform(X_pred[id_col])

# Predict, then floor the predictions at zero.
raw_preds = model.predict(X_pred)
preds = np.clip(raw_preds, 0, None)

# Build the row identifier as "<Store>_<Dept>_<YYYY-MM-DD>".
store_part = test_merged['Store'].astype(str)
dept_part = test_merged['Dept'].astype(str)
date_part = test_merged['Date'].dt.strftime('%Y-%m-%d')
row_ids = store_part + '_' + dept_part + '_' + date_part

submission = pd.DataFrame({'Id': row_ids, 'Weekly_Sales': preds})

# Align to the sample submission's Id list (left join keeps its rows and order);
# any Id without a prediction ends up with Weekly_Sales = 0.
sample_submission = pd.read_csv('data/sampleSubmission.csv')
submission = (
    sample_submission[['Id']]
    .merge(submission, on='Id', how='left')
    .assign(Weekly_Sales=lambda frame: frame['Weekly_Sales'].fillna(0))
)

submission.to_csv('submission_rf_step2.csv', index=False)

print("Random Forest Step 2 inference complete. submission_rf_step2.csv saved.")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Random Forest Step 2 inference complete. submission_rf_step2.csv saved.
