In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load data
folder = 'walmart-recruiting-store-sales-forecasting'
train = pd.read_csv(f"./{folder}/train.csv", parse_dates=["Date"])
test = pd.read_csv(f"./{folder}/test.csv", parse_dates=["Date"])
features = pd.read_csv(f"./{folder}/features.csv", parse_dates=["Date"])
stores = pd.read_csv(f"./{folder}/stores.csv")

# Merge train and test with features and store data
train = train.merge(features, on=["Store", "Date", "IsHoliday"], how="left")
train = train.merge(stores, on="Store", how="left")

test = test.merge(features, on=["Store", "Date", "IsHoliday"], how="left")
test = test.merge(stores, on="Store", how="left")

In [3]:
# Add date-related features
for df in [train, test]:
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week.astype(int)
    df['Day'] = df['Date'].dt.dayofweek

# Encode categorical variables
train['Type'] = train['Type'].map({'A': 0, 'B': 1, 'C': 2})
test['Type'] = test['Type'].map({'A': 0, 'B': 1, 'C': 2})

In [4]:
features_cols = ['Store', 'Dept', 'IsHoliday', 'Temperature', 'Fuel_Price',
                 'CPI', 'Unemployment', 'Size', 'Type',
                 'Year', 'Month', 'Week', 'Day']

X_train = train[features_cols]
y_train = train['Weekly_Sales']

X_test = test[features_cols]

In [5]:
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [7]:
# Predict on test
predictions = model.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [26]:
# Create Id column in the format store_dept_date
submission = test.copy()
submission['Id'] = submission['Store'].astype(str) + "_" + \
                   submission['Dept'].astype(str) + "_" + \
                   submission['Date'].dt.strftime("%Y-%m-%d")

# Add predictions
submission['Weekly_Sales'] = predictions

# Select required columns and save
submission[['Id', 'Weekly_Sales']].to_csv("Submission.csv", index=False)