In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Load data
# Read the four competition CSVs from the local data folder.
folder = 'data'
# train/test/features each carry a "Date" column that is parsed to datetime;
# stores.csv is read without date parsing.
dated_csv_kwargs = {"parse_dates": ["Date"]}
train = pd.read_csv(f"./{folder}/train.csv", **dated_csv_kwargs)
test = pd.read_csv(f"./{folder}/test.csv", **dated_csv_kwargs)
features = pd.read_csv(f"./{folder}/features.csv", **dated_csv_kwargs)
stores = pd.read_csv(f"./{folder}/stores.csv")

In [3]:
# Merge datasets
# Both splits are enriched the same way: first with `features` (matched on
# store, date and holiday flag), then with `stores` (matched on store).
# Left joins are used, so no train/test row is dropped by the merge.
feature_keys = ['Store', 'Date', 'IsHoliday']
train = (
    train
    .merge(features, on=feature_keys, how='left')
    .merge(stores, on='Store', how='left')
)
test = (
    test
    .merge(features, on=feature_keys, how='left')
    .merge(stores, on='Store', how='left')
)

In [4]:
# Fill missing markdown values
# Missing MarkDown1-5 entries are replaced with 0 in both splits.
md_cols = ['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']
# Assign the filled columns back to the frame rather than calling
# `df[col].fillna(0, inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which emits a FutureWarning in pandas 2.x and leaves
# the DataFrame unchanged once Copy-on-Write is active.
train[md_cols] = train[md_cols].fillna(0)
test[md_cols] = test[md_cols].fillna(0)

In [5]:
# Impute numeric feature missing values with train medians
# Medians are computed on the training split only and reused for the test
# split, so no test-set statistics are used for imputation.
num_cols = ['Temperature','Fuel_Price','CPI','Unemployment']
train_medians = train[num_cols].median()
# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label. Assigning the result back avoids the chained
# `df[col].fillna(..., inplace=True)` pattern, which does not update the
# DataFrame under pandas Copy-on-Write.
train[num_cols] = train[num_cols].fillna(train_medians)
test[num_cols] = test[num_cols].fillna(train_medians)

In [6]:
# Feature engineering: date parts
# Adds calendar year, calendar month and ISO week number columns, in place,
# to both splits.
for df in [train, test]:
    date_parts = df['Date'].dt
    df['Year'] = date_parts.year
    df['Month'] = date_parts.month
    # isocalendar() returns pandas' nullable UInt32 extension dtype; cast to a
    # plain integer so this column has a NumPy-backed dtype like Year/Month
    # before it is handed to the estimator.
    df['WeekOfYear'] = date_parts.isocalendar().week.astype(int)

In [7]:
# One-hot encode store type
# Pin the set of Type levels (taken from train) before encoding. Encoding each
# split independently with drop_first=True drops whichever level happens to be
# first in *that* split; if test lacked train's first level, a different
# baseline would be dropped and the zero-filled alignment below would silently
# map those rows to the wrong store type.
type_categories = sorted(train['Type'].dropna().unique())
train = pd.get_dummies(
    train.assign(Type=pd.Categorical(train['Type'], categories=type_categories)),
    columns=['Type'],
    drop_first=True,
)
test = pd.get_dummies(
    test.assign(Type=pd.Categorical(test['Type'], categories=type_categories)),
    columns=['Type'],
    drop_first=True,
)

# Align train/test columns
# join='left' keeps train's column set; columns absent from test (for example
# the target) are created there and filled with 0.
train, test = train.align(test, join='left', axis=1, fill_value=0)

# Define feature columns
feature_cols = ['Store','Dept','Temperature','Fuel_Price','CPI','Unemployment','Size','Year','Month','WeekOfYear','IsHoliday'] + md_cols
feature_cols += [col for col in train.columns if col.startswith('Type_')]

# Prepare training data
X_train = train[feature_cols]
y_train = train['Weekly_Sales']

In [8]:
# Fit the regression tree on the training matrix, then score the test split
# using the same feature columns.
X_test = test[feature_cols]
model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=20, random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Build the submission key as "<Store>_<Dept>_<YYYY-MM-DD>"
submission = test.copy()
store_part = submission['Store'].astype(str)
dept_part = submission['Dept'].astype(str)
date_part = submission['Date'].dt.strftime("%Y-%m-%d")
submission['Id'] = store_part + "_" + dept_part + "_" + date_part

# Attach the model output as the target column
submission['Weekly_Sales'] = preds

# Write only the two columns kept for the submission file
output_cols = ['Id', 'Weekly_Sales']
submission[output_cols].to_csv("Submission_DT.csv", index=False)