In [9]:
# Random Forest classifier for the `meal` target: load data, engineer features, train, predict

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Step 1: Load data
# Training and test sets are read straight from the course GitHub repository.
TRAIN_URL = "https://raw.githubusercontent.com/dustywhite7/econ8310-assignment2/main/assignment2train.csv"
TEST_URL = "https://raw.githubusercontent.com/dustywhite7/econ8310-assignment2/main/assignment2test.csv"
train = pd.read_csv(TRAIN_URL)
test = pd.read_csv(TEST_URL)

# Step 2: Feature engineering
def feature_engineer(df):
    """Return a cleaned copy of *df* with calendar features added.

    The input frame is never modified. On the copy:

    * surrounding whitespace is stripped from every column name;
    * if a ``DateTime`` column exists it is parsed (unparseable values
      become ``NaT``) and ``hour``, ``dow`` (day of week) and ``month``
      columns are derived from it;
    * the ``id`` and ``DateTime`` columns are removed when present.
    """
    frame = df.copy()
    frame.columns = [name.strip() for name in frame.columns]

    if "DateTime" in frame.columns:
        # errors="coerce" turns bad timestamps into NaT instead of raising,
        # so the derived columns simply hold missing values for those rows.
        stamps = pd.to_datetime(frame["DateTime"], errors="coerce")
        frame = frame.assign(
            hour=stamps.dt.hour,
            dow=stamps.dt.dayofweek,
            month=stamps.dt.month,
        )

    # Only drop the identifier / raw timestamp columns that actually exist.
    unwanted = [name for name in ("id", "DateTime") if name in frame.columns]
    return frame.drop(columns=unwanted)

# Run both datasets through the same feature engineering step.
train = feature_engineer(train)
test = feature_engineer(test)

# Step 3: Split features and target
# `meal` is the label; every other remaining column is a model feature.
X = train.drop("meal", axis=1)
y = train["meal"].astype(int)

# Step 4: Impute missing values
# Each gap is filled with its column's most frequent value; the imputer is
# fitted on the training features only.
imputer = SimpleImputer(strategy="most_frequent")
X_imputed = imputer.fit(X).transform(X)

# Step 5: Train Random Forest model
# Fixed seed for reproducible runs; n_jobs=-1 uses every available core.
model = RandomForestClassifier(
    n_estimators=500,
    min_samples_leaf=2,
    max_features="sqrt",
    n_jobs=-1,
    random_state=42,
)

modelFit = model.fit(X_imputed, y)

# Step 6: Prepare test set and predict
# Line the test frame up with the training feature columns (same names, same
# order); any feature the test set lacks is created and filled with 0.
test = test.reindex(columns=X.columns, fill_value=0)

# Reuse the imputer fitted on the training features.
X_test = imputer.transform(test)
pred = list(map(int, modelFit.predict(X_test)))

# Step 7: Preview prediction results
pd.DataFrame({"pred": pred}).head()





Unnamed: 0,pred
0,0
1,0
2,0
3,0
4,1
