# Notebook cell In [2]
# Model choice: a random forest classifier.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Download the training CSV first, then the test CSV, from the course repository
train, test = (
    pd.read_csv(url)
    for url in (
        "https://github.com/dustywhite7/Econ8310/raw/master/AssignmentData/assignment3.csv",
        "https://github.com/dustywhite7/Econ8310/raw/master/AssignmentData/assignment3test.csv",
    )
)

# Feature engineering
def feature_engineer(df):
    """Return a cleaned copy of *df* with calendar features in place of ``DateTime``.

    The input frame is never modified. On the copy:

    * surrounding whitespace is stripped from every column name;
    * if a ``DateTime`` column exists it is parsed (unparseable values
      become missing) and ``hour``, ``dow`` (day of week) and ``month``
      columns are derived from it;
    * the ``id`` and ``DateTime`` columns are removed when present.

    Args:
        df: Raw data frame as loaded from CSV.

    Returns:
        A new data frame ready to be split into features and target.
    """
    out = df.copy()
    out.columns = [name.strip() for name in out.columns]

    if "DateTime" in out.columns:
        # errors="coerce" turns bad timestamps into NaT instead of raising
        stamps = pd.to_datetime(out["DateTime"], errors="coerce")
        calendar_parts = (
            ("hour", stamps.dt.hour),
            ("dow", stamps.dt.dayofweek),
            ("month", stamps.dt.month),
        )
        for label, values in calendar_parts:
            out[label] = values

    # errors="ignore" skips whichever of these columns is absent
    return out.drop(columns=["id", "DateTime"], errors="ignore")

# Run the same feature engineering over the training frame, then the test frame
train, test = (feature_engineer(frame) for frame in (train, test))

# Separate the integer-coded target from the predictor columns
target_column = "meal"
y = train[target_column].astype(int)
X = train.drop(columns=[target_column])

# Fill gaps in each column with that column's most frequent value
imputer = SimpleImputer(strategy="most_frequent")
X_imputed = imputer.fit_transform(X)

# Define model
def model(random_state=42):
    """Build an unfitted random forest classifier.

    Args:
        random_state: Seed passed to the classifier as ``random_state``.

    Returns:
        A new ``RandomForestClassifier`` with 300 estimators, ``n_jobs=-1``
        and ``"balanced_subsample"`` class weighting.
    """
    settings = {
        "n_estimators": 300,
        "random_state": random_state,
        "n_jobs": -1,
        "class_weight": "balanced_subsample",
    }
    return RandomForestClassifier(**settings)

# Fit the classifier on the imputed training matrix (fit() returns the estimator)
modelFit = model().fit(X_imputed, y)

# Build the test feature matrix and predict

# Give the test frame exactly the training columns, in training order;
# any column the test set lacks is created and filled with 0
test = test.reindex(columns=X.columns, fill_value=0)

# Reuse the imputer fitted on the training data, then predict labels
X_test = imputer.transform(test)
predicted_labels = modelFit.predict(X_test)
pred = pd.Series(predicted_labels, name="pred")
