In [None]:
from pathlib import Path
import pandas as pd

# Project data folders (relative to the notebook location).
RAW_DIR = Path("../data/raw")
CLEAN_DIR = Path("../data/clean")

# Create both folders up front; existing folders are left untouched.
for _data_dir in (RAW_DIR, CLEAN_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Local copy of the dataset, used when the KaggleHub download is unavailable.
LOCAL_FALLBACK = RAW_DIR / "onlinefoods.csv"

In [None]:
# Try to fetch the dataset through KaggleHub. `dataset_dir` stays None whenever
# that is not possible, which tells the loading cell to use the local fallback.
dataset_dir = None
try:
    import kagglehub
    dataset_dir = Path(kagglehub.dataset_download("sudarshan24byte/online-food-dataset"))
    print("KaggleHub ok:", dataset_dir)
except ImportError:
    print("KaggleHub not available, using local fallback.")
except Exception as exc:
    # The package is installed but the download itself failed (e.g. offline or
    # missing credentials). Report the cause and fall back instead of aborting.
    dataset_dir = None
    print(f"KaggleHub download failed ({exc!r}), using local fallback.")

# Read CSV

In [None]:
def read_first_csv(folder: Path) -> pd.DataFrame:
    """Read the first CSV file in the given folder.

    The ``*.csv`` files directly inside ``folder`` are sorted by path and the
    first one is loaded. The file is read with pandas' default encoding first;
    if that raises ``UnicodeDecodeError`` it is re-read as latin-1.

    Parameters
    ----------
    folder : Path
        Directory to search (a plain string path is accepted as well).

    Returns
    -------
    pd.DataFrame
        Contents of the first CSV file found.

    Raises
    ------
    FileNotFoundError
        If ``folder`` contains no ``*.csv`` file.
    """
    folder = Path(folder)
    files = sorted(folder.glob("*.csv"))
    if not files:
        raise FileNotFoundError(f"No CSV files found in {folder}")

    first_csv = files[0]
    print("Reading file:", first_csv)
    try:
        return pd.read_csv(first_csv)
    except UnicodeDecodeError:
        # Retry with latin-1 for files that are not valid UTF-8.
        return pd.read_csv(first_csv, encoding="latin-1")

# Prefer the KaggleHub download; otherwise read the local copy, and fail
# loudly when neither source is available.
if not dataset_dir:
    if not LOCAL_FALLBACK.exists():
        raise FileNotFoundError("No dataset found. Please check the dataset directory or local fallback.")
    df = pd.read_csv(LOCAL_FALLBACK)
else:
    df = read_first_csv(dataset_dir)

print("Dataset shape:", df.shape)
df.head()

# Quick Health Check
AIM: To gain an initial understanding of the structure and quality of the raw data and to locate "obvious problems".

In [None]:
# Only the last expression of a cell is displayed automatically, so the shape
# has to be printed explicitly; the dtypes are shown as the cell output.
print("shape:", df.shape)
df.dtypes


**Change to binary type:**

- Gender: Male = 0, Female = 1
- Output: No = 0, Yes = 1
- Feedback: Negative = 0, Positive = 1

**Needs to be dropped:**

- `Unnamed: 12`

**To be transformed:**

- Marital Status
- Occupation
- Monthly Income
- Educational Qualifications

In [None]:
# Missing-value count per column, largest first (top ten columns only).
(
    df.isnull()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

There is no NA data in the dataset.

# Data Cleaning and Standardization
Remove the Unnamed column.

In [None]:
import numpy as np
import re

# Remove the stray "Unnamed: 12" column; errors="ignore" keeps this a no-op
# when the column is not present.
df = df.drop(labels=["Unnamed: 12"], axis="columns", errors="ignore")

Normalize the column names: lowercase, with every run of non-alphanumeric characters replaced by a single underscore.

In [None]:
# Step 1: trim surrounding whitespace and lowercase every column name.
normalized_cols = df.columns.str.strip().str.lower()
# Step 2: collapse each run of characters outside a-z/0-9 into one underscore,
# then remove any underscore left at either end.
normalized_cols = normalized_cols.str.replace(r'[^a-z0-9]+', '_', regex=True).str.strip('_')

df.columns = normalized_cols
print(list(df.columns))

In [None]:
# Give the long education column a shorter name for the rest of the notebook.
df = df.rename(
    columns={"educational_qualifications": "education"},
)

Convert `pin_code` to a string (object dtype).

In [None]:
# Store pin codes as text rather than as numbers.
df = df.astype({"pin_code": str})

# Choose the response variable
Response variable: output


Research question: how do demographic and location factors relate to online food ordering behavior?

Keep only the first three digits of the pin code.
AIM: to reduce noise by grouping the many distinct pin codes into broader areas.

In [None]:
# Keep only the first three characters of the pin code as a coarser label.
# The feature selection further down lists "pin3" among the categorical
# columns, so the column has to exist before `df_model` is derived from `df`.
df["pin3"] = df["pin_code"].astype(str).str[:3]

# Inspect the income brackets before mapping them to numeric midpoints.
df["monthly_income"].value_counts()

In [None]:
# Numeric stand-in for each income bracket label. Labels that are not listed
# here become NaN when mapped.
income_map = {
    "No Income": 0.0,
    "Below Rs.10000": 5_000.0,
    "10001 to 25000": 17_500.0,
    "25001 to 50000": 37_500.0,
    "More than 50000": 60_000.0,
}

income_mid = df["monthly_income"].map(income_map)
df["income_mid"] = income_mid
print("Income midpoints:", df["income_mid"].value_counts())

In [None]:
# Encode the response as 0/1; any other value in "output" maps to NaN.
response_codes = {"No": 0, "Yes": 1}
df["y_ordered"] = df["output"].map(response_codes)

# Keep only the rows with a valid 0/1 label, as an independent copy.
has_valid_label = df["y_ordered"].isin([0, 1])
df_model = df.loc[has_valid_label].copy()
print("counts:", df_model["y_ordered"].value_counts())

In [None]:
# `data` is another name for `df_model` (same object, not a copy).
data = df_model
print("shape:", data.shape)

# Print each labelled view: the first three rows, then the column dtypes.
for heading, view in (("\nhead():", data.head(3)), ("\ndtypes:", data.dtypes)):
    print(heading)
    print(view)

In [None]:
# Missing values per column of the modelling frame, largest count first.
print(
    data.isnull()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
# Numerical features. `pin_code` is deliberately not listed: it was cast to a
# string earlier in the notebook, so it is not a numeric column.
num_cols = ["age", "family_size", "income_mid"]



In [None]:
from sklearn.metrics import log_loss, roc_auc_score, f1_score

# Null (no-information) baseline: always predict the majority class, and use
# the positive-class share as a constant predicted probability.
y_true = df_model["y_ordered"]
props = y_true.value_counts(normalize=True)
p_pos = props.get(1, 0.0)

# Majority class is 1 only when positives make up more than half of the rows.
maj = int(p_pos > 0.5)
pred_majority = np.full_like(y_true, maj)

null_f1 = f1_score(y_true, pred_majority)
null_acc = (pred_majority == y_true).mean()

print(f"\nNull accuracy (always predict {maj}): {null_acc:.3f}")
print(f"Null F1 (positive class): {null_f1:.3f}")

# Constant probability vector, one entry per row.
proba_null = np.full(y_true.shape[0], float(p_pos))
null_logloss = round(log_loss(y_true, proba_null), 3)
print("Null LogLoss:", null_logloss)
print("Null AUC    :", roc_auc_score(y_true, proba_null))


The null model's accuracy is 0.776, which means the dataset is imbalanced: always predicting the majority class is already correct for about 78% of the rows.
Any future model should therefore achieve a LogLoss lower than the null LogLoss (0.532) and an AUC higher than 0.5.

In [None]:
# Feature groups used for modelling.
# NOTE(review): "pin3" must already be a column of df_model — confirm it is
# created upstream, otherwise the selection below raises KeyError.
num_cols = ["age", "family_size", "income_mid"]
cat_cols = ["gender", "marital_status", "occupation", "education", "pin3"]

# Independent copies of the design matrix and the response vector.
x = df_model.loc[:, num_cols + cat_cols].copy()
y = df_model.loc[:, "y_ordered"].copy()

print("numerical features:", num_cols)
print("categorical features:", cat_cols)
print("x shape:", x.shape, "| y shape:", y.shape)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Univariate screening: for each feature on its own, fit a class-weighted
# logistic regression and record the 5-fold cross-validated ROC AUC.
features = num_cols + cat_cols
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rows = []

for f in features:
    xf = x[[f]].copy()

    # Numerical features: median imputation + standardisation.
    # Categorical features: most-frequent imputation + one-hot encoding.
    if f in num_cols:
        branch_name = "num"
        branch_steps = [
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler()),
        ]
    else:
        branch_name = "cat"
        branch_steps = [
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore")),
        ]
    prep = ColumnTransformer([(branch_name, Pipeline(branch_steps), [f])])

    clf = Pipeline([
        ("prep", prep),
        ("lr", LogisticRegression(max_iter=300, class_weight="balanced")),
    ])

    # Fit on each training fold and score the held-out fold.
    scores = []
    for tr, va in skf.split(xf, y):
        x_tr, y_tr = xf.iloc[tr], y.iloc[tr]
        x_va, y_va = xf.iloc[va], y.iloc[va]
        clf.fit(x_tr, y_tr)
        proba = clf.predict_proba(x_va)[:, 1]
        scores.append(roc_auc_score(y_va, proba))

    rows.append(dict(
        feature=f,
        cv_auc_mean=float(np.mean(scores)),
        cv_auc_std=float(np.std(scores)),
    ))

# One row per feature, best mean AUC first.
uni_auc = pd.DataFrame(rows).sort_values("cv_auc_mean", ascending=False)
print(uni_auc)