## Imports and Helper Functions

In [2]:
import os
import pandas as pd
import numpy as np

def load_combined_df(base_dir=None):
    """Load the red and white wine-quality CSVs into one DataFrame.

    Reads ``winequality-red.csv`` and ``winequality-white.csv`` (both
    semicolon-separated) from ``base_dir``, tags every row with a ``color``
    column ("red" or "white"), and stacks the red rows on top of the white
    rows with a fresh 0..n-1 index.

    Args:
        base_dir: Directory holding the two CSV files. Defaults to
            ``../data`` relative to the current working directory.

    Returns:
        The combined DataFrame with the added ``color`` column.
    """
    if base_dir is None:
        # Historical default: notebooks sit one level below the data folder.
        base_dir = os.path.join("..", "data")

    frames = [
        pd.read_csv(
            os.path.join(base_dir, f"winequality-{color}.csv"), sep=";"
        ).assign(color=color)
        for color in ("red", "white")
    ]
    return pd.concat(frames, ignore_index=True)

def create_label(df):
    """Return a copy of ``df`` with a binary ``good`` target column.

    ``good`` is 1 where ``quality`` is at least 6 and 0 otherwise. The input
    frame itself is not modified.
    """
    is_good = df["quality"].ge(6)
    return df.assign(good=is_good.astype(int))

def stratified_split(df, label_col="good", test_frac=0.2, seed=42):
    """Split ``df`` into train/test DataFrames, stratified on ``label_col``.

    The rows are shuffled once using ``seed``. Within each distinct label the
    first ``floor(test_frac * class_size)`` shuffled rows go to the test set
    and the remaining rows go to the training set. Both outputs keep the
    shuffled row order (classes interleaved, not grouped into per-label
    blocks) and get a fresh 0..n-1 index.

    Args:
        df: Input DataFrame containing ``label_col``.
        label_col: Name of the column to stratify on.
        test_frac: Fraction of each class assigned to the test set; must lie
            in [0, 1].
        seed: Seed for the shuffle. Only a local random state is used, so
            NumPy's global RNG is left untouched.

    Returns:
        ``(train_df, test_df)``.

    Raises:
        ValueError: If ``test_frac`` is outside [0, 1].
    """
    if not 0 <= test_frac <= 1:
        raise ValueError(
            f"test_frac must be between 0 and 1, got {test_frac!r}"
        )

    # random_state alone makes the shuffle reproducible; reseeding the global
    # NumPy RNG here would be a hidden side effect on the caller.
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    train_idxs, test_idxs = [], []
    for lbl in df_shuffled[label_col].unique():
        class_idx = df_shuffled.index[df_shuffled[label_col] == lbl]
        n_test = int(np.floor(test_frac * len(class_idx)))
        test_idxs.extend(class_idx[:n_test])
        train_idxs.extend(class_idx[n_test:])

    # Positions were collected class by class; sorting them restores the
    # shuffled order so the outputs are not sorted by label.
    train_df = df_shuffled.loc[sorted(train_idxs)].reset_index(drop=True)
    test_df = df_shuffled.loc[sorted(test_idxs)].reset_index(drop=True)
    return train_df, test_df

def standardize_features(df_train, df_test, feature_cols):
    """
    Z-score the ``feature_cols`` of both frames using training statistics.

    Per-column means and standard deviations are taken from ``df_train``
    only; a standard deviation of exactly 0 is swapped for 1 so constant
    columns do not cause a division by zero. Returns
    ``(X_train, X_test, means, stds)`` where the last two are the Series
    that were applied.
    """
    train_features = df_train[feature_cols]
    test_features = df_test[feature_cols]

    means = train_features.mean()
    raw_stds = train_features.std()
    stds = raw_stds.replace(0, 1)

    X_train = train_features.sub(means).div(stds)
    X_test = test_features.sub(means).div(stds)
    return X_train, X_test, means, stds


## Load & Label

In [5]:
# Build the labelled dataset: read both wine CSVs, then attach the binary target
df = create_label(load_combined_df())

# Report the dataset size and the number of rows in each class
class_counts = df["good"].value_counts(normalize=False)
print("Total samples:", len(df))
print("Good vs. Bad counts:\n", class_counts)


Total samples: 6497
Good vs. Bad counts:
 good
1    4113
0    2384
Name: count, dtype: int64


## Train/Test Split

In [8]:
# Every column except the raw score, the wine colour and the target is a feature
exclude = ["quality", "color", "good"]
features = [col for col in df.columns if col not in exclude]

# 80/20 split, stratified on the binary target
train_df, test_df = stratified_split(
    df,
    label_col="good",
    test_frac=0.2,
    seed=42,
)
print(f"Train: {len(train_df)} samples, Test: {len(test_df)} samples")

# Pull the (unscaled) feature matrices and target vectors out of each split
X_train_raw, y_train = train_df[features], train_df["good"]
X_test_raw, y_test = test_df[features], test_df["good"]


Train: 5199 samples, Test: 1298 samples


## Standardize & Save

In [None]:
# Scale both splits with statistics computed on the training split only
scaled_train, scaled_test, means, stds = standardize_features(
    train_df, test_df, features
)

# Re-wrap the scaled frames with the column order pinned to `features`
X_train = pd.DataFrame(scaled_train, columns=features)
X_test = pd.DataFrame(scaled_test, columns=features)

# Make sure the output directory exists
processed_dir = os.path.join("..", "data", "processed")
os.makedirs(processed_dir, exist_ok=True)

# Write each split as features + "good" target, without the index column
for csv_name, X_part, y_part in (
    ("train_processed.csv", X_train, y_train),
    ("test_processed.csv", X_test, y_test),
):
    split_frame = pd.concat([X_part, y_part.rename("good")], axis=1)
    split_frame.to_csv(os.path.join(processed_dir, csv_name), index=False)

# Persist the training means & stds so the same scaling can be reapplied later
for csv_name, stats in (
    ("feature_means.csv", means),
    ("feature_stds.csv", stds),
):
    stats.to_csv(os.path.join(processed_dir, csv_name), header=True)

print("Saved processed data to:", processed_dir)
