In [1]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- Load ---------------------------------------------------------------
# Both CSV files are semicolon-delimited. Each frame is tagged with its wine
# type straight after loading so the origin survives the merge below.
df_red = pd.read_csv('winequality-red.csv', sep=';').assign(wine_type='red')
df_white = pd.read_csv('winequality-white.csv', sep=';').assign(wine_type='white')

# --- Combine and encode -------------------------------------------------
# Stack red on top of white with a fresh 0..n-1 index, then replace the
# string `wine_type` column with dummy columns. With two categories and
# drop_first=True, a single indicator column remains.
df = pd.get_dummies(
    pd.concat([df_red, df_white], ignore_index=True),
    columns=['wine_type'],
    drop_first=True,
)

# --- Features / target --------------------------------------------------
# `quality` is the target; every remaining column is a feature.
y = df['quality']
X = df.drop(columns='quality')

# --- Train/test split (before scaling) ----------------------------------
# 20% of the rows are held out, stratified on the target so both partitions
# keep the same class proportions. The fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Standardize (after the split) --------------------------------------
# The scaler learns its statistics from the training rows only; the test
# rows are transformed with those same statistics, so nothing about the
# test set influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Final confirmation -------------------------------------------------
print(
    f"X_train_scaled shape: {X_train_scaled.shape}",
    f"X_test_scaled shape: {X_test_scaled.shape}",
    f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}",
    sep="\n",
)


X_train_scaled shape: (5197, 12)
X_test_scaled shape: (1300, 12)
y_train shape: (5197,), y_test shape: (1300,)


In [9]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# This cell reloads and re-preprocesses the data itself, so every variable it
# uses (apart from the imports above) is defined right here.

# Load datasets (both files are semicolon-delimited)
df_red = pd.read_csv('winequality-red.csv', sep=';')
df_white = pd.read_csv('winequality-white.csv', sep=';')

# Add wine type column so the origin of each row survives the merge
df_red['wine_type'] = 'red'
df_white['wine_type'] = 'white'

# Combine datasets under a fresh 0..n-1 index
df = pd.concat([df_red, df_white], ignore_index=True)

# One-hot encode wine type; with two categories and drop_first=True a single
# indicator column remains
df = pd.get_dummies(df, columns=['wine_type'], drop_first=True)

# Separate features and target
X = df.drop('quality', axis=1)
y = df['quality']

# Train/test split: 20% held out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features AFTER the split: statistics come from the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# XGBClassifier is trained on zero-based class labels, so the quality scores
# are shifted down by the smallest training label (usually 3). The offset is
# computed once and reused to shift predictions back.
# NOTE(review): a plain shift yields 0..n_classes-1 only if the training
# labels are consecutive integers - confirm this holds for the data.
label_offset = y_train.min()
y_train_shifted = y_train - label_offset
# Not used by the loop below; kept so that any later cell referencing it
# keeps working.
y_test_shifted = y_test - label_offset

# Model definitions: display name -> unfitted estimator.
# `use_label_encoder` is intentionally not passed to XGBClassifier: the
# installed XGBoost reports it as an unused parameter and emits a warning.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
}
model_names = list(models)

# Models that must be fit on the shifted (zero-based) labels
needs_zero_based_labels = {"XGBoost"}

# Initialize results list
results = []

# Model training and evaluation loop
for name, model in models.items():
    if name in needs_zero_based_labels:
        model.fit(X_train_scaled, y_train_shifted)
        # Shift predictions back so they are comparable with y_test
        y_pred = model.predict(X_test_scaled) + label_offset
    else:
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

    # Macro averaging weights every quality class equally; zero_division=0
    # scores a class with no predicted (or no true) samples as 0 instead of
    # raising a warning.
    precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

    results.append({
        "Model": name,
        "Precision (macro)": round(precision, 3),
        "Recall (macro)": round(recall, 3),
        "F1 Score (macro)": round(f1, 3)
    })

# Show results, best macro-F1 first
results_df = pd.DataFrame(results).sort_values(by="F1 Score (macro)", ascending=False).reset_index(drop=True)
print("\nModel Comparison:")
print(results_df)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model Comparison:
                 Model  Precision (macro)  Recall (macro)  F1 Score (macro)
0        Random Forest              0.510           0.363             0.392
1              XGBoost              0.444           0.356             0.380
2  Logistic Regression              0.378           0.228             0.233
