In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np

def random_forest_model(csv_file, target_column='quality', scale=False,
                        quality_threshold=7, test_size=0.2,
                        random_state=42, n_estimators=200):
    """Train a random forest on a binarized target and report test accuracy.

    The CSV is loaded, the target column is turned into a 0/1 label
    (1 when the value is >= ``quality_threshold``), and every other column
    is used as a feature. The rows are split into train and test parts, a
    ``RandomForestClassifier`` is fitted on the training part, and its
    accuracy on the test part is printed and returned.

    Parameters
    ----------
    csv_file : str or path-like
        Comma-separated file passed to ``pd.read_csv``.
    target_column : str, default 'quality'
        Column to binarize and predict; it is removed from the features.
    scale : bool, default False
        If True, standardize the features with ``StandardScaler``. The
        scaler is fitted on the training rows only and then applied to the
        test rows, so no test-set statistics reach the preprocessing step.
    quality_threshold : number, default 7
        Target values greater than or equal to this become class 1.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the forest.
    n_estimators : int, default 200
        Number of trees in the forest.

    Returns
    -------
    dict
        ``{"accuracy": <accuracy on the test split>}``.
    """
    data = pd.read_csv(csv_file, sep=',')

    # Binary label: 1 when the target reaches the threshold, 0 otherwise.
    y = (data[target_column] >= quality_threshold).astype(int)

    # Every remaining column is a feature.
    X = data.drop(columns=target_column)

    # Split BEFORE any fitting so the test rows stay unseen by preprocessing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if scale:
        # Learn mean/std from the training rows only, then reuse them on test.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Train Random Forest
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n✅ Accuracy: {accuracy:.4f}\n")

    return {
        "accuracy": accuracy
    }


In [11]:
# Evaluate the same model on each cleaned wine dataset, keeping every result
# instead of letting the second run overwrite the first.
wine_datasets = {
    "red": "../cleanDatasets/winequality-red_cleaned.csv",
    "white": "../cleanDatasets/winequality-white_cleaned.csv",
}

# Dicts preserve insertion order, so red is evaluated first, then white.
metrics_by_wine = {
    wine: random_forest_model(path, target_column="quality", scale=True)
    for wine, path in wine_datasets.items()
}

# `metrics` still ends up holding the last (white) result, as before.
metrics = metrics_by_wine["white"]


✅ Accuracy: 0.9044


✅ Accuracy: 0.8096

