In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Read the processed dataset from disk
data = pd.read_csv("../data/winequality_processed.csv")

# The 'quality_label' column is the target; every other column is a feature
y = data['quality_label']
X = data.drop(columns='quality_label')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions comparable in both partitions, and the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Two-step pipeline: standardize the features, then fit a random forest.
# Keeping the scaler inside the pipeline means it is re-fit on the training
# portion of every cross-validation fold rather than on all of the data.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])

# Candidate random-forest settings. The 'model__' prefix routes each
# parameter to the pipeline step named 'model' (2 x 3 x 2 = 12 combinations).
param_grid = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 20],
    model__min_samples_split=[2, 5],
)

# Exhaustive search over the grid, scored by accuracy under 5-fold
# cross-validation; n_jobs=-1 runs the fits on all available CPU cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only; the test split is not touched here
grid_search.fit(X_train, y_train)

# best_estimator_ is the pipeline with the best-scoring parameter combination
# (re-fit on the whole training split under the default refit=True)
best_model = grid_search.best_estimator_


In [4]:
import joblib

# Serialize the tuned pipeline to disk. joblib.dump returns the list of file
# names it wrote, which the notebook echoes as this cell's output.
joblib.dump(value=best_model, filename="../data/best_model.pkl")


['../data/best_model.pkl']

## Model Training and Hyperparameter Tuning

### Objective
This notebook implements the machine learning model, trains it on the processed dataset, and performs hyperparameter tuning.

### Key Tasks Performed
- Split the dataset into stratified training (80%) and testing (20%) sets
- Built a machine learning pipeline with feature scaling (`StandardScaler`)
- Trained a Random Forest classifier
- Tuned hyperparameters using GridSearchCV with 5-fold cross-validation
- Saved the best-performing model

### Model Justification
Random Forest was selected due to:
- Robustness to noise
- Ability to model non-linear relationships
- Strong performance on numerical data

### Relevance to Project Requirements
This notebook satisfies the **Model Selection**, **Training Procedure**, and **Hyperparameter Tuning** requirements.