# Data Preprocessing

## Introduction
This document presents the data preprocessing procedures designed to optimize model performance and maximize predictive capability. The analysis begins with the requisite library imports.

In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Locate the project root: the nearest directory, starting at the current
# working directory and moving upward, that contains a "src" folder.
# Path.parents stops at the filesystem root, so the search always terminates;
# stepping through `.parent` by hand would spin forever at the root, because
# the root's parent is the root itself.
for project_root in (Path.cwd(), *Path.cwd().parents):
    if (project_root / "src").exists():
        break
else:
    raise FileNotFoundError(
        f'No "src" directory found in {Path.cwd()} or any of its parent directories.'
    )

# Make the modules under src/ importable. The membership check keeps sys.path
# free of duplicate entries when this cell is executed more than once.
src_dir = str(project_root / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

## Encoding

Initially, non-numeric columns such as wine type must be encoded appropriately.

In [None]:
# Read both raw files (';'-separated) and tag every row with its wine colour
# so the origin of each sample survives the merge below.
red_wine = pd.read_csv('../data/raw/winequality-red.csv', sep=';').assign(wine_type='red')
white_wine = pd.read_csv('../data/raw/winequality-white.csv', sep=';').assign(wine_type='white')

# Stack the two tables under a fresh 0..n-1 index, then one-hot encode the
# colour tag into integer indicator columns.
wine_data = pd.get_dummies(
    pd.concat([red_wine, white_wine], ignore_index=True),
    columns=['wine_type'],
    dtype=int,
)

# Separate the target ('quality') from the feature columns.
y = wine_data['quality']
X = wine_data.drop(columns='quality')

## Data Splitting
The data must then be partitioned into training and testing sets before any further transformation is applied, to prevent issues such as data leakage.

In [None]:
def split_train_test(X, y, test_size=0.2, random_state=None, stratify=None):
    """Split ``X`` and ``y`` into randomly shuffled train and test subsets.

    Parameters
    ----------
    X, y : pandas DataFrame / Series
        Features and target of equal length. Rows are selected by position
        through ``.iloc``.
    test_size : float, default 0.2
        Fraction of rows, between 0 and 1 inclusive, placed in the test
        subset. The row count is truncated toward zero; with ``stratify`` the
        truncation happens per class.
    random_state : int or None, default None
        Seed for the shuffle. When given, a private ``RandomState`` is used,
        which produces the same shuffle as ``np.random.seed(random_state)``
        but leaves NumPy's global random state untouched. When ``None`` the
        global ``np.random`` state is used.
    stratify : array-like or None, default None
        One class label per row. When given, ``test_size`` is applied to each
        class separately so both subsets keep the class proportions.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Independent copies of the selected rows. With ``stratify`` the rows
        of each subset are grouped by class, in ``np.unique`` order.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]`` or the inputs differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    # A seeded private generator keeps the call reproducible without reseeding
    # the process-wide NumPy RNG as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Leading empty integer array: guarantees an integer dtype for .iloc even
    # when no row at all ends up in one of the subsets.
    no_rows = np.empty(0, dtype=np.intp)

    if stratify is not None:
        # Convert once so the comparison below is element-wise for any
        # array-like; a plain list compared to a scalar is just False.
        labels = np.asarray(stratify)
        if len(labels) != n_samples:
            raise ValueError(
                f"stratify must have one label per row, got {len(labels)} "
                f"labels for {n_samples} rows"
            )

        train_parts = [no_rows]
        test_parts = [no_rows]
        for class_val in np.unique(labels):
            class_indices = np.where(labels == class_val)[0]
            n_class_test = int(len(class_indices) * test_size)

            rng.shuffle(class_indices)

            test_parts.append(class_indices[:n_class_test])
            train_parts.append(class_indices[n_class_test:])

        train_idx = np.concatenate(train_parts)
        test_idx = np.concatenate(test_parts)
    else:
        n_test = int(n_samples * test_size)
        indices = np.arange(n_samples)
        rng.shuffle(indices)

        test_idx = indices[:n_test]
        train_idx = indices[n_test:]

    return (
        X.iloc[train_idx].copy(),
        X.iloc[test_idx].copy(),
        y.iloc[train_idx].copy(),
        y.iloc[test_idx].copy(),
    )


# Stratify on the target so both subsets keep the same distribution of quality
# scores; the fixed seed makes the split reproducible across runs.
X_train, X_test, y_train, y_test = split_train_test(
    X,
    y,
    random_state=42,
    stratify=y,
)

## Feature Engineering
Based on the correlations identified during exploratory analysis, four features will be addressed: *free sulfur dioxide*, *total sulfur dioxide*, *density*, and *alcohol*. The first pair exhibits a correlation of 0.72, and a potentially effective solution involves creating a unified feature by calculating the ratio of free sulfur dioxide to total sulfur dioxide.

In [None]:
# Collapse the two correlated sulfur dioxide columns into a single ratio
# feature (free / total), then discard the originals. The identical
# transformation is applied to the training and the test split.
X_train = X_train.assign(
    **{
        'free sulfur dioxide ratio':
            X_train['free sulfur dioxide'].div(X_train['total sulfur dioxide'])
    }
).drop(columns=['total sulfur dioxide', 'free sulfur dioxide'])

X_test = X_test.assign(
    **{
        'free sulfur dioxide ratio':
            X_test['free sulfur dioxide'].div(X_test['total sulfur dioxide'])
    }
).drop(columns=['total sulfur dioxide', 'free sulfur dioxide'])


For the second pair, the feature with the weaker (lower absolute) correlation to the target variable (*density*) will be removed from the dataset.

In [None]:
# Compare how strongly each feature of the correlated pair tracks the target.
# The correlations are computed on the training split only, so the held-out
# test rows do not influence the feature-selection decision.
alcohol_corr = y_train.corr(X_train['alcohol'])
density_corr = y_train.corr(X_train['density'])

# The values are computed outside the f-strings: reusing the enclosing quote
# character inside a replacement field is a SyntaxError before Python 3.12.
print(f'Alcohol Correlation: {alcohol_corr}')
print(f'Density Correlation: {density_corr}')

# Drop density from both splits so they keep identical columns.
X_train = X_train.drop(columns='density')
X_test = X_test.drop(columns='density')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Minimum absolute correlation a feature pair needs to be drawn on the heatmap.
CORRELATION_THRESHOLD = 0.35

correlation_matrix = X_train.corr(numeric_only=True)

# Hide the upper triangle (diagonal included), which only mirrors the lower
# one, together with every cell whose absolute correlation is below the
# threshold.
triangle_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
weak_corr_mask = correlation_matrix.abs() < CORRELATION_THRESHOLD
combined_mask = triangle_mask | weak_corr_mask

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    mask=combined_mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    ax=ax,
)
ax.set_title(f'Strong Correlations (>{CORRELATION_THRESHOLD})')

fig.tight_layout()
plt.show()

## Scaling

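To facilitate model optimization, the features will be standardized. The scaler is fitted on the training set (`fit_transform`), and the test set is then transformed with that same fitted scaler (`transform`).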

In [None]:
from util import StandardScaler

# Scale both splits with the project's own StandardScaler (imported from util).
# fit_transform is called on the training split, transform on the test split.
# NOTE(review): presumably fit_transform learns the scaling statistics from
# X_train and transform reuses them unchanged for X_test — confirm against
# util.StandardScaler, including whether it returns a DataFrame or an array.
standard_scaler = StandardScaler()
X_train = standard_scaler.fit_transform(X_train)
X_test = standard_scaler.transform(X_test)