# Data Preprocessing

## Introduction
This document presents the data preprocessing procedures designed to optimize model performance and maximize predictive capability. The analysis begins with the requisite library imports.

In [2]:
import pandas as pd
import numpy as np

## Data Splitting
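Before any preprocessing is applied, the data must be partitioned into training and testing sets. Fitting preprocessing steps (e.g., scaling or imputation) on the full dataset would let information from the test set influence training, an issue known as data leakage.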

In [12]:
# Load each colour variant (semicolon-delimited files) and tag every row with
# its origin so the two tables can still be told apart after merging.
red_wine = pd.read_csv('../data/winequality-red.csv', sep=';').assign(wine_type='red')
white_wine = pd.read_csv('../data/winequality-white.csv', sep=';').assign(wine_type='white')

# Stack the two tables row-wise under a fresh 0..n-1 index.
wine_data = pd.concat((red_wine, white_wine), ignore_index=True)

# Target first, then every remaining column becomes a feature.
y = wine_data['quality']
X = wine_data.drop(columns=y.name)


def split_train_test(X, y, test_size=0.2, random_state=None, stratify=None):
    """Split ``X`` and ``y`` into random train and test subsets.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Feature data. Rows are selected positionally through ``.iloc``.
    y : pandas.Series or pandas.DataFrame
        Targets aligned row-for-row with ``X``.
    test_size : float, default 0.2
        Fraction of the samples, in ``[0, 1]``, placed in the test subset.
        Counts are truncated (``int(n * test_size)``).
    random_state : int or None, default None
        Seed for a private random generator, so a seeded call is
        reproducible and leaves the global ``np.random`` state untouched.
        When ``None`` the global ``np.random`` state is used (and advanced).
    stratify : array-like or None, default None
        One class label per row. When given, every class is shuffled and
        split on its own so that roughly ``test_size`` of each class lands
        in the test subset. The returned rows are grouped by class, in the
        sorted order of the labels.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]`` or if ``X``, ``y`` and
        ``stratify`` do not all have the same number of rows.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )

    # A private legacy RandomState yields the same permutations as
    # np.random.seed(seed) followed by np.random.shuffle, without reseeding
    # the process-wide generator behind the caller's back.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    if stratify is not None:
        # Convert once so lists/tuples compare element-wise just like Series.
        labels = np.asarray(stratify)
        if len(labels) != n_samples:
            raise ValueError(
                f"stratify must have one label per row, got {len(labels)} "
                f"labels for {n_samples} rows"
            )

        train_parts = []
        test_parts = []
        for class_val in np.unique(labels):
            class_indices = np.flatnonzero(labels == class_val)
            n_class_test = int(len(class_indices) * test_size)

            rng.shuffle(class_indices)

            test_parts.append(class_indices[:n_class_test])
            train_parts.append(class_indices[n_class_test:])

        # np.concatenate rejects an empty list; fall back to an empty
        # integer index so .iloc always receives integer positions.
        no_rows = np.empty(0, dtype=np.intp)
        train_idx = np.concatenate(train_parts) if train_parts else no_rows
        test_idx = np.concatenate(test_parts) if test_parts else no_rows
    else:
        n_test = int(n_samples * test_size)
        indices = np.arange(n_samples)
        rng.shuffle(indices)

        test_idx = indices[:n_test]
        train_idx = indices[n_test:]

    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]


# Hold out 20% of the rows, stratified on the target so that every quality
# score is represented in both subsets; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = split_train_test(
    X=X,
    y=y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Write the held-out features and targets to disk.
X_test.to_csv(path_or_buf='../data/X_test.csv')
y_test.to_csv(path_or_buf='../data/y_test.csv')