# Train and Test Split

Split the dataset into 5 folds using K-fold cross-validation, so that each fold trains on 80% of the data and holds out the remaining 20% as its test set. For each fold, train a simple linear regression model (using gradient descent) on that fold's training set, then evaluate it on that fold's test set using:

- Mean Squared Error (MSE)
- Root Mean Squared Error (RMSE)
- R2 score

In [19]:
import pandas as pd

# Read the pickled wine-quality table (presumably a DataFrame; later cells index it by column).
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
dataset_path = "../data/winequality.pkl"
df = pd.read_pickle(dataset_path)

In [20]:
from sklearn.model_selection import KFold
import numpy as np

# Cross-validation splitter: 5 shuffled folds, seeded so the split is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Target vector.
y = df["quality"].values

# One single-column feature matrix per predictor; the double-bracket selection
# keeps each matrix two-dimensional.
X_alcohol, X_chlorides = (
    df[[column]].values for column in ("alcohol", "chlorides")
)

In [21]:
from sklearn.base import clone
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

def train_splits(X, y, kf, model):
    """
    Generator that trains and evaluates an independent copy of ``model`` on each fold.

    Every fold fits its own ``sklearn.base.clone`` of ``model`` instead of
    re-fitting the one shared instance. This guarantees that no fitted state
    can carry over from one fold (or one feature set) to the next, and leaves
    the estimator passed in by the caller unmodified.

    Parameters
    ----------
    X : array
        Feature matrix; must support indexing with the integer index arrays
        produced by ``kf.split``.
    y : array
        Target values, indexable the same way as ``X``.
    kf : splitter
        Object exposing ``split(X)`` that yields ``(train_index, test_index)``
        pairs (e.g. ``KFold``).
    model : estimator
        Scikit-learn estimator providing ``fit`` and ``predict``; it must be
        clonable with ``sklearn.base.clone``.

    Yields
    ------
    tuple
        ``(mse, rmse, r2)`` computed on the held-out part of each fold, in the
        order the folds are produced by ``kf``.
    """
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Unfitted copy with the same hyper-parameters: folds stay independent.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)

        y_pred = fold_model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        yield mse, rmse, r2

# Linear regressor trained by stochastic gradient descent; the same
# configuration is used for every feature and every fold below.
# NOTE(review): scikit-learn's SGD guide recommends standardizing features
# before fitting; the raw "alcohol"/"chlorides" columns are used as-is in this
# notebook - consider a StandardScaler pipeline and re-running the results.
sgd_params = {
    "max_iter": 1000,               # upper bound on passes over the training data
    "tol": 1e-3,                    # stopping tolerance
    "learning_rate": 'invscaling',  # learning-rate schedule
    "eta0": 0.01,                   # initial learning rate
    "random_state": 42,             # fixed seed for reproducible fits
}
model = SGDRegressor(**sgd_params)

# Per-fold scores using alcohol as the only predictor.
# Each item yielded by train_splits is a (mse, rmse, r2) tuple for one fold.
alcohol_folds = list(train_splits(X_alcohol, y, kf, model))
mse_alc = [fold[0] for fold in alcohol_folds]
rmse_alc = [fold[1] for fold in alcohol_folds]
r2_alc = [fold[2] for fold in alcohol_folds]

# Per-fold scores using chlorides as the only predictor.
chlorides_folds = list(train_splits(X_chlorides, y, kf, model))
mse_chl = [fold[0] for fold in chlorides_folds]
rmse_chl = [fold[1] for fold in chlorides_folds]
r2_chl = [fold[2] for fold in chlorides_folds]


In [22]:
import pandas as pd

def _fold_summary(mse_folds, rmse_folds, r2_folds):
    """Return the mean and (population) variance of each metric across folds."""
    return {
        "MSE mean": np.mean(mse_folds),
        "MSE var": np.var(mse_folds),
        "RMSE mean": np.mean(rmse_folds),
        "RMSE var": np.var(rmse_folds),
        "R2 mean": np.mean(r2_folds),
        "R2 var": np.var(r2_folds),
    }


# One column per predictor, one row per summary statistic.
results = pd.DataFrame({
    "Alcohol": _fold_summary(mse_alc, rmse_alc, r2_alc),
    "Chlorides": _fold_summary(mse_chl, rmse_chl, r2_chl),
})

results

Unnamed: 0,Alcohol,Chlorides
MSE mean,0.552525,0.653667
MSE var,0.005241,0.00452
RMSE mean,0.741718,0.80741
RMSE var,0.00238,0.001755
R2 mean,0.149198,-0.008209
R2 var,0.002416,7e-06
