In [1]:
from google.colab import files
import os

# Colab-only step: bring winequality-red.csv into the runtime's working directory.
# files.upload() opens an interactive file picker, which blocks an unattended
# "Restart & Run All". The widget is therefore shown only when the CSV is not
# already on disk; re-running this cell in the same session is then a no-op.
if os.path.exists('winequality-red.csv'):
    # Nothing was uploaded on this run; keep `uploaded` defined with the same
    # mapping type that files.upload() returns.
    uploaded = {}
else:
    uploaded = files.upload()

Saving winequality-red.csv to winequality-red.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge # Used as it is the most stable LR-family model
from sklearn.metrics import r2_score

# --- 0. Configuration ---
# Every tunable value used below is named here so an experiment can be changed
# in one place instead of hunting for literals inside the pipeline.
DATA_PATH = 'winequality-red.csv'
TARGET_COLUMN = 'quality'
TEST_SIZE = 0.2     # fraction of rows held out for evaluation
SEED = 42           # shared seed for the split and the model
POLY_DEGREE = 2     # squared terms plus pairwise interactions
RIDGE_ALPHA = 1.0   # fixed L2 strength; this value is NOT tuned in this cell

# Load the dataset
df = pd.read_csv(DATA_PATH)

# Define features (X) and target (y)
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# Hold out a test split; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# --- 1. Preprocessing Pipeline ---

# a. Standardise the features. The scaler is fitted on the training split only
#    and then applied unchanged to the test split, so no test-set statistics
#    leak into the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# b. Expand to polynomial features of degree POLY_DEGREE (interaction and
#    squared terms). include_bias=False omits the constant column, since the
#    regression model fits its own intercept.
poly_2 = PolynomialFeatures(degree=POLY_DEGREE, include_bias=False)
X_train_poly = poly_2.fit_transform(X_train_scaled)
X_test_poly = poly_2.transform(X_test_scaled)

# --- 2. Model Training and Evaluation ---

# Ridge regression (L2-regularised linear regression) with a single, fixed alpha.
# NOTE(review): no hyper-parameter search happens here; to genuinely optimise
# alpha, cross-validate on the training split rather than scoring on the test set.
ridge_model = Ridge(alpha=RIDGE_ALPHA, random_state=SEED)
ridge_model.fit(X_train_poly, y_train)

# Predict on the held-out split and compute R^2.
y_pred = ridge_model.predict(X_test_poly)
r2 = r2_score(y_test, y_pred)

# The label is kept verbatim so the recorded output still matches; "Maximum"
# refers only to this one configuration, not to the result of a search.
print(f"Maximum R-squared Score for Linear Regression Family: {r2:.4f}")

Maximum R-squared Score for Linear Regression Family: 0.4167
