# ANA680 – Problem 2A (No Container)
Linear Regression on the combined Wine Quality dataset (red + white) in SageMaker Studio JupyterLab.

In [1]:
# --- Imports (keep all imports here) ---
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib  # if you want to save the model

## Load & Combine Datasets
We load both CSVs, tag them by type (for EDA only), and combine into one DataFrame.

In [2]:
# Read the red and white wine CSVs (semicolon-delimited) from the working
# directory, labelling each frame with its wine type as it is loaded.
# The 'type' label is kept for analysis only; it is not used for training.
red = pd.read_csv("winequality-red.csv", sep=";").assign(type="red")
white = pd.read_csv("winequality-white.csv", sep=";").assign(type="white")

# Stack red on top of white and renumber the rows from 0
df = pd.concat(objs=[red, white], axis=0, ignore_index=True)

# Sanity check: preview, overall shape, and per-type row counts
display(df.head())
print("Combined shape:", df.shape)
df["type"].value_counts()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


Combined shape: (6497, 13)


type
white    4898
red      1599
Name: count, dtype: int64

## Train / Test Split
We drop the `type` column for modeling; the target is `quality`.

In [3]:
# Target is the quality score; predictors are every remaining column except
# the 'type' label, so the model sees only the physicochemical measurements.
y = df["quality"]
X = df.drop(["quality", "type"], axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Show the shapes of the two feature matrices
X_train.shape, X_test.shape

((5197, 11), (1300, 11))

## Train Linear Regression & Evaluate
We fit a scikit-learn LinearRegression and report MSE, RMSE, and R².

In [4]:
# Fit a linear regression on the training split (fit() returns the estimator)
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split
y_pred = lr.predict(X_test)

# Error and goodness-of-fit metrics; RMSE is the square root of MSE
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Report each metric to four decimal places
print(f"MSE : {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²  : {r2:.4f}")

MSE : 0.5467
RMSE: 0.7394
R²  : 0.2598


## Save Trained Model (optional)
The fitted model is saved locally in the Studio space with joblib; you can also upload the file to S3 if desired.

In [5]:
# Persist the fitted model to the current working directory.
# joblib.dump returns the list of file paths it wrote; keeping the call as the
# cell's last expression displays that real list, so no hand-written copy of
# the filename is needed underneath it.
# NOTE: joblib files are pickle-based — only load them from trusted sources.
joblib.dump(lr, "wine_quality_lr_no_container.pkl")

['wine_quality_lr_no_container.pkl']

## Feature Order (for any future API/app)
Record the feature order expected by the model’s `.predict()` so client code can send features in the correct order.

In [6]:
# Column names of the feature matrix X, in the order they appear there
feature_order = X.columns.tolist()
feature_order

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']