In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.linear_model import  LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.pipeline import make_pipeline



In [None]:
# Load the wine-quality dataset and take a first look at it.
df = pd.read_csv("winequality.csv")

# Rich (HTML) preview of the first rows instead of a plain-text dump.
display(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit an extra "None" line.
df.info()

In [None]:
# Summary statistics of the numeric columns; left as the last expression so
# the notebook renders it as a formatted table rather than plain text.
df.describe()

In [None]:
# Missing values per column before imputation (bare expression, matching the
# post-imputation check further down).
df.isnull().sum()

In [None]:
# Mean-impute every column except the categorical 'type' label.
for col in df.columns:
    if col == 'type':
        continue
    df[col] = df[col].fillna(df[col].mean())

In [None]:
# Re-count missing values per column after the imputation step.
df.isna().sum()

In [None]:
# One box plot per column (skipping 'type') laid out on a 2 x 6 grid of axes.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
ax = ax.flatten()

index = 0
for col in df.columns:
    if col == 'type':
        continue
    sns.boxplot(data=df, y=col, ax=ax[index])
    index += 1

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Cap outliers in a numeric copy of the data; `df` itself is left untouched.
df_z = df.select_dtypes(include=[np.number]).copy()

# A value further than `threshold` standard deviations from its column mean
# is treated as an outlier.
threshold = 3

# Cap each column at mean +/- threshold * std.
for col in df_z.columns:
    col_mean = df_z[col].mean()
    col_std = df_z[col].std()
    upper_limit = col_mean + threshold * col_std
    lower_limit = col_mean - threshold * col_std

    # clip() decides "outlier or not" from the same limits it caps to.
    # The previous version flagged outliers with scipy's zscore (population
    # std, ddof=0) while the limits used pandas' .std() (sample std, ddof=1);
    # a high value falling between the two cut-offs was flagged, failed the
    # `> upper_limit` test and was wrongly replaced by the *lower* limit.
    df_z[col] = df_z[col].clip(lower=lower_limit, upper=upper_limit)

# Now df_z contains data with outliers capped
print("✅ Outliers handled using Z-score method.")

# Optionally, you can replace the original df values:
# df[df_z.columns] = df_z


In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Distribution (histogram + KDE) of every column except 'type' on a 2 x 6 grid.
fig, ax = plt.subplots(ncols = 6, nrows = 2 , figsize = (20,10))
index = 0
ax = ax.flatten()

for col, value in df.items():
    if col != 'type':
        # sns.distplot is deprecated; histplot with a KDE overlay and density
        # scaling is the replacement that produces the same kind of figure.
        sns.histplot(value, kde=True, stat="density", ax=ax[index])
        index +=1
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Feature matrix: every column except the wine type and the target.
X = df.drop(['type', 'quality'], axis=1)
# Target vector: the quality score.
y = df.loc[:, 'quality']

In [None]:
# Number of samples per quality class, most frequent first.
y.value_counts(sort=True, ascending=False)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the quality classes with SMOTE (4 nearest neighbours).
# random_state pins the synthetic samples so that Restart & Run All
# reproduces exactly the same resampled dataset.
oversample = SMOTE(k_neighbors=4, random_state=42)
# transform the dataset
X, y = oversample.fit_resample(X, y)

In [None]:
# Number of samples per quality class after resampling, most frequent first.
y.value_counts(sort=True, ascending=False)

In [None]:
# Treat infinities as missing, then drop every row that has a missing value.
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Normalise column names by removing leading/trailing whitespace.
df.columns = df.columns.str.strip()

# One-hot encode the categorical 'type' column (first level dropped); the
# guard keeps this step from failing once 'type' has already been encoded.
if 'type' in df.columns:
    df = pd.get_dummies(df, columns=['type'], drop_first=True)

# Regression inputs: one-feature and two-feature design matrices plus target.
X_simple = df[['free sulfur dioxide']]
X_multi = df[['free sulfur dioxide', 'type_white']]
y = df['total sulfur dioxide']

In [None]:
# Simple linear regression: total sulfur dioxide ~ free sulfur dioxide.

# Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(X_simple, y, test_size=0.25, random_state=42)

# Train model
model_simple = LinearRegression()
model_simple.fit(x_train, y_train)
y_pred = model_simple.predict(x_test)

# Evaluation metrics on the held-out rows
mse_simple = mean_squared_error(y_test, y_pred)
rmse_simple = np.sqrt(mse_simple)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results (the header names the feature actually used, X_simple's
# 'free sulfur dioxide' column -- it previously said 'residual sugar').
print("📘 Simple Linear Regression on 'free sulfur dioxide'")
print(f"R² Score      : {r2:.4f}")
print(f"MSE           : {mse_simple:.4f}")
print(f"RMSE          : {rmse_simple:.4f}")
print(f"MAE           : {mae:.4f}")
print(f"MAPE          : {mape * 100:.2f}%")


In [None]:
# Actual vs. predicted target values for the held-out rows, in test-set order.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(y_test)), y_test.values, color='blue', label='Actual')
plt.plot(np.arange(len(y_test)), y_pred, color='red', alpha=0.6, label='Predicted')
plt.title("🔍 Simple Linear Regression Line Graph")
plt.xlabel("Index")
# The regression target is 'total sulfur dioxide', not the quality score.
plt.ylabel("Total sulfur dioxide")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# Multiple linear regression on the two-column design matrix X_multi.
x_train, x_test, y_train, y_test = train_test_split(
    X_multi, y, test_size=0.25, random_state=42
)

# fit() returns the estimator itself, so construction and training chain.
model_multi = LinearRegression().fit(x_train, y_train)
y_pred = model_multi.predict(x_test)

# Held-out evaluation metrics.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

# Report, one metric per line.
for line in (
    "📘 Multiple Linear Regression",
    f"R² Score: {r2:.4f}",
    f"MSE     : {mse:.4f}",
    f"RMSE    : {rmse:.4f}",
    f"MAE     : {mae:.4f}",
    f"MAPE    : {mape * 100:.2f}%",
):
    print(line)

In [None]:
# Actual vs. predicted target values for the held-out rows, in test-set order.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(y_test)), y_test.values, color='blue', label='Actual')
plt.plot(np.arange(len(y_test)), y_pred, color='orange', alpha=0.6, label='Predicted')
plt.title('Multiple Linear Regression – Line Graph Over Index')
plt.xlabel('Index')
# The regression target is 'total sulfur dioxide', not the quality score.
plt.ylabel('Total sulfur dioxide')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Polynomial regression: total sulfur dioxide ~ polynomial terms of
# free sulfur dioxide.

# Polynomial transformation (Degree 2)
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_simple)

# Hold out 25% of the rows for evaluation (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.25, random_state=42)

# Train model
model_poly = LinearRegression()
model_poly.fit(x_train, y_train)
y_pred = model_poly.predict(x_test)

# Evaluation metrics on the held-out rows
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print results. The header previously claimed degree 4 on 'residual sugar';
# the degree is read from the transformer so the text cannot drift from the
# model, and the feature named is the one X_simple actually holds.
print(f"📘 Polynomial Regression (Degree = {poly.degree}) on 'free sulfur dioxide'")
print(f"R² Score: {r2:.4f}")
print(f"MSE     : {mse:.4f}")
print(f"RMSE    : {rmse:.4f}")
print(f"MAE     : {mae:.4f}")
print(f"MAPE    : {mape * 100:.2f}%")

In [None]:
# Actual vs. predicted target values for the held-out rows, in test-set order.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(y_test)), y_test.values, color='blue', label='Actual')
plt.plot(np.arange(len(y_test)), y_pred, color='green', alpha=0.6, label='Predicted (Poly)')
# Degree comes from the fitted transformer (it is 2, not 4 as the title used to say).
plt.title(f'Polynomial Regression (Degree {poly.degree}) – Line Graph Over Index')
plt.xlabel('Index')
# The regression target is 'total sulfur dioxide', not the quality score.
plt.ylabel('Total sulfur dioxide')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Per-model metrics for the comparison below.
#
# The generic names `mse`, `rmse` and `mape` are reassigned by every model
# cell, so at this point they only hold the polynomial model's values; copying
# them into the *_simple / *_multi variables would silently report the wrong
# model. Each metric is therefore recomputed from the fitted estimator.
# NOTE: this cell needs the *fitted* models, so run it before any cell that
# rebinds `model_simple`, `model_multi` or `model_poly`.


def _holdout_metrics(model, features):
    """Return (MSE, RMSE, MAPE in %) of a fitted model on the held-out split.

    The split is re-created with the same test_size and random_state as the
    training cells, which reproduces the rows each model was evaluated on.

    Parameters:
        model: fitted estimator exposing ``predict``.
        features: design matrix the model was trained from (same row order
            and length as ``y``).
    """
    _, features_test, _, target_test = train_test_split(
        features, y, test_size=0.25, random_state=42
    )
    predictions = model.predict(features_test)
    mse_value = mean_squared_error(target_test, predictions)
    return (
        mse_value,
        np.sqrt(mse_value),
        mean_absolute_percentage_error(target_test, predictions) * 100,
    )


# Simple Linear Regression
mse_simple, rmse_simple, mape_simple = _holdout_metrics(model_simple, X_simple)

# Multiple Linear Regression
mse_multi, rmse_multi, mape_multi = _holdout_metrics(model_multi, X_multi)

# Polynomial Regression
mse_poly, rmse_poly, mape_poly = _holdout_metrics(model_poly, X_poly)


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart comparing the error metrics of the three models.

# Model labels
models = ['Simple Linear', 'Multiple Linear', 'Polynomial (deg=2)']

# These are the mean *squared* errors collected above; they were previously
# stored and plotted under the label "MAE", which misreported the metric.
mse_values = [mse_simple, mse_multi, mse_poly]
rmse_values = [rmse_simple, rmse_multi, rmse_poly]


# Bar width and x-axis setup
x = np.arange(len(models))  # the label locations
bar_width = 0.35

# Plot
plt.figure(figsize=(8, 6))
plt.bar(x - bar_width/2, mse_values, bar_width, label='MSE', color='skyblue')
plt.bar(x + bar_width/2, rmse_values, bar_width, label='RMSE', color='salmon')

# Labels and formatting (the target is total sulfur dioxide, not a currency)
plt.ylabel('Error (total sulfur dioxide units; MSE is squared)')
plt.title('Model Comparison - Error Metrics (Lower is better)')
plt.xticks(x, models)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.6)

# Value labels sit directly on top of each bar. The former fixed "+ 500"
# offset was unrelated to the magnitude of these metrics and pushed the labels
# far away from the bars; values are also rounded to two decimals.
for i in range(len(models)):
    plt.text(x[i] - bar_width/2, mse_values[i], f'{mse_values[i]:,.2f}', ha='center', va='bottom')
    plt.text(x[i] + bar_width/2, rmse_values[i], f'{rmse_values[i]:,.2f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [None]:


# K-fold cross-validation of the multiple linear regression for K = 3..10.
# Assuming X_multi and y are already defined

# A dedicated, unfitted estimator is used for cross-validation so the fitted
# `model_multi` from the training cell is not overwritten by a blank model.
cv_model = LinearRegression()

# Store results
split_range = range(3, 11)
average_r2_scores = []

for n in split_range:
    # Shuffled folds with a fixed seed so the scores are reproducible.
    kfold = KFold(n_splits=n, shuffle=True, random_state=42)
    scores = cross_val_score(cv_model, X_multi, y, cv=kfold, scoring='r2')
    average_score = scores.mean()
    average_r2_scores.append(average_score)

    # Print individual R2 scores
    print(f"\nK-Fold Split = {n}")
    print("R2 Scores:", scores)
    print("Average R2 Score:", average_score)

# Plotting the average R² scores
plt.figure(figsize=(8, 5))
plt.plot(split_range, average_r2_scores, marker='o', color='blue', linestyle='-')
plt.title('Average R² Score vs. Number of K-Folds')
plt.xlabel('Number of Splits (K)')
plt.ylabel('Average R² Score')
plt.grid(True)
plt.xticks(split_range)
plt.tight_layout()
plt.show()
