In [None]:

import pandas as pd
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    train_test_split,
    ShuffleSplit,
    KFold,
    LeaveOneOut
)
from sklearn.pipeline import make_pipeline


# Load the semicolon-separated white-wine quality CSV.
# NOTE(review): absolute path; adjust it when running outside the environment
# that provides /content.
file_path = '/content/winequality-white.csv'
df = pd.read_csv(file_path, sep=';')


# Row count before dropping incomplete rows.
print(f'Number of rows before removing missing values: {df.shape[0]}')


# Drop every row that contains at least one missing value. `df` is rebound,
# so the raw frame is no longer available after this line.
df = df.dropna()


print(f'Number of rows after removing missing values: {df.shape[0]}')


# Same count as the line above; printed again as the baseline for the
# outlier-removal step that follows.
print(f'Number of rows before outlier removal: {df.shape[0]}')


def remove_outliers_iqr(df, exclude_columns=None):
    """Drop rows lying outside the 1.5 * IQR fences of any numeric column.

    Columns are processed one after another (in the sorted order produced by
    ``Index.difference``), and each column's quartiles are computed on the
    rows that survived the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    exclude_columns : list, optional
        Column names that must not be filtered. Defaults to no exclusions.

    Returns
    -------
    pandas.DataFrame
        The filtered copy, keeping the original index labels of kept rows.
    """
    skipped = [] if exclude_columns is None else exclude_columns
    filtered = df.copy()
    candidates = filtered.select_dtypes(include=['number']).columns.difference(skipped)

    for name in candidates:
        first_quartile = filtered[name].quantile(0.25)
        third_quartile = filtered[name].quantile(0.75)
        spread = third_quartile - first_quartile
        # Inclusive fences: values exactly on a fence are kept.
        inside_fences = filtered[name].between(
            first_quartile - 1.5 * spread,
            third_quartile + 1.5 * spread,
        )
        filtered = filtered[inside_fences]

    return filtered


# Apply the IQR filter to every numeric column of the frame.
# NOTE(review): no `exclude_columns` is passed, so the 'quality' column (used
# as the regression target below) is filtered as well, and the filter runs
# before the train/test split. Confirm this is intended; otherwise pass
# exclude_columns=['quality'].
df = remove_outliers_iqr(df)


print(f'Number of rows after outlier removal: {df.shape[0]}')

def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Return the symmetric mean absolute percentage error (SMAPE) in percent.

    SMAPE = mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values. May have a different shape than ``y_true`` as long
        as it holds the same number of elements (e.g. ``(n, 1)`` vs ``(n,)``).

    Returns
    -------
    float
        SMAPE as a percentage. Pairs where both values are zero contribute 0.
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)

    # A 1-D target combined with a column-vector prediction (shape (n, 1), as
    # some regressors return) would broadcast to an (n, n) matrix and yield a
    # meaningless average. Flatten both when they hold the same number of
    # elements but disagree on shape so they are compared pairwise.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # The denominator is zero only when both values are zero, in which case
    # the numerator is zero too; epsilon merely avoids a 0/0 division.
    epsilon = 1e-10
    denominator = np.where(denominator == 0, epsilon, denominator)
    return np.mean(numerator / denominator) * 100


# Features are every column except 'quality'; 'quality' is the target.
X = df.drop(columns=['quality'])
y = df['quality']


# Hold out 20% of the rows as the final test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Standardise the features, then fit a PLS regression with 9 components.
pipeline = make_pipeline(StandardScaler(), PLSRegression(n_components=9))


# Maps a validation-technique label to its dict of averaged metrics.
validation_results = {}


def compute_metrics(model, X, y, cv, compute_r2=True):
    """Cross-validate ``model`` and return the fold-averaged error metrics.

    For every (train, validation) index pair produced by ``cv.split(X)`` the
    model is refit in place on the training rows and scored on the validation
    rows.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refit on each fold.
    X, y : pandas objects
        Features and target, indexed positionally with ``.iloc``.
    cv : splitter
        Object whose ``split(X)`` yields positional train/validation indices.
    compute_r2 : bool, default True
        When False, R2 is neither computed nor included in the result.

    Returns
    -------
    dict
        Keys 'MAE', 'RMSE', 'SMAPE' (and 'R2' when requested), each holding
        the mean of the per-fold values.
    """
    per_fold = {'MAE': [], 'RMSE': [], 'SMAPE': []}
    if compute_r2:
        per_fold['R2'] = []

    for fit_rows, holdout_rows in cv.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        y_holdout = y.iloc[holdout_rows]
        y_hat = model.predict(X.iloc[holdout_rows])

        per_fold['MAE'].append(mean_absolute_error(y_holdout, y_hat))
        # Note: on a single-sample fold the RMSE equals the absolute error.
        per_fold['RMSE'].append(np.sqrt(mean_squared_error(y_holdout, y_hat)))
        per_fold['SMAPE'].append(
            symmetric_mean_absolute_percentage_error(y_holdout, y_hat)
        )
        if compute_r2:
            per_fold['R2'].append(r2_score(y_holdout, y_hat))

    return {metric_name: np.mean(values) for metric_name, values in per_fold.items()}

# 1. Random cross-validation (ShuffleSplit): 5 splits holding out 20% and
#    10 splits holding out 10%.
random_cv_settings = (
    ('Random CV (5 splits)', 5, 0.2),
    ('Random CV (10 splits)', 10, 0.1),
)
for random_cv_label, n_random_splits, holdout_fraction in random_cv_settings:
    random_cv = ShuffleSplit(
        n_splits=n_random_splits, test_size=holdout_fraction, random_state=42
    )
    scores = compute_metrics(pipeline, X_train, y_train, random_cv)
    validation_results[random_cv_label] = scores

# 2. Shuffled K-Fold cross-validation with several fold counts.
for n_splits in (5, 7, 10):
    kfold_shuffle_cv = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = compute_metrics(pipeline, X_train, y_train, kfold_shuffle_cv)
    validation_results[f'K-Fold ({n_splits} folds, shuffled)'] = scores

# 3. Leave-One-Out cross-validation; R2 is skipped for these one-row folds.
loo_cv = LeaveOneOut()
scores = compute_metrics(pipeline, X_train, y_train, loo_cv, compute_r2=False)
validation_results['Leave-One-Out'] = scores

# 4. Venetian-blinds cross-validation: row k of the training set belongs to
#    blind k % num_partitions, and each blind is held out once.
num_partitions = 10
indices = np.arange(len(X_train))
blind_of_row = indices % num_partitions
venetian_mae_scores, venetian_rmse_scores = [], []
venetian_smape_scores, venetian_r2_scores = [], []

for i in range(num_partitions):
    test_mask_vb = blind_of_row == i
    train_mask_vb = ~test_mask_vb

    X_train_vb, y_train_vb = X_train.iloc[train_mask_vb], y_train.iloc[train_mask_vb]
    X_val_vb, y_val_vb = X_train.iloc[test_mask_vb], y_train.iloc[test_mask_vb]

    pipeline.fit(X_train_vb, y_train_vb)
    y_pred_vb = pipeline.predict(X_val_vb)

    venetian_mae_scores.append(mean_absolute_error(y_val_vb, y_pred_vb))
    venetian_rmse_scores.append(np.sqrt(mean_squared_error(y_val_vb, y_pred_vb)))
    venetian_smape_scores.append(
        symmetric_mean_absolute_percentage_error(y_val_vb, y_pred_vb)
    )
    venetian_r2_scores.append(r2_score(y_val_vb, y_pred_vb))

validation_results['Venetian Blinds'] = {
    metric_name: np.mean(fold_values)
    for metric_name, fold_values in (
        ('MAE', venetian_mae_scores),
        ('RMSE', venetian_rmse_scores),
        ('SMAPE', venetian_smape_scores),
        ('R2', venetian_r2_scores),
    )
}

# 5. K-Fold cross-validation with 7 contiguous (unshuffled) folds.
kfold_cv = KFold(n_splits=7, shuffle=False)
scores = compute_metrics(pipeline, X_train, y_train, kfold_cv)
validation_results['K-Fold (7 folds, no shuffle)'] = scores


def _score_predictions(y_true, y_pred):
    """Return the (MAE, RMSE, SMAPE, R2) tuple for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        symmetric_mean_absolute_percentage_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Refit on the full training set, then score it on the training rows and on
# the held-out test rows.
pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)
mae_train, rmse_train, smape_train, r2_train = _score_predictions(y_train, y_train_pred)

y_test_pred = pipeline.predict(X_test)
mae_test, rmse_test, smape_test, r2_test = _score_predictions(y_test, y_test_pred)

# Absolute train-vs-test gap per metric; used later as the reference against
# which each validation technique's gap to the test score is compared.
delta_mae_train, delta_rmse_train, delta_smape_train, delta_r2_train = (
    abs(train_score - test_score)
    for train_score, test_score in zip(
        (mae_train, rmse_train, smape_train, r2_train),
        (mae_test, rmse_test, smape_test, r2_test),
    )
)


def _precision_vs_training(delta, training_delta):
    """Return (1 - delta / training_delta) * 100, or 0 when training_delta is zero."""
    if training_delta != 0:
        return (1 - (delta / training_delta)) * 100
    return 0


# Two reference rows: the training-set scores (precision fixed at 0%) and the
# test-set scores (deltas fixed at 0, precision fixed at 100%).
results_data = [
    {
        'Technique': 'Training Set',
        'MAE': mae_train,
        'RMSE': rmse_train,
        'SMAPE': smape_train,
        'R2': r2_train,
        'Delta MAE': delta_mae_train,
        'Delta RMSE': delta_rmse_train,
        'Delta SMAPE': delta_smape_train,
        'Delta R2': delta_r2_train,
        'Precision MAE (%)': 0.0,
        'Precision RMSE (%)': 0.0,
        'Precision SMAPE (%)': 0.0,
        'Precision R2 (%)': 0.0,
        'Overall Precision (%)': 0.0,
    },
    {
        'Technique': 'Test Set',
        'MAE': mae_test,
        'RMSE': rmse_test,
        'SMAPE': smape_test,
        'R2': r2_test,
        'Delta MAE': 0.0,
        'Delta RMSE': 0.0,
        'Delta SMAPE': 0.0,
        'Delta R2': 0.0,
        'Precision MAE (%)': 100.0,
        'Precision RMSE (%)': 100.0,
        'Precision SMAPE (%)': 100.0,
        'Precision R2 (%)': 100.0,
        'Overall Precision (%)': 100.0,
    },
]

# One row per validation technique: its scores, their absolute gap to the test
# scores, and that gap expressed relative to the train-vs-test gap.
for method, metrics in validation_results.items():
    mae_val, rmse_val, smape_val = metrics['MAE'], metrics['RMSE'], metrics['SMAPE']
    r2_val = metrics.get('R2')
    has_r2 = r2_val is not None

    delta_mae = abs(mae_val - mae_test)
    delta_rmse = abs(rmse_val - rmse_test)
    delta_smape = abs(smape_val - smape_test)
    delta_r2 = abs(r2_val - r2_test) if has_r2 else 'N/A'

    precision_mae = _precision_vs_training(delta_mae, delta_mae_train)
    precision_rmse = _precision_vs_training(delta_rmse, delta_rmse_train)
    precision_smape = _precision_vs_training(delta_smape, delta_smape_train)
    # R2 precision is reported as 'N/A' both when the technique has no R2 and
    # when the training reference gap is zero.
    if has_r2 and delta_r2_train != 0:
        precision_r2 = (1 - (delta_r2 / delta_r2_train)) * 100
    else:
        precision_r2 = 'N/A'

    precision_values = [precision_mae, precision_rmse, precision_smape]
    if isinstance(precision_r2, float):
        precision_values.append(precision_r2)
    overall_precision = np.mean(
        [p for p in precision_values if isinstance(p, (int, float))]
    )

    results_data.append({
        'Technique': method,
        'MAE': mae_val,
        'RMSE': rmse_val,
        'SMAPE': smape_val,
        'R2': r2_val if has_r2 else 'N/A',
        'Delta MAE': delta_mae,
        'Delta RMSE': delta_rmse,
        'Delta SMAPE': delta_smape,
        'Delta R2': delta_r2,
        'Precision MAE (%)': precision_mae,
        'Precision RMSE (%)': precision_rmse,
        'Precision SMAPE (%)': precision_smape,
        'Precision R2 (%)': precision_r2,
        'Overall Precision (%)': overall_precision,
    })

results_table = pd.DataFrame(results_data)

# Columns that hold numbers in every row.
metrics_to_format = ['MAE', 'RMSE', 'SMAPE', 'Delta MAE', 'Delta RMSE', 'Delta SMAPE',
                     'Precision MAE (%)', 'Precision RMSE (%)', 'Precision SMAPE (%)',
                     'Overall Precision (%)']


def _format_metric(x):
    """Render ints/floats with four decimals; return anything else unchanged."""
    return f"{x:.4f}" if isinstance(x, (int, float)) else x


def format_r2(x):
    """Render floats with four decimals; return anything else (e.g. 'N/A') unchanged."""
    if isinstance(x, float):
        return f"{x:.4f}"
    else:
        return x


results_table_formatted = results_table.copy()

# Format column by column with Series.map instead of DataFrame.applymap:
# applymap is deprecated since pandas 2.1 (in favour of DataFrame.map, which
# older pandas lacks), whereas Series.map behaves the same on old and new
# versions.
for metric_column in metrics_to_format:
    results_table_formatted[metric_column] = (
        results_table_formatted[metric_column].map(_format_metric)
    )

# The R2-related columns may mix floats with the 'N/A' placeholder.
for r2_column in ['R2', 'Delta R2', 'Precision R2 (%)']:
    results_table_formatted[r2_column] = results_table_formatted[r2_column].apply(format_r2)

print("\nFormatted Results:")
print(results_table_formatted)



In [None]:

import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt




# Candidate component counts: 1 up to the number of feature columns.
param_grid = {
    'plsregression__n_components': list(range(1, X_train.shape[1] + 1))
}

# Fresh scaler + PLS pipeline (this rebinds the name `pipeline`).
pipeline = make_pipeline(StandardScaler(), PLSRegression())

# 10-fold grid search scored by negative MAE; train scores are kept so the
# validation curve can show both lines.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=10,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
print(f"Best number of components: {grid_search.best_params_['plsregression__n_components']}")

# Flip the sign so the curves show MAE rather than negative MAE.
n_components = param_grid['plsregression__n_components']
mean_test_scores = -grid_search.cv_results_['mean_test_score']
mean_train_scores = -grid_search.cv_results_['mean_train_score']

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(n_components, mean_train_scores, label='Training MAE', marker='o')
ax.plot(n_components, mean_test_scores, label='Cross-Validation MAE', marker='s')
ax.set_xlabel('Number of PLS Components')
ax.set_ylabel('Mean Absolute Error')
ax.set_title('Validation Curve for PLSRegression')
ax.legend()
ax.grid(True)
plt.show()



In [None]:
# Write the formatted results table to a CSV file in the working directory
# (the row index is not written).
results_table_formatted.to_csv('WinePLSR_N.csv', index=False)
# NOTE(review): `google.colab` is presumably only importable inside Google
# Colab; this cell is expected to fail in other environments.
from google.colab import files
# Hand the CSV written above to the Colab file-download helper.
files.download('WinePLSR_N.csv')