## Preparando os Preditores

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge

In [2]:
wine_quality = fetch_ucirepo(id=186)

X = wine_quality.data.features # Preditores
Y = wine_quality.data.targets # Output

# metadata
print(wine_quality.metadata)

print(wine_quality.variables)

{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

In [48]:
df_white = pd.read_csv("../dataset/winequality-white.csv", sep=';',index_col=False)
df_white['type'] = 1
df_red = pd.read_csv("../dataset/winequality-red.csv", sep=';',index_col=False)
df_red['type'] = 0
df_wine = pd.concat([df_red, df_white], ignore_index=True)
df_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,0
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,1
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,1
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1


In [63]:
X = df_wine.drop("citric acid", axis=1)
y = df_wine["citric acid"]

In [64]:
# shuffle indexes
idx = np.arange(len(df_wine))
np.random.seed(42)
np.random.shuffle(idx)

# 80% train, 20% test
split_point = int(0.8 * len(df_wine))

train_idx = idx[:split_point]
test_idx  = idx[split_point:]

X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]

X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [65]:
continuous_cols = [col for col in X_train.columns if col != 'type']
binary_cols = ['type']  # ou a coluna que for binária

# Normaliza só as contínuas
scaler = (X_train[continuous_cols].mean(), X_train[continuous_cols].std())

X_train_cont = (X_train[continuous_cols] - scaler[0]) / scaler[1]
X_test_cont  = (X_test[continuous_cols]  - scaler[0]) / scaler[1]

# Junta de volta com a coluna binária (sem mexer nela)
X_train_scaled = pd.concat([X_train_cont, X_train[binary_cols]], axis=1)
X_test_scaled  = pd.concat([X_test_cont,  X_test[binary_cols]],  axis=1)


##Learning a L2 or L1 penalised linear regression

### Ridge Function

#### Ridge Implementado

In [66]:
def custom_ridge_regression(X, y, lamb):
  n_features = X.shape[1]
  I = np.eye(n_features)
  return np.linalg.inv(X.T @ X + lamb * I) @ (X.T @ y)


Adicionando o Intercepto:

In [67]:
X_train_adj = np.hstack([np.ones((X_train_scaled.shape[0], 1)), X_train_scaled])
X_test_adj  = np.hstack([np.ones((X_test_scaled.shape[0], 1)), X_test_scaled])

Treinando:

In [68]:
w = custom_ridge_regression(X_train_adj, y_train, lamb=0.1)

In [69]:
y_pred1 = X_test_adj @ w

y_pred1 = pd.Series(y_pred1, index=y_test.index)

Comparando o y_pred1 com o y_test

In [70]:
print(y_pred1)

513     0.429650
4177    0.244424
1308    0.293479
914     0.260445
3452    0.408462
          ...   
3772    0.374643
5191    0.257978
5226    0.323295
5390    0.335704
860     0.159511
Length: 1300, dtype: float64


In [71]:
print(y_test)

513     0.64
4177    0.27
1308    0.32
914     0.39
3452    0.39
        ... 
3772    0.58
5191    0.27
5226    0.20
5390    0.50
860     0.06
Name: citric acid, Length: 1300, dtype: float64


Calculando o RMSE:

In [72]:
rmse_custom = np.sqrt(np.mean((y_test.values - y_pred1)**2))
print(rmse_custom)

0.11891108088722914


Calculando o $R^2$

In [73]:
ss_res = np.sum((y_test - y_pred1)**2)
ss_tot = np.sum((y_test - np.mean(y_test))**2)

r2_custom = 1 - ss_res / ss_tot
print("R² custom:", r2_custom)

R² custom: 0.36191361649160725


#### Ridge da Biblioteca

In [74]:
model_rid = Ridge(alpha=0.1)   # Atenção!!! Nesse caso o lambda = alpha
model_rid.fit(X_train_scaled, y_train)

0,1,2
,alpha,0.1
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [75]:
y_pred2 = model_rid.predict(X_test_scaled)


RMSE da biblioteca

In [76]:
rmse_lib = np.sqrt(np.mean((y_test.values - y_pred2)**2))
print(rmse_lib)

0.1189110684542083


$R^2$ da biblioteca

In [98]:
ss_res = np.sum((y_test - y_pred2)**2)
ss_tot = np.sum((y_test - np.mean(y_test))**2)

r2_lib = 1 - ss_res / ss_tot
print("R² custom:", r2_lib)

R² custom: 0.36191374992477143


#### Diferenças entre os dois RIDGEs

Diferença entre os RMSEs

In [78]:
diff_abs = abs(rmse_custom - rmse_lib)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(rmse_custom - rmse_lib) / rmse_lib * 100
print(f"Diferença percentual: {diff_percent:.5f}%")

ratio = rmse_custom / rmse_lib
print("Razão (custom / lib):", ratio)

Diferença absoluta: 1.2433020835089792e-08
Diferença percentual: 0.00001%
Razão (custom / lib): 1.0000001045573048


Diferença entre os $R^2$

In [79]:
diff_abs = abs(r2_custom - r2_lib)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(r2_custom - r2_lib) / r2_lib * 100
print(f"Diferença percentual: {diff_percent:.5f}%")

ratio = r2_custom / r2_lib
print("Razão (custom / lib):", ratio)

Diferença absoluta: 1.3343316418445994e-07
Diferença percentual: 0.00004%
Razão (custom / lib): 0.9999996313122554


### K-Fold

In [80]:
def best_k_fold(X, y, lambdas, k):
    n = len(y)
    indices = np.arange(n)
    np.random.shuffle(indices)

    folds = np.array_split(indices, k)

    rmse_per_lambda = {}

    for lamb in lambdas:
        rmse_list = []

        for i in range(k):
            val_idx = folds[i]
            train_idx = np.hstack([folds[j] for j in range(k) if j != i])

            # INDEXAÇÃO CORRETA
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val,   y_val   = X.iloc[val_idx],  y.iloc[val_idx]

            # modelo ridge feito do zero
            w = custom_ridge_regression(X_train.values, y_train.values, lamb)

            # predição
            y_pred = X_val.values @ w

            # RMSE
            rmse_list.append(np.sqrt(np.mean((y_val.values - y_pred)**2)))

        rmse_per_lambda[lamb] = np.mean(rmse_list)

    best_lambda = min(rmse_per_lambda, key=rmse_per_lambda.get)

    return best_lambda, rmse_per_lambda


Usando o 10-Fold

In [81]:
lambdas = np.logspace(-4, 3, 10)  # 10 valores entre 1e-4 e 1e3

best_lambda, scores = best_k_fold(X_train, y_train, lambdas, k=10)

print("Melhor λ:", best_lambda)
print("RMSE médios:", scores)

Melhor λ: 0.0001
RMSE médios: {np.float64(0.0001): np.float64(0.11318813962195926), np.float64(0.0005994842503189409): np.float64(0.1131881407899499), np.float64(0.003593813663804626): np.float64(0.11318814917912319), np.float64(0.021544346900318846): np.float64(0.11318824872021097), np.float64(0.1291549665014884): np.float64(0.11319049200899248), np.float64(0.774263682681127): np.float64(0.11324358060962862), np.float64(4.641588833612782): np.float64(0.11376560637364172), np.float64(27.825594022071257): np.float64(0.11514328472060913), np.float64(166.81005372000593): np.float64(0.11848949101798614), np.float64(1000.0): np.float64(0.12411600940340399)}


#### Testando com o Lambda do 10-Fold implementado

In [82]:
w1 = custom_ridge_regression(X_train_adj, y_train, best_lambda)

In [83]:
y_pred3 = X_test_adj @ w1

y_pred3 = pd.Series(y_pred3, index=y_test.index)

Comparando o y_pred3 com o y_test

In [84]:
print(y_pred3)

513     0.429644
4177    0.244422
1308    0.293494
914     0.260489
3452    0.408434
          ...   
3772    0.374640
5191    0.257986
5226    0.323295
5390    0.335724
860     0.159534
Length: 1300, dtype: float64


In [85]:
print(y_test)

513     0.64
4177    0.27
1308    0.32
914     0.39
3452    0.39
        ... 
3772    0.58
5191    0.27
5226    0.20
5390    0.50
860     0.06
Name: citric acid, Length: 1300, dtype: float64


Calculando o RMSE

In [86]:
rmse_custom2 = np.sqrt(np.mean((y_test.values - y_pred3)**2))
print(rmse_custom2)

0.11891093276417677


Antigo RMSE -> Lambda = 1.0

In [87]:
print(rmse_custom)

0.11891108088722914


Diferença entre o RMSE calculado com lambda = 1.0 com o RMSE com Lambda Ideal

In [88]:
diff_abs = abs(rmse_custom2 - rmse_custom)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(rmse_custom2 - rmse_custom) / rmse_custom * 100
print(f"Diferença percentual: {diff_percent:.4f}%")

ratio = rmse_custom2 / rmse_custom
print("Razão (custom / lib):", ratio)

Diferença absoluta: 1.481230523653343e-07
Diferença percentual: 0.0001%
Razão (custom / lib): 0.9999987543376845


Calculando $R^2$

In [89]:
ss_res1 = np.sum((y_test - y_pred3)**2)
ss_tot1 = np.sum((y_test - np.mean(y_test))**2)

r2_custom1 = 1 - ss_res1 / ss_tot
print("R² custom:", r2_custom1)

R² custom: 0.3619152061709413


Diferença entre os $R^2$

In [90]:
diff_abs = abs(r2_custom1 - r2_custom)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(r2_custom1 - r2_custom) / r2_custom * 100
print(f"Diferença percentual: {diff_percent:.4f}%")

ratio = r2_custom1 / r2_custom
print("Razão (custom / lib):", ratio)

Diferença absoluta: 1.5896793340575144e-06
Diferença percentual: 0.0004%
Razão (custom / lib): 1.0000043924275341


#### Testando com o Lambda do 10-Fold da biblioteca

In [91]:
lambdas = np.logspace(-4, 3, 10)
scores = [cross_val_score(Ridge(alpha=l), X_train, y_train,
                          scoring="neg_root_mean_squared_error", cv=10).mean() for l in lambdas]

best_lambda = lambdas[np.argmax(scores)]
print("Melhor λ:", best_lambda)

Melhor λ: 0.0001


In [92]:
w2 = custom_ridge_regression(X_train_adj, y_train, best_lambda)

In [93]:
y_pred4 = X_test_adj @ w2

y_pred4 = pd.Series(y_pred4, index=y_test.index)

Calculando o RMSE

In [94]:
rmse_custom3 = np.sqrt(np.mean((y_test.values - y_pred4)**2))
print(rmse_custom3)

0.11891093276417677


Calculando o $R^2$

In [95]:
ss_res2 = np.sum((y_test - y_pred4)**2)
ss_tot2 = np.sum((y_test - np.mean(y_test))**2)

r2_custom2 = 1 - ss_res2/ ss_tot2
print("R² custom:", r2_custom2)

R² custom: 0.3619152061709413


#### Comparando o 10-fold Implementado com o da biblioteca

RMSE

In [96]:
diff_abs = abs(rmse_custom3 - rmse_custom2)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(rmse_custom3 - rmse_custom2) / rmse_custom2 * 100
print(f"Diferença percentual: {diff_percent:.16f}%")

ratio = rmse_custom3 / rmse_custom2
print("Razão (custom / lib):", ratio)

Diferença absoluta: 0.0
Diferença percentual: 0.0000000000000000%
Razão (custom / lib): 1.0


$R^2$

In [97]:
diff_abs = abs(r2_custom2 - r2_custom1)
print("Diferença absoluta:", diff_abs)

diff_percent = abs(r2_custom2 - r2_custom1) / r2_custom1 * 100
print(f"Diferença percentual: {diff_percent:.16f}%")

ratio = r2_custom2 / r2_custom1
print("Razão (custom / lib):", ratio)

Diferença absoluta: 0.0
Diferença percentual: 0.0000000000000000%
Razão (custom / lib): 1.0
