In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# Load the red-wine quality CSV into a DataFrame and preview its first rows
wine_csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
wine = pd.read_csv(wine_csv_path)
wine.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns of `wine`
wine.describe()

In [None]:
# Number of missing values in each column
wine.isnull().sum()

In [None]:
import matplotlib.pyplot as plt

# One 50-bin histogram per column of `wine`, drawn on a 20x15 figure
hist_options = dict(bins=50, figsize=(20, 15))
wine.hist(**hist_options)
plt.show()

In [None]:
# Boxplot every column of `wine`, one subplot per column.
cont_features = list(wine.columns)

# Grid layout: a fixed number of columns and as many rows as the data needs.
n_box_cols = 4
n_box_rows = -(-len(cont_features) // n_box_cols)  # ceiling division

fig = make_subplots(
    rows=n_box_rows,
    cols=n_box_cols,
    subplot_titles=cont_features
)

# Fill the grid row by row so that each trace sits under the subplot title of
# the same column. (The previous hand-written calls reused index 8 for the last
# three panels, so those panels showed the wrong column.)
for position, feature in enumerate(cont_features):
    grid_row, grid_col = divmod(position, n_box_cols)
    # add_trace(row=, col=) is the supported replacement for the deprecated append_trace
    fig.add_trace(go.Box(y=wine[feature]), row=grid_row + 1, col=grid_col + 1)

fig.show()

In [None]:
# Pairwise correlation between all columns, shown as a colour-graded table.
corr = wine.corr()
# Styler.set_precision was deprecated in pandas 1.3 and removed in pandas 2.0;
# Styler.format(precision=...) is the supported way to round the displayed values.
corr.style.background_gradient(cmap='plasma').format(precision=2)

In [None]:
# Drop rows that exceed the 99th percentile in any of the columns listed below.
# Every threshold is computed on `wine` as it is before this cell filters it.
trimmed_columns = (
    'volatile acidity',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
)

rows_to_keep = pd.Series(True, index=wine.index)
for column_name in trimmed_columns:
    column_values = wine[column_name]
    rows_to_keep &= column_values <= column_values.quantile(.99)

wine = wine[rows_to_keep]

# tr1 = FunctionTransformer(lambda x: x[(wine1)])

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate the target column ('quality') from the predictor columns
y = wine['quality']
X = wine.drop(columns=['quality'])

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Rescale every predictor to the [0, 1] range. The scaler is fitted on the
# training rows only and then applied to both splits.
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X_train)
Xtrain_norm, Xtest_norm = (scaler.transform(part) for part in (X_train, X_test))

# Logistic regression model (normalized & Lasso regularization)

In [None]:
from sklearn.model_selection import cross_val_score

# Seed for the stochastic SAGA solver so that re-running the cell reproduces the same numbers
LOGREG_SEED = 0


def _make_softmax_model(c):
    """Return an unfitted L1-penalised multinomial logistic regression with inverse strength ``c``.

    The regularization search and the final fit both build their estimator here,
    so the search evaluates the same model formulation that is fitted afterwards.
    """
    return LogisticRegression(C=c, penalty="l1", solver="saga", max_iter=10000,
                              multi_class='multinomial', random_state=LOGREG_SEED)


# Compare regularization strengths ('c') on the normalized data
# NOTE(review): test_accuracy below is measured on the held-out test split; picking
# `c` from it would leak test information into model selection.
c_vals = [100, 10, 1, 0.1]

for c in c_vals:
    logreg_model = _make_softmax_model(c).fit(Xtrain_norm, y_train)
    accuracy = logreg_model.score(Xtrain_norm, y_train)
    test_accuracy = logreg_model.score(Xtest_norm, y_test)

    print(f"c: {c}, accuracy: {accuracy}, test_accuracy: {test_accuracy}")

# Fit the softmax model with the chosen strength; `logreg_model` is reused by later cells
logreg_model = _make_softmax_model(10).fit(Xtrain_norm, y_train)
predictions = logreg_model.predict(Xtest_norm)

print(f"\n{(predictions == y_test).sum()}/{y_test.shape[0]} clasificados correctamente\n")

# 5-fold cross-validated accuracy on the training split (the estimator is cloned per fold)
scores = cross_val_score(logreg_model, Xtrain_norm, y_train, cv=5)

print("Exactitud de cada particion:", scores)
print("Exactitud Promedio:", scores.mean())

# Simple linear regression (normalized)

In [None]:
# Single-predictor design matrix: the 'alcohol' column taken from X_train
# (the unscaled training frame, despite the `_norm` in the variable name)
Xtrain_norm_lrs = X_train[['alcohol']]

# Fit on the whole training split; the fitted model is reused by later cells
lrs_model = LinearRegression()
lrs_model.fit(Xtrain_norm_lrs, y_train)

# 5-fold cross-validation using the estimator's default score (R^2 for LinearRegression)
scores = cross_val_score(lrs_model, Xtrain_norm_lrs, y_train, cv=5)

for caption, value in (("Exactitud de cada particion:", scores),
                       ("Exactitud Promedio:", scores.mean())):
    print(caption, value)

# Multiple linear regression (normalized)

In [None]:
# Ordinary least squares on all min-max scaled predictors; the fitted model is reused by later cells
lr_model = LinearRegression()
lr_model.fit(Xtrain_norm, y_train)

# 5-fold cross-validation using the estimator's default score (R^2 for LinearRegression)
scores = cross_val_score(lr_model, Xtrain_norm, y_train, cv=5)

for caption, value in (("Exactitud de cada particion:", scores),
                       ("Exactitud Promedio:", scores.mean())):
    print(caption, value)

# Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the (unscaled) training predictors into their degree-2 polynomial terms
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)

# Linear regression on the expanded features; the fitted model is reused by later cells
pol_model = LinearRegression()
pol_model.fit(X_poly, y_train)

# 5-fold cross-validation using the estimator's default score (R^2 for LinearRegression)
scores = cross_val_score(pol_model, X_poly, y_train, cv=5)

for caption, value in (("Exactitud de cada particion:", scores),
                       ("Exactitud Promedio:", scores.mean())):
    print(caption, value)

# Akaike Information Criterion

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from math import log

def calculate_aic(label, model, X, y):
    """Print the parameter count, MSE and AIC of a fitted regression model.

    Args:
        label: Name printed in the report header.
        model: Fitted estimator exposing ``coef_`` and ``predict``.
        X: Features passed to ``model.predict``.
        y: Observed targets the predictions are compared against.

    Returns:
        None. The report is written to stdout.
    """
    print(f'Model: {label}')

    # One parameter per entry of coef_ plus one for the intercept
    coefficient_count = len(model.coef_)
    num_params = coefficient_count + 1
    print(f'Number of parameters: {num_params:d}')

    # Mean squared error of the model's predictions on (X, y)
    mse = mean_squared_error(y, model.predict(X))
    print(f'MSE: {mse:.3f}')

    # AIC for a least-squares fit: n * ln(MSE) + 2 * k
    sample_count = len(y)
    aic = sample_count * log(mse) + 2 * num_params
    print(f'AIC: {aic:.3f}')
    return None

In [None]:
from sklearn.metrics import log_loss

def calculate_aic_log(label, logmodel, X, y):
    """Print the parameter count, log loss and AIC of a fitted probabilistic classifier.

    The AIC is reported on the same total (not per-sample) scale as
    ``calculate_aic``: ``AIC = 2 * k - 2 * ln(L)``.

    Args:
        label: Name printed in the report header.
        logmodel: Fitted classifier exposing ``coef_``, ``intercept_``,
            ``classes_`` and ``predict_proba``.
        X: Features passed to ``logmodel.predict_proba``.
        y: Observed class labels.

    Returns:
        None. The report is written to stdout.
    """
    print('Model: {}'.format(label))

    # Count every fitted weight and intercept. scikit-learn classifiers store
    # coef_ as a 2-D array, so len(coef_) would count its rows, not the parameters.
    num_params = int(np.size(logmodel.coef_) + np.size(logmodel.intercept_))
    print('Number of parameters: %d' % (num_params))

    # Predicted class probabilities
    yhat = logmodel.predict_proba(X)

    # log_loss returns the mean negative log-likelihood per sample. Passing
    # labels keeps the probability columns matched to the model's classes even
    # when y does not contain every class.
    logloss = log_loss(y, yhat, labels=logmodel.classes_)
    print('Logloss: %.3f' % logloss)

    n = len(y)

    # ln(L) = -n * logloss, hence AIC = 2k - 2 ln(L) = 2k + 2 * n * logloss
    aic = 2 * n * logloss + 2 * num_params
    print('AIC: %.3f' % aic)
    return None

In [None]:
# AIC report for the single-feature linear model, evaluated on the data it was fitted on
calculate_aic('Simple Linear Regression', lrs_model, Xtrain_norm_lrs, y_train)

In [None]:
# AIC report for the all-features linear model, evaluated on the data it was fitted on
calculate_aic('Multiple Linear Regression', lr_model, Xtrain_norm, y_train)

In [None]:
# AIC report for the degree-2 polynomial model, evaluated on the data it was fitted on
calculate_aic('Polynomial Linear Regression',pol_model, X_poly, y_train)

In [None]:
# AIC report for the logistic regression model, evaluated on the data it was fitted on
calculate_aic_log('Logistic Regression', logreg_model, Xtrain_norm, y_train)

# Bayesian Information Criterion

In [None]:
from math import log
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def calculate_bic(label, model, X, y):
    """Print the parameter count, MSE and BIC of a fitted regression model.

    Args:
        label: Name printed in the report header.
        model: Fitted estimator exposing ``coef_`` and ``predict``.
        X: Features passed to ``model.predict``.
        y: Observed targets the predictions are compared against.

    Returns:
        None. The report is written to stdout.
    """
    print(f'Model: {label}')

    # One parameter per entry of coef_ plus one for the intercept
    coefficient_count = len(model.coef_)
    num_params = coefficient_count + 1
    print(f'Number of parameters: {num_params:d}')

    # Mean squared error of the model's predictions on (X, y)
    mse = mean_squared_error(y, model.predict(X))
    print(f'MSE: {mse:.3f}')

    # BIC for a least-squares fit: n * ln(MSE) + k * ln(n)
    sample_count = len(y)
    bic = sample_count * log(mse) + num_params * log(sample_count)
    print(f'BIC: {bic:.3f}')
    return None

In [None]:
def calculate_bic_log(label, logmodel, X, y):
    """Print the parameter count, log loss and BIC of a fitted probabilistic classifier.

    The BIC is reported on the same total (not per-sample) scale as
    ``calculate_bic``: ``BIC = k * ln(n) - 2 * ln(L)``.

    Args:
        label: Name printed in the report header.
        logmodel: Fitted classifier exposing ``coef_``, ``intercept_``,
            ``classes_`` and ``predict_proba``.
        X: Features passed to ``logmodel.predict_proba``.
        y: Observed class labels.

    Returns:
        None. The report is written to stdout.
    """
    print('Model: {}'.format(label))

    # Count every fitted weight and intercept. scikit-learn classifiers store
    # coef_ as a 2-D array, so len(coef_) would count its rows, not the parameters.
    num_params = int(np.size(logmodel.coef_) + np.size(logmodel.intercept_))
    print('Number of parameters: %d' % (num_params))

    # Predicted class probabilities
    yhat = logmodel.predict_proba(X)

    # log_loss returns the mean negative log-likelihood per sample. Passing
    # labels keeps the probability columns matched to the model's classes even
    # when y does not contain every class.
    logloss = log_loss(y, yhat, labels=logmodel.classes_)
    print('Logloss: %.3f' % logloss)

    n = len(y)

    # ln(L) = -n * logloss, hence BIC = k ln(n) - 2 ln(L) = k ln(n) + 2 * n * logloss
    bic = 2 * n * logloss + log(n) * num_params
    print('BIC: %.3f' % bic)
    return None

In [None]:
# BIC report for the single-feature linear model, evaluated on the data it was fitted on
calculate_bic('Simple Linear Regression', lrs_model, Xtrain_norm_lrs, y_train)

In [None]:
# BIC report for the all-features linear model, evaluated on the data it was fitted on
calculate_bic('Multiple Linear Regression', lr_model, Xtrain_norm, y_train)

In [None]:
# BIC report for the degree-2 polynomial model, evaluated on the data it was fitted on
calculate_bic('Polynomial Linear Regression',pol_model, X_poly, y_train)

In [None]:
# BIC report for the logistic regression model, evaluated on the data it was fitted on
calculate_bic_log('Logistic Regression', logreg_model, Xtrain_norm, y_train)

# Selección de modelo

In [None]:
# Written conclusion of the model comparison (the text is printed verbatim)
conclusion_text = (
    'El modelo de regresión polinomial es el que presenta menores valores '
    'para los criterios Akaike y bayesiano, lo que indica que es el modelo '
    'que presenta la mejor combinación de desempeño y complejidad. '
    'Esto a pesar de que su exactitud es menor que otros modelos'
)
print(conclusion_text)
