In [98]:
# Authors: Rahul Ramachandran and Rishit D

# Importing the required libraries
try:
    import numpy as np
    import pandas as pd
    from pandas.api.types import CategoricalDtype
    from ucimlrepo import fetch_ucirepo 
    from sklearn.linear_model import LinearRegression
    from statsmodels.miscmodels.ordinal_model import OrderedModel
    import scipy.stats as stats
except:
    print(f'[INFO] Installing the required libraries')
    !pip install numpy
    !pip install pandas
    !pip install ucimlrepo
    !pip install scikit-learn
    !pip install statsmodels
    !pip install scipy
    import numpy as np
    import pandas as pd
    from pandas.api.types import CategoricalDtype
    from ucimlrepo import fetch_ucirepo 
    from sklearn.linear_model import LinearRegression
    from statsmodels.miscmodels.ordinal_model import OrderedModel
    import scipy.stats as stats


In [151]:
# Download the UCI dataset with repository id 186
wine_quality = fetch_ucirepo(id=186)
print(f'[INFO] Dataset loaded successfully')

# Feature and target tables (as pandas dataframes)
wine_tables = wine_quality.data
X, y = wine_tables.features, wine_tables.targets

# Print the dataset metadata first, then the variable information
for dataset_info in (wine_quality.metadata, wine_quality.variables):
    print(dataset_info)

[INFO] Dataset loaded successfully
{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'title': 'Modeling wine prefe

In [152]:
# Prepare training and testing data
# Sequential 80/20 split: the leading 80% of rows are used for training and the
# remaining rows are held out for testing (no shuffling).
nX, ny = len(X), len(y)
train_size = int(0.8 * nX)
train_rows, test_rows = slice(None, train_size), slice(train_size, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [145]:
# Fit the Linear Regression Model
reg_model = LinearRegression()
reg_model.fit(X_train, y_train)
# Score is computed on the same rows the model was fitted on
train_score = reg_model.score(X_train, y_train)
print(f'Linear Regression Score: {train_score}')

Linear Regression Score: 0.29926906612869675


In [146]:
# Evaluate the Linear Regression Model
pred = np.array(reg_model.predict(X_test))
gt = np.array(y_test)
# Signed error of every held-out prediction, reused by both error metrics
residual = pred - gt
print(f'Linear Regression MAE: {np.mean(np.abs(residual))}')
print(f'Linear Regression MSE: {np.mean(residual**2)}')
# A prediction counts as correct when it rounds to the true value
n_correct = np.sum(np.rint(pred) == gt)
print(f'Correct Predictions: {n_correct} out of {len(gt)}')

Linear Regression MAE: 0.5513533210016455
Linear Regression MSE: 0.507210463667933
Correct Predictions: 518 out of 980


In [147]:
# Create an ordered categorical type for ordinal regression
# The category range spans the smallest to the largest value found in the full target table
quality_min, quality_max = int(y.min().iloc[0]), int(y.max().iloc[0])
print(f'Range of values: {quality_min} to {quality_max}')
ordered_levels = list(range(quality_min, quality_max + 1))
ctype = CategoricalDtype(categories=ordered_levels, ordered=True)
# Rebind the training targets to the ordered categorical dtype
y_train = y_train.astype(ctype)

Range of values: 3 to 9


In [148]:
# Fit the Ordinal Regression Model

# CLogLog (complementary log-log) distribution for proportional hazards
class CLogLog(stats.rv_continuous):
    """Continuous distribution with CDF ``F(x) = 1 - exp(-exp(x))``.

    Only ``_cdf`` and its inverse ``_ppf`` are overridden; every other method is
    inherited from ``scipy.stats.rv_continuous``.
    """

    def _ppf(self, q):
        """Return the quantile ``log(-log(1 - q))`` for probability ``q``."""
        # log1p keeps precision for q near 0, where ``1 - q`` would round to 1.0
        # and the naive formula would return -inf.
        return np.log(-np.log1p(-q))

    def _cdf(self, x):
        """Return the cumulative probability ``1 - exp(-exp(x))`` at ``x``."""
        # expm1 avoids the cancellation in ``1 - exp(-t)`` when ``t = exp(x)`` is
        # tiny, where the naive formula would return exactly 0.0.
        return -np.expm1(-np.exp(x))


# Instance of the custom distribution handed to the ordered model
cloglog = CLogLog()

# Build the ordered model and fit it with BFGS in one chained expression;
# `ord_model` ends up bound to the object returned by `fit`.
ord_model = OrderedModel(y_train, X_train, distr=cloglog).fit(method='bfgs')

Optimization terminated successfully.
         Current function value: 1.154877
         Iterations: 132
         Function evaluations: 138
         Gradient evaluations: 138


In [149]:
# Evaluate the Ordinal Regression Model
# Quality levels derived once from the target range and shared by both predictions below.
# NOTE(review): assumes the columns of `probs` correspond, in order, to every integer
# level from min(y) to max(y) -- confirm against the fitted model's categories.
quality_levels = np.arange(int(y.min().iloc[0]), int(y.max().iloc[0]) + 1)
probs, gt = np.array(ord_model.model.predict(ord_model.params, X_test)), np.array(y_test)
# Point predictions are the expected values of the predicted distribution, kept as a
# column vector so they broadcast element-wise against `gt`.
pred = np.sum(probs * quality_levels, axis=1)[:, np.newaxis]
print(f'Ordinal Regression MAE: {np.mean(np.abs(pred - gt))}')
print(f'Ordinal Regression MSE: {np.mean((pred - gt)**2)}')
# Most likely class: map each argmax column index back to its quality level instead of
# hard-coding the lowest level as 3.
most_likely = quality_levels[probs.argmax(1)][:, np.newaxis]
print(f'Correct Predictions: {np.sum(most_likely == gt)} out of {len(gt)}')

Ordinal Regression MAE: 0.5371400005088605
Ordinal Regression MSE: 0.4860243626072449
Correct Predictions: 553 out of 980
