### Load the data

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

wine = pd.read_csv('Data/winequality-red.csv', delimiter = ';')
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [46]:
#train_val, test = train_test_split(wine, test_size = 0.2, random_state = 0)
#train, val = train_test_split(train_val, test_size = 0.25, random_state = 0)

train, test = train_test_split(wine, test_size = 0.2, random_state = 0)

In [47]:
import statsmodels.api as sm

X_train = train.drop(['quality'], axis=1)
X_train = sm.add_constant(X_train)
y_train = train['quality']

X_test = test.drop(['quality'], axis=1)
X_test = sm.add_constant(X_test)
y_test = test['quality']

### Best Subset Selection 

In [48]:
import statsmodels.api as sm
import itertools
import time
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold

def processSubset(feature_set):
    scaler = StandardScaler()
    log_reg = LogisticRegression(random_state=0, max_iter=10000)
    # Use Stratified K-Fold to maintain the ratio of classes in each fold
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=0)
    pipe = Pipeline([('scaler', scaler), ('log_reg', log_reg)])
    scores = cross_val_score(pipe, X_train[list(feature_set)], y_train, cv=skf, scoring='accuracy')
    return {"model": pipe, "score": np.mean(scores), "features": feature_set}

def getBest(k):
    tic = time.time()
    results = []
    for combo in itertools.combinations(X_train.columns, k):
        results.append(processSubset(combo))
    models = pd.DataFrame(results)
    best_model = models.loc[models['score'].idxmax()]
    toc = time.time()
    print("Processed", models.shape[0], "models on", k, "predictors in", toc - tic, "seconds.")

    return best_model

# Run model selection with cross-validation
models_cv = pd.DataFrame(columns=["score", "model", "features"])
tic = time.time()
for i in range(1, len(X_train.columns) + 1): 
    models_cv.loc[i] = getBest(i)
toc = time.time()

Processed 12 models on 1 predictors in 0.32472801208496094 seconds.
Processed 66 models on 2 predictors in 2.046166181564331 seconds.
Processed 220 models on 3 predictors in 7.543124198913574 seconds.


In [None]:
# Identify the best model
best_index = models_cv['score'].idxmax()
best_overall_model = models_cv.loc[best_index]

print("Best features:", best_overall_model['features'])

Best features: ('volatile acidity', 'free sulfur dioxide', 'total sulfur dioxide', 'alcohol')


In [None]:
predictors_bs = best_overall_model['features']

scaler = StandardScaler()
log_reg = LogisticRegression(random_state=0, max_iter=10000)
pipe = Pipeline([('scaler', scaler), ('log_reg', log_reg)])
pipe.fit(X_train[predictors_bs], y_train)

train_pred = pipe.predict(X_train[predictors_bs])
test_pred = pipe.predict(X_test[predictors_bs])

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Accuracy
train_accuracy = accuracy_score(y_train, train_pred)
test_accuracy = accuracy_score(y_test, test_pred)

# F1-score
train_f1 = f1_score(y_train, train_pred, average='weighted')
test_f1 = f1_score(y_test, test_pred, average='weighted')

# AUC
train_prob = pipe.predict_proba(X_train[predictors_bs])
test_prob = pipe.predict_proba(X_test[predictors_bs])
train_auc = roc_auc_score(y_train, train_prob, multi_class="ovo", average="weighted")
test_auc = roc_auc_score(y_test, test_prob, multi_class="ovo", average="weighted")

# Printing the metrics
print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Training F1-Score: {train_f1}")
print(f"Test F1-Score: {test_f1}")
print(f"Training AUC: {train_auc}")
print(f"Test AUC: {test_auc}")


Training Accuracy: 0.5860271115745568
Test Accuracy: 0.6125
Training F1-Score: 0.5636185835564183
Test F1-Score: 0.5970177889218415
Training AUC: 0.7768948827935154
Test AUC: 0.7312524733840754
