In [1]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

In [3]:
# Read the pre-split train and test tables; the first CSV column is used as the row index.
train_df, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../data/processed/train_df.csv",
        "../data/processed/test_df.csv",
    )
)
train_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
2457,8.1,0.33,0.36,7.4,0.037,36.0,156.0,0.99592,3.19,0.54,10.6,6,white
524,5.6,0.35,0.37,1.0,0.038,6.0,72.0,0.9902,3.37,0.34,11.4,5,white
4551,7.0,0.23,0.32,1.8,0.048,25.0,113.0,0.9915,3.11,0.47,11.1,6,white
1056,8.9,0.48,0.53,4.0,0.101,3.0,10.0,0.99586,3.21,0.59,12.1,7,red
3759,7.8,0.19,0.32,7.4,0.015,47.0,124.0,0.99278,2.99,0.39,11.0,6,white


In [53]:
# Merge the extreme quality scores into two edge classes and store every label as
# a string, so the target is one categorical column with classes
# '<=4', '5', '6', '7', '>=8'.
QUALITY_BINS = {3: "<=4", 4: "<=4", 8: ">=8", 9: ">=8"}


def bin_quality(quality):
    """Return `quality` as string class labels with the extreme scores merged.

    Parameters
    ----------
    quality : pandas Series
        Raw quality scores (or labels that were already binned).

    Returns
    ----------
        pandas Series in which 3 and 4 become '<=4', 8 and 9 become '>=8',
        and every other value is converted with `str`. Values that are
        already strings pass through unchanged, so re-running the cell is safe.
    """
    return quality.map(lambda score: QUALITY_BINS.get(score, str(score)))


# Apply the same binning to BOTH splits: the models predict these string classes,
# so the test target has to use the same labels for scoring and confusion matrices.
# Building a new column also avoids writing strings into an integer column in place.
train_df["quality"] = bin_quality(train_df["quality"])
test_df["quality"] = bin_quality(test_df["quality"])

In [54]:
# Separate the predictors from the `quality` target, for the train and test splits alike.
X_train, y_train = train_df.drop(columns=["quality"]), train_df["quality"]
X_test, y_test = test_df.drop(columns=["quality"]), test_df["quality"]

In [55]:
# helper function, adapted from 573 lecture 4
# https://pages.github.ubc.ca/mds-2021-22/DSCI_573_feat-model-select_students/lectures/04_feat-importances-selection.html
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        extra keyword arguments forwarded unchanged to `cross_validate`

    Returns
    ----------
        pandas Series indexed by the keys returned from `cross_validate`;
        each value is a string formatted as "mean (+/- std)" with three decimals
    """

    # Build the per-fold table once and derive both statistics from it.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Pair the values by iteration rather than by integer lookup: the Series are
    # labelled with metric names, and integer keys on a label index rely on a
    # deprecated positional fallback.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [56]:
# Column groups for preprocessing: `type` is the only categorical feature, and
# every numeric column of X_train is treated as a numeric feature.
categorical_features = ["type"]
numeric_features = X_train.select_dtypes(include="number").columns.tolist()

In [57]:
# Scale the numeric columns and one-hot encode the categorical ones.
# The encoder's dense/sparse switch was renamed from `sparse` to `sparse_output`
# in newer scikit-learn releases and the old keyword was later removed, so neither
# spelling works on every version. Leaving the encoder at its default and passing
# `sparse_threshold=0` to the column transformer always yields a dense array,
# which is what `sparse=False` was asking for.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    sparse_threshold=0,
)

In [37]:
# models
# Each pipeline pairs the shared preprocessor with one classifier; the trailing
# comments list the hyperparameters of interest for tuning.
dummy = make_pipeline(preprocessor, DummyClassifier(random_state=123))
svc = make_pipeline(preprocessor, SVC(kernel='linear', random_state=123))  # C, class_weight (gamma is ignored by a linear kernel)
lr = make_pipeline(preprocessor, LogisticRegression(max_iter=5000, random_state=123))  # C, class_weight
rf = make_pipeline(preprocessor, RandomForestClassifier(random_state=123))  # n_estimators, max_depth

# Display name -> pipeline, in the order the models are reported.
models = {
    'Dummy': dummy,
    'SVC': svc,
    'Logistic Regression': lr,
    'Random Forest': rf,
}

# Sorted class labels. They come from the training target because those are the
# labels the models are fitted on; the test split may carry a different label set.
classes = np.sort(y_train.unique()).tolist()

# scoring metrics
# scoring = ["accuracy", "f1", "recall", "precision", "roc_auc", "average_precision", "neg_mean_squared_error"]

In [32]:
# hyperparameter optimization
# `gamma` is deliberately not searched for the SVC: the `svc` pipeline uses
# kernel='linear', and gamma only affects the rbf/poly/sigmoid kernels, so
# sampling it would just repeat identical fits. Switch the kernel first if
# gamma tuning is wanted.
param_grid_svc = {
    "svc__class_weight": [None, "balanced"],
    "svc__C": np.logspace(-3, 2, 6),
}

param_grid_lr = {
    "logisticregression__class_weight": [None, "balanced"],
    "logisticregression__C": np.logspace(-3, 2, 30),
}

param_grid_rf = {
    "randomforestclassifier__n_estimators": np.logspace(1, 4, 10, dtype=int),
    "randomforestclassifier__max_depth": np.linspace(1, 30, 10, dtype=int),
}


def _make_randomized_search(pipeline, param_grid, n_iter=30):
    """Build an unfitted RandomizedSearchCV with the settings shared by all models.

    Parameters
    ----------
    pipeline :
        scikit-learn pipeline to tune
    param_grid : dict
        maps parameter names to the list/array of candidate values
    n_iter : int
        maximum number of configurations to sample

    Returns
    ----------
        RandomizedSearchCV using 5-fold CV, all cores, train scores recorded
        and random_state=123
    """
    # Never ask for more draws than there are distinct configurations;
    # the grids here are finite lists, so the total is the product of their lengths.
    grid_size = int(np.prod([len(values) for values in param_grid.values()]))
    return RandomizedSearchCV(
        pipeline,
        param_distributions=param_grid,
        return_train_score=True,
        n_jobs=-1,
        n_iter=min(n_iter, grid_size),
        cv=5,
        random_state=123,
    )


search_svc = _make_randomized_search(svc, param_grid_svc)
search_lr = _make_randomized_search(lr, param_grid_lr)
search_rf = _make_randomized_search(rf, param_grid_rf)

In [18]:
# Run the randomized search for the logistic-regression pipeline on the training
# split; the trailing `;` suppresses the cell's output.
search_lr.fit(X_train, y_train);

In [23]:
# Summary of the logistic-regression search. Only the last expression of a cell is
# displayed, so the best hyperparameters and their mean cross-validated score are
# returned together instead of as separate bare expressions.
{
    "best_params": search_lr.best_params_,
    "best_score": search_lr.best_score_,
}

{'logisticregression__class_weight': None,
 'logisticregression__C': 0.38566204211634725}

In [28]:
# Tabulate the sampled logistic-regression configurations, ordered by
# `rank_test_score`, keeping only the columns needed to compare them.
summary_columns = [
    "rank_test_score",
    "param_logisticregression__class_weight",
    "param_logisticregression__C",
    "mean_test_score",
]
ranked_cv_results = pd.DataFrame(search_lr.cv_results_).sort_values("rank_test_score")
results = ranked_cv_results[summary_columns]
results.head()

Unnamed: 0,rank_test_score,param_logisticregression__class_weight,param_logisticregression__C,mean_test_score
18,1,,0.385662,0.547178
21,2,,6.210169,0.545859
7,3,,2.807216,0.54564
27,4,,20.433597,0.5452
12,4,,45.203537,0.5452


In [45]:
# Run the randomized search for the random-forest pipeline on the training split;
# the trailing `;` suppresses the cell's output.
search_rf.fit(X_train, y_train);

In [26]:
# Cross-validate every candidate pipeline on the training split and collect the
# formatted "mean (+/- std)" summaries, keyed by model display name.
results = {
    model_name: mean_std_cross_val_scores(
        pipeline, X_train, y_train, return_train_score=True
    )
    for model_name, pipeline in models.items()
}

# One row per model; `test_score` is relabelled to make clear it is a CV score.
pd.DataFrame(results).T.rename(
    columns={"test_score": "cross_validation_score"}
)  # cross-validation scores

Unnamed: 0,fit_time,score_time,cross_validation_score,train_score
Dummy,0.012 (+/- 0.012),0.004 (+/- 0.002),0.330 (+/- 0.021),0.330 (+/- 0.007)
SVC,0.397 (+/- 0.009),0.071 (+/- 0.001),0.536 (+/- 0.013),0.540 (+/- 0.002)
Logistic Regression,0.157 (+/- 0.014),0.003 (+/- 0.000),0.546 (+/- 0.013),0.550 (+/- 0.006)
Random Forest,0.475 (+/- 0.006),0.020 (+/- 0.000),0.648 (+/- 0.010),1.000 (+/- 0.000)


In [None]:
# plot confusion matrix, save in dict
# save all plots in .png; change the output directory!...
train_cms = {}  # training-set matrices are currently not plotted; the dict stays empty
test_cms = {}
for key, model in models.items():
    # File-name stem built from the display name, e.g. "Random Forest" -> "random_forest".
    name = '_'.join(key.lower().split())

    # None of the cells shown fits the pipelines in `models` directly (the
    # cross-validation helper and the searches only receive them as arguments),
    # so fit on the training split before predicting.
    model.fit(X_train, y_train)

    # Store each display under its model name instead of rebinding `test_cms`.
    test_cms[key] = ConfusionMatrixDisplay.from_estimator(
        model, X_test, y_test, values_format="d", display_labels=classes
    )
    # Save through the display's own figure rather than pyplot's "current figure".
    test_cms[key].figure_.savefig(f'{name}_test_cm.png')

In [None]:
############################# end of first script

In [None]:
# test scores table
# None of the cells shown fits the pipelines in `models` directly (the
# cross-validation helper and the searches only receive them as arguments), so
# each one is fitted on the full training split here before it is scored on the
# held-out test split. Refitting an already fitted pipeline on the same data is harmless.
test_scores = {}
for key, model in models.items():
    model.fit(X_train, y_train)
    test_scores[key] = model.score(X_test, y_test)

In [None]:
# output table, change directory!..
# One row per model: the cross-validation summary columns plus the test score.
test_scores = pd.Series(test_scores, name='test score')

# `DataFrame.append` no longer exists in pandas 2.x; stacking the scores as a
# one-row frame with `pd.concat` adds the same 'test score' row.
out_table_1 = pd.concat(
    [pd.DataFrame(results), test_scores.to_frame().T]
).rename(
    index={"test_score": "cross_validation_score"}
).T
out_table_1.to_csv("test_scores.csv")
out_table_1

In [None]:
# Labels of the three rows of `lr_coefs` with the largest row-wise mean.
# NOTE(review): `lr_coefs` is not defined in the cells shown here — presumably a
# features x classes table of logistic-regression coefficients; confirm.
mean_coef_desc = lr_coefs.mean(axis=1).sort_values(ascending=False)
top_3_feats = mean_coef_desc.head(3).index.tolist()
top_3_feats

In [None]:
# Max coef feature for each class
# For every class column, look up the row label at the position of its largest value.
results = {
    c: [lr_coefs.index[lr_coefs[c].argmax()]]
    for c in classes
}

pd.DataFrame(results, index=["Max coef feature for each class"])

In [None]:
# Work on an independent copy so `lr_coefs` itself keeps its values.
lr_feats = lr_coefs.copy(deep=True)

# Replace each class column with the row labels ordered from largest to smallest
# value in that column, so row k holds the k-th ranked label for every class.
for c in classes:
    ranked_labels = lr_coefs[c].sort_values(ascending=False).index
    lr_feats[c] = ranked_labels.tolist()

# Drop the original row labels: after the replacement they no longer describe the rows.
lr_feats.reset_index().drop(columns=['index']) #.iloc[[0,1,2]]