#### Baseline - classification - most common class

In [1]:
# IMPORT

import numpy as np
import scipy.linalg as linalg
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
import pandas as pd
import sklearn.linear_model as lm
from matplotlib.pyplot import figure, legend, plot, show, xlabel, ylabel

# exercise 8.1.1
import torch
import importlib_resources

from matplotlib.pylab import (
    figure,
    grid,
    legend,
    loglog,
    semilogx,
    show,
    subplot,
    title,
    xlabel,
    ylabel,
)
from scipy.io import loadmat
from sklearn import model_selection
from scipy import stats

from dtuimldmtools import draw_neural_net, train_neural_net, rlr_validate, similarity

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from collections import Counter
from sklearn.neural_network import MLPRegressor


In [2]:
# OFFLINE LOADING OF DATA
X = np.loadtxt('../wine/wine.data', delimiter=',')

# Standardizing the data
X[:, 1:] = (X[:, 1:] - np.mean(X[:, 1:], axis=0)) / np.std(X[:, 1:], axis=0)

attributeNames = [
    "Class",
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]

y = X[:, 0]
X = X[:, 1:]

In [3]:
# Step 1: Convert the NumPy array 'X' to a DataFrame with column names
# Ensure attributeNames has the correct number of column names
attributeNames_corrected = attributeNames[:X.shape[1]]
df = pd.DataFrame(X, columns=attributeNames_corrected)

# Display the first few rows of the DataFrame
print(df.head())


      Class   Alcohol  Malic acid       Ash  Alcalinity of ash  Magnesium  \
0  1.518613 -0.562250    0.232053 -1.169593           1.913905   0.808997   
1  0.246290 -0.499413   -0.827996 -2.490847           0.018145   0.568648   
2  0.196879  0.021231    1.109334 -0.268738           0.088358   0.808997   
3  1.691550 -0.346811    0.487926 -0.809251           0.930918   2.491446   
4  0.295700  0.227694    1.840403  0.451946           1.281985   0.808997   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0       1.034819   -0.659563              1.224884         0.251717   
1       0.733629   -0.820719             -0.544721        -0.293321   
2       1.215533   -0.498407              2.135968         0.269020   
3       1.466525   -0.981875              1.032155         1.186068   
4       0.663351    0.226796              0.401404        -0.319276   

   Color intensity       Hue  OD280/OD315 of diluted wines  
0         0.362177  1.847920                     

In [4]:
# Baseline model: Predict the most common class in the training set
# Define the number of folds
K = 10
CV = model_selection.KFold(n_splits=K, shuffle=True)

# Initialize variables to store errors
Error_train = np.empty(K)
Error_test = np.empty(K)

k = 0
for train_index, test_index in CV.split(X):
    print(f"Computing CV fold: {k + 1}/{K}..")

    # Extract training and test set for the current CV fold
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]

    # Baseline model: Predict the most common class in the training set
    most_common_class = Counter(y_train).most_common(1)[0][0]

    # Predict the most common class for all test and train samples
    y_est_train = np.full(len(y_train), most_common_class)
    y_est_test = np.full(len(y_test), most_common_class)

    # Evaluate misclassification rate for train and test sets
    misclass_rate_train = np.sum(y_est_train != y_train) / len(y_train)
    misclass_rate_test = np.sum(y_est_test != y_test) / len(y_test)

    print("    Top class: ", most_common_class, "  Test error: ", round(misclass_rate_test,3))

    # Store the errors
    Error_train[k] = misclass_rate_train
    Error_test[k] = misclass_rate_test

    k += 1


# Print the mean and standard deviation of errors across folds
print(f"\nAverage training error: {np.mean(Error_train):.4f}")
print(f"Average test error: {np.mean(Error_test):.4f}")



Computing CV fold: 1/10..
    Top class:  2.0   Test error:  0.556
Computing CV fold: 2/10..
    Top class:  2.0   Test error:  0.722
Computing CV fold: 3/10..
    Top class:  2.0   Test error:  0.667
Computing CV fold: 4/10..
    Top class:  2.0   Test error:  0.556
Computing CV fold: 5/10..
    Top class:  2.0   Test error:  0.556
Computing CV fold: 6/10..
    Top class:  2.0   Test error:  0.611
Computing CV fold: 7/10..
    Top class:  2.0   Test error:  0.667
Computing CV fold: 8/10..
    Top class:  2.0   Test error:  0.444
Computing CV fold: 9/10..
    Top class:  2.0   Test error:  0.588
Computing CV fold: 10/10..
    Top class:  2.0   Test error:  0.647

Average training error: 0.6011
Average test error: 0.6013
