In [1]:
PATH_train = 'data/wine_train.csv'
PATH_test = 'data/wine_test.csv'

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.base import clone
from sklearn.metrics import r2_score


# 3 digits floating points prints
pd.set_option('display.precision', 3)

rd_state = 42

# Load and Prepare the Dataset

In [3]:
# Load the dataset
data = pd.read_csv(PATH_train)
data.drop(columns=['wine_ID'], inplace=True)

data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,wine_type,target
0,7.2,0.16,0.26,7.1,0.054,41.0,224.0,0.997,3.38,0.55,10.1,0,5
1,7.3,0.22,0.31,2.3,0.018,45.0,80.0,0.989,3.06,0.34,12.9,0,7
2,8.9,0.13,0.49,1.0,0.028,6.0,24.0,0.993,2.91,0.32,9.9,0,5
3,6.0,0.17,0.29,9.7,0.044,33.0,98.0,0.995,3.12,0.36,9.2,0,6
4,7.5,0.19,0.34,2.6,0.037,33.0,125.0,0.992,3.1,0.49,11.1,0,7


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4547 entries, 0 to 4546
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4547 non-null   float64
 1   volatile acidity      4547 non-null   float64
 2   citric acid           4547 non-null   float64
 3   residual sugar        4547 non-null   float64
 4   chlorides             4547 non-null   float64
 5   free sulfur dioxide   4547 non-null   float64
 6   total sulfur dioxide  4547 non-null   float64
 7   density               4547 non-null   float64
 8   pH                    4547 non-null   float64
 9   sulphates             4547 non-null   float64
 10  alcohol               4547 non-null   float64
 11  wine_type             4547 non-null   int64  
 12  target                4547 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 461.9 KB


In [5]:
# Split the dataset into features and target variable
X = data.drop(columns=['target'])
y = data['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rd_state)

In [6]:
# Standardize the features to have a mean=0 and variance=1
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define Classification Models
Define the classification models to be used for comparaison.

In [7]:
# Import the necessary regression models from sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier


all_models = []
all_models_names = []
all_params = []


# Define the regression models and their hyperparameters

#1
model = LogisticRegression()
params = {}
all_models.append(model)
all_models_names.append('LogisticRegression')
all_params.append(params)


#2
model = RandomForestClassifier(random_state=rd_state)
params = {
    'n_estimators': [100, 500, 1000], 
    # 'max_depth': [None, 5, 10, 20],
}
all_models.append(model)
all_models_names.append('RandomForest')
all_params.append(params)


#3
model = ExtraTreesClassifier(random_state=rd_state)
params = {
    'n_estimators': [100, 500, 1000], 
    # 'max_depth': [None, 5, 10, 20],
}
all_models.append(model)
all_models_names.append('ExtraTrees')
all_params.append(params)

#4
model = SVC(degree=5)
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 10], 
}
all_models.append(model)
all_models_names.append('SVM')
all_params.append(params)

#5
model = KNeighborsClassifier()
params = {
    # 'n_neighbors': [3, 5, 7, 9, 11], 
    'n_neighbors': np.round(np.linspace(2, 20, 5), 0).astype(int), 
    'weights': ['uniform', 'distance']
}
all_models.append(model)
all_models_names.append('KNeighbors')
all_params.append(params)


#6
model = MLPClassifier(max_iter=500, random_state=rd_state)
params = {
    'hidden_layer_sizes': [(8, 16), (8, 16, 32), (16, 32, 16)], 
    # 'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.01, 0.1, 1],
}
all_models.append(model)
all_models_names.append('MLP')
all_params.append(params)

##



def create_lists_for_subset(models_names):
    models = []
    params = []
    for i, name in enumerate(all_models_names) :
        if name in models_names:
            models.append(all_models[i])
            params.append(all_params[i])        


    return models, params

# models_names = ['RandomForest', 'ExtraTrees']#, 'KNeighbors']
models_names = all_models_names

models, params = create_lists_for_subset(models_names)

# Hyperparameter Tuning with GridSearchCV
Use GridSearchCV to tune the hyperparameters of each model and find the best performing set of parameters.

In [8]:
# Import necessary libraries for hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# Create empty lists to store the best parameters and best scores for each model
dfs = []
best_params = []
best_scores = []

# Loop through each model and its corresponding hyperparameters
for model, param in zip(models, params):
    print(model.__class__.__name__)
    
    # Use GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(clone(model), param, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)
    

    # Get the best parameters and best score
    dfs.append(pd.DataFrame(grid_search.cv_results_))
    best_params.append(grid_search.best_params_)
    best_scores.append(grid_search.best_score_)

# Create a dataframe to store the best parameters and best scores for each model
df_grid_search_score = pd.DataFrame({'Model': models_names, 'Best Parameters': best_params, 'best_CVscore': best_scores})

# Display the best parameters and best scores for each model
df_grid_search_score

LogisticRegression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

RandomForestClassifier




ExtraTreesClassifier




SVC




KNeighborsClassifier




MLPClassifier




Unnamed: 0,Model,Best Parameters,best_CVscore
0,LogisticRegression,{},0.178
1,RandomForest,{'n_estimators': 500},0.381
2,ExtraTrees,{'n_estimators': 500},0.365
3,SVM,"{'C': 1, 'kernel': 'rbf'}",0.241
4,KNeighbors,"{'n_neighbors': 20, 'weights': 'distance'}",0.325
5,MLP,"{'alpha': 1, 'hidden_layer_sizes': (16, 32, 16)}",0.253


# Train and evaluate each best models 

In [12]:
# checking the model performance on the test set (using the best hyperparameters)

# Create empty lists to store the R-squared scores for each model
train_scores = []
test_scores = []

# Loop through each model and its corresponding best parameters
for model, param in zip(models, best_params):
    # Fit the model on the training data using the best parameters
    model.set_params(**param)
    model.fit(X_train, y_train)
    
    # Predict on the training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate the performance using the R-squared score
    train_score = r2_score(y_train, y_train_pred)
    test_score = r2_score(y_test, y_test_pred)
    test_score_classif = r2_score(y_test, np.round(y_test_pred, 0))
    
    # Append the R-squared scores to the corresponding lists
    train_scores.append(train_score)
    test_scores.append(test_score)

# Create a dataframe to store the R-squared scores for each model
df_test_score = pd.DataFrame({'Model': models_names, 'Train Score': train_scores, 'test_score': test_scores})

# Display the R-squared scores for each model
df_grid_search_score['test_score'] = df_test_score['test_score']
df_test_score


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Train Score,test_score
0,LogisticRegression,0.194,0.19
1,RandomForest,1.0,0.378
2,ExtraTrees,1.0,0.355
3,SVM,0.311,0.238
4,KNeighbors,1.0,0.265
5,MLP,0.317,0.244


In [13]:
df_grid_search_score.drop(columns=['Best Parameters'])


Unnamed: 0,Model,best_CVscore,test_score,test_score_classif
0,LogisticRegression,0.178,0.19,0.19
1,RandomForest,0.381,0.378,0.378
2,ExtraTrees,0.365,0.355,0.355
3,SVM,0.241,0.238,0.238
4,KNeighbors,0.325,0.265,0.265
5,MLP,0.253,0.244,0.244


# train the best model on the whole training set and make predictions on the test set

using best model for the grid scores ?
or the test on the test dataset left out from the real training dataset ?

In [15]:
# for the best model for the grid search score
i_model = df_grid_search_score['best_CVscore'].idxmax()

# for the best model on the "fake" test dataset
i_model = df_test_score['test_score'].idxmax()

model = clone(models[i_model])
name_model = df_grid_search_score['Model'][i_model].replace(' ', '').replace('_', '')
param_model = df_grid_search_score['Best Parameters'][i_model]


# load the test dataset and make prediction using the best model 
test_data = pd.read_csv(PATH_test)
train_data = pd.read_csv(PATH_train)

X_submit_train, y_submit_train = train_data.drop(columns=['wine_ID', 'target']), train_data['target']

X_submit, wine_ids = test_data.drop(columns=['wine_ID']), test_data['wine_ID']

scaler = StandardScaler()
X_submit_train, X_submit = scaler.fit_transform(X_submit_train), scaler.transform(X_submit)


# Predict on the test data using random forest
model.fit(X_submit_train, y_submit_train)
y_submit = model.predict(X_submit)

In [16]:
# Save predictions to file
param_model_str = ''
for key, value in param_model.items():
    param_model_str += key + '_' + str(value) + '_'
param_model_str = param_model_str[:-1]


filename = f'submits/wine_submit_class_{name_model}_{param_model_str}.csv'

pd.DataFrame({'wine_ID':wine_ids, 'target':y_submit}).to_csv(filename, index=False)