In [1]:
# Relative locations of the training and test CSV files
PATH_train, PATH_test = "data/wine_train.csv", "data/wine_test.csv"

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.base import clone
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Seed reused by the train/test split and by the seeded estimators
rd_state = 42

# Display floats with 3 digits of precision in pandas output
pd.options.display.precision = 3

# Load and Prepare the Dataset

In [3]:
# Read the training CSV and discard the wine_ID column before modelling
data = pd.read_csv(PATH_train).drop(columns=['wine_ID'])

# Preview the first rows
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,wine_type,target
0,7.2,0.16,0.26,7.1,0.054,41.0,224.0,0.997,3.38,0.55,10.1,0,5
1,7.3,0.22,0.31,2.3,0.018,45.0,80.0,0.989,3.06,0.34,12.9,0,7
2,8.9,0.13,0.49,1.0,0.028,6.0,24.0,0.993,2.91,0.32,9.9,0,5
3,6.0,0.17,0.29,9.7,0.044,33.0,98.0,0.995,3.12,0.36,9.2,0,6
4,7.5,0.19,0.34,2.6,0.037,33.0,125.0,0.992,3.1,0.49,11.1,0,7


In [4]:
# Summarise column dtypes, non-null counts and memory usage of the training frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4547 entries, 0 to 4546
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4547 non-null   float64
 1   volatile acidity      4547 non-null   float64
 2   citric acid           4547 non-null   float64
 3   residual sugar        4547 non-null   float64
 4   chlorides             4547 non-null   float64
 5   free sulfur dioxide   4547 non-null   float64
 6   total sulfur dioxide  4547 non-null   float64
 7   density               4547 non-null   float64
 8   pH                    4547 non-null   float64
 9   sulphates             4547 non-null   float64
 10  alcohol               4547 non-null   float64
 11  wine_type             4547 non-null   int64  
 12  target                4547 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 461.9 KB


In [5]:
# The label is the 'target' column; every other column is a feature
y = data['target']
X = data.drop(columns=['target'])

# Hold out 20% of the rows as a local test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rd_state
)

# Boolean masks selecting each wine type inside both partitions
is_type0_train = X_train['wine_type'] == 0
is_type1_train = X_train['wine_type'] == 1
is_type0_test = X_test['wine_type'] == 0
is_type1_test = X_test['wine_type'] == 1

# Per-type training data (wine_type is constant within a subset, so it is dropped)
X0_train = X_train[is_type0_train].drop(columns=['wine_type'])
X1_train = X_train[is_type1_train].drop(columns=['wine_type'])
y0_train = y_train[is_type0_train]
y1_train = y_train[is_type1_train]

# Per-type held-out data
X0_test = X_test[is_type0_test].drop(columns=['wine_type'])
X1_test = X_test[is_type1_test].drop(columns=['wine_type'])
y0_test = y_test[is_type0_test]
y1_test = y_test[is_type1_test]

# Features and label side by side for each wine type (training rows only)
data0 = pd.concat([X0_train, y0_train], axis=1)
data1 = pd.concat([X1_train, y1_train], axis=1)



In [6]:
# Standardise the combined features; mean/variance are learned on the training split only
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Each wine type gets its own scaler, again fitted on its training rows only
scaler0 = StandardScaler().fit(X0_train)
scaler1 = StandardScaler().fit(X1_train)

X0_train, X0_test = scaler0.transform(X0_train), scaler0.transform(X0_test)
X1_train, X1_test = scaler1.transform(X1_train), scaler1.transform(X1_test)

# Define Regression Models
Define the regression models to be used for comparison.

In [7]:
# Import the necessary regression models from sklearn

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor


# One entry per candidate: (display name, unfitted estimator, hyperparameter grid)
_candidates = [
    ('LinearRegression', LinearRegression(), {}),
    (
        'RandomForest',
        RandomForestRegressor(random_state=rd_state),
        {'n_estimators': [100, 500, 1000]},
    ),
    (
        'ExtraTrees',
        ExtraTreesRegressor(random_state=rd_state),
        {'n_estimators': [100, 500, 1000]},
    ),
    (
        'SVM',
        SVR(degree=5),
        {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [0.01, 0.1, 1, 10],
        },
    ),
    (
        'KNeighbors',
        KNeighborsRegressor(),
        {
            # five neighbour counts spread evenly between 2 and 20
            'n_neighbors': np.round(np.linspace(2, 20, 5), 0).astype(int),
            'weights': ['uniform', 'distance'],
        },
    ),
    (
        'MLP',
        MLPRegressor(max_iter=500, random_state=rd_state),
        {
            'hidden_layer_sizes': [(8, 16), (8, 16, 32), (16, 32, 16)],
            'alpha': [0.0001, 0.01, 0.1, 1],
        },
    ),
]

# Parallel lists consumed by the rest of the notebook (same index = same model)
all_models_names = [name for name, _, _ in _candidates]
all_models = [estimator for _, estimator, _ in _candidates]
all_params = [grid for _, _, grid in _candidates]

##



def create_lists_for_subset(models_names, registry_names=None,
                            registry_models=None, registry_params=None):
    """Select estimators and hyperparameter grids by model name.

    The results follow the order of ``models_names`` so that they stay
    aligned with that list when the three are zipped together or placed
    side by side in a DataFrame.

    Parameters
    ----------
    models_names : str or iterable of str
        Name(s) of the models to select. A single string is treated as a
        one-element selection.
    registry_names, registry_models, registry_params : list, optional
        Parallel lists holding the known names, estimators and grids.
        They default to the notebook-level ``all_models_names``,
        ``all_models`` and ``all_params``.

    Returns
    -------
    models : list
        Estimators matching ``models_names``, in the same order.
    params : list
        Hyperparameter grids matching ``models_names``, in the same order.

    Raises
    ------
    ValueError
        If a requested name is not present in the registry.
    """
    # Fall back to the notebook globals only when no explicit registry is given
    if registry_names is None:
        registry_names = all_models_names
    if registry_models is None:
        registry_models = all_models
    if registry_params is None:
        registry_params = all_params

    # A bare string would otherwise be iterated character by character
    if isinstance(models_names, str):
        requested = [models_names]
    else:
        requested = list(models_names)

    # First occurrence wins if the registry ever holds a duplicated name
    position = {}
    for i, name in enumerate(registry_names):
        position.setdefault(name, i)

    # Fail loudly here instead of producing lists shorter than models_names
    unknown = [name for name in requested if name not in position]
    if unknown:
        raise ValueError(
            f"Unknown model name(s): {unknown}; available: {list(registry_names)}"
        )

    models = [registry_models[position[name]] for name in requested]
    params = [registry_params[position[name]] for name in requested]

    return models, params

# Names of the models to tune and evaluate. Every registered model is used here;
# to restrict the run, assign a list of names instead, e.g. ['RandomForest', 'ExtraTrees'].
# Note: this binds the same list object as all_models_names (no copy is made).
models_names = all_models_names

# Estimators and hyperparameter grids matching models_names
models, params = create_lists_for_subset(models_names)

# Hyperparameter Tuning with GridSearchCV
Use GridSearchCV to tune the hyperparameters of each model and find the best performing set of parameters.

In [8]:
# specific model for each wine type

from sklearn.model_selection import GridSearchCV

# Create empty lists to store the best parameters and best scores for each model
def _tune(estimator, grid, X, y):
    """Run a 5-fold grid search scored with R^2 on a fresh copy of ``estimator``."""
    search = GridSearchCV(clone(estimator), grid, cv=5, scoring='r2')
    search.fit(X, y)
    return search


# Winning hyperparameters and their mean CV score, per model and per wine type
best_params0, best_scores0 = [], []
best_params1, best_scores1 = [], []

for model, param in zip(models, params):
    # Progress trace: one line per estimator class
    print(model.__class__.__name__)

    # Wine type 0
    grid_search0 = _tune(model, param, X0_train, y0_train)
    best_params0.append(grid_search0.best_params_)
    best_scores0.append(grid_search0.best_score_)

    # Wine type 1
    grid_search1 = _tune(model, param, X1_train, y1_train)
    best_params1.append(grid_search1.best_params_)
    best_scores1.append(grid_search1.best_score_)


# One summary table per wine type
df_grid_search_score0 = pd.DataFrame(
    {'Model': models_names, 'best_params': best_params0, 'best_CVscore': best_scores0}
)
df_grid_search_score1 = pd.DataFrame(
    {'Model': models_names, 'best_params': best_params1, 'best_CVscore': best_scores1}
)

df_grid_search_score0


LinearRegression
RandomForestRegressor
ExtraTreesRegressor
SVR
KNeighborsRegressor
MLPRegressor




Unnamed: 0,Model,best_params,best_CVscore
0,LinearRegression,{},0.272
1,RandomForest,{'n_estimators': 1000},0.455
2,ExtraTrees,{'n_estimators': 1000},0.478
3,SVM,"{'C': 1, 'kernel': 'rbf'}",0.375
4,KNeighbors,"{'n_neighbors': 16, 'weights': 'distance'}",0.444
5,MLP,"{'alpha': 1, 'hidden_layer_sizes': (8, 16, 32)}",0.372


In [9]:
# Grid-search summary for the models fitted on the wine_type == 1 subset
df_grid_search_score1

Unnamed: 0,Model,best_params,best_CVscore
0,LinearRegression,{},0.333
1,RandomForest,{'n_estimators': 1000},0.437
2,ExtraTrees,{'n_estimators': 1000},0.452
3,SVM,"{'C': 1, 'kernel': 'rbf'}",0.376
4,KNeighbors,"{'n_neighbors': 20, 'weights': 'distance'}",0.399
5,MLP,"{'alpha': 1, 'hidden_layer_sizes': (16, 32, 16)}",0.338


# Train and evaluate each best model

In [10]:
# checking the model performance on the test set (models depend on the wine type)

# Create empty lists to store the R-squared scores for each model
def _fit_and_score(base_model, best_params, X_tr, y_tr, X_te, y_te):
    """Refit a copy of ``base_model`` with ``best_params`` and score it.

    Returns the fitted estimator, its train and test predictions, and the
    train and test R^2 scores, in that order.
    """
    estimator = clone(base_model)
    estimator.set_params(**best_params)
    estimator.fit(X_tr, y_tr)

    pred_tr = estimator.predict(X_tr)
    pred_te = estimator.predict(X_te)

    return estimator, pred_tr, pred_te, r2_score(y_tr, pred_tr), r2_score(y_te, pred_te)


# R^2 over both wine types together, then per wine type
train_scores, test_scores, test_scores_classif = [], [], []
train0_scores, test0_scores = [], []
train1_scores, test1_scores = [], []

# Ground truth stacked in the same order as the predictions: type 0 first, then type 1
y_train_concat = np.concatenate((y0_train, y1_train))
y_test_concat = np.concatenate((y0_test, y1_test))

for model, param0, param1 in zip(models, best_params0, best_params1):

    # Wine type 0, refitted with its own best hyperparameters
    model0, y_train_pred0, y_test_pred0, train0_score, test0_score = _fit_and_score(
        model, param0, X0_train, y0_train, X0_test, y0_test
    )
    train0_scores.append(train0_score)
    test0_scores.append(test0_score)

    # Wine type 1, refitted with its own best hyperparameters
    model1, y_train_pred1, y_test_pred1, train1_score, test1_score = _fit_and_score(
        model, param1, X1_train, y1_train, X1_test, y1_test
    )
    train1_scores.append(train1_score)
    test1_scores.append(test1_score)

    # Overall performance of the pair of per-type models
    y_train_pred = np.concatenate((y_train_pred0, y_train_pred1))
    y_test_pred = np.concatenate((y_test_pred0, y_test_pred1))

    train_scores.append(r2_score(y_train_concat, y_train_pred))
    test_scores.append(r2_score(y_test_concat, y_test_pred))
    # Same test score after rounding the predictions to whole numbers
    test_scores_classif.append(r2_score(y_test_concat, np.round(y_test_pred, 0)))


# Collect every score in one table, one row per model
df_test_score_bytype = pd.DataFrame({
    'Model': models_names,
    'train_score': train_scores,
    'test_score': test_scores,
    'test_score_classif': test_scores_classif,
    'train_score0': train0_scores,
    'test_score0': test0_scores,
    'train_score1': train1_scores,
    'test_score1': test1_scores,
})

df_test_score_bytype




Unnamed: 0,Model,train_score,test_score,test_score_classif,train_score0,test_score0,train_score1,test_score1
0,LinearRegression,0.31,0.319,0.207,0.286,0.282,0.365,0.362
1,RandomForest,0.929,0.437,0.357,0.929,0.427,0.925,0.406
2,ExtraTrees,1.0,0.468,0.359,1.0,0.458,1.0,0.437
3,SVM,0.525,0.358,0.27,0.512,0.326,0.551,0.389
4,KNeighbors,1.0,0.391,0.314,1.0,0.375,1.0,0.375
5,MLP,0.465,0.362,0.278,0.443,0.332,0.521,0.387


# Train the best model on the whole training set and make predictions on the test set

Using the best model for each wine type.

In [11]:
# Pick, for each wine type, the model with the highest R^2 on the local held-out split.
# NOTE: choosing on that split makes its scores optimistic. To select on the
# cross-validation score instead, use df_grid_search_score0['best_CVscore'].idxmax()
# and df_grid_search_score1['best_CVscore'].idxmax().
i_model0 = df_test_score_bytype['test_score0'].idxmax()
i_model1 = df_test_score_bytype['test_score1'].idxmax()

# Rebuild each winner as an unfitted copy and apply the hyperparameters found by
# the grid search (without set_params the estimator would train with its defaults)
name_model0 = df_grid_search_score0['Model'][i_model0].replace(' ', '').replace('_', '')
param_model0 = df_grid_search_score0['best_params'][i_model0]
model0 = clone(models[i_model0])
model0.set_params(**param_model0)

name_model1 = df_grid_search_score1['Model'][i_model1].replace(' ', '').replace('_', '')
param_model1 = df_grid_search_score1['best_params'][i_model1]
model1 = clone(models[i_model1])
model1.set_params(**param_model1)


# Reload the full training set (no local hold-out this time) and the test set to predict
test_data = pd.read_csv(PATH_test)
train_data = pd.read_csv(PATH_train)

X_submit_train, y_submit_train = train_data.drop(columns=['wine_ID', 'target']), train_data['target']

# Split the full training set by wine type
is_type0_train = X_submit_train['wine_type'] == 0
is_type1_train = X_submit_train['wine_type'] == 1
X0_submit_train = X_submit_train[is_type0_train].drop(columns=['wine_type'])
X1_submit_train = X_submit_train[is_type1_train].drop(columns=['wine_type'])
y0_submit_train = y_submit_train[is_type0_train]
y1_submit_train = y_submit_train[is_type1_train]

# Split the test set the same way, keeping the IDs aligned with their rows
X_submit, wine_ids = test_data.drop(columns=['wine_ID']), test_data['wine_ID']
is_type0_submit = X_submit['wine_type'] == 0
is_type1_submit = X_submit['wine_type'] == 1
X0_submit = X_submit[is_type0_submit].drop(columns=['wine_type'])
X1_submit = X_submit[is_type1_submit].drop(columns=['wine_type'])

wine0_ids = wine_ids[is_type0_submit]
wine1_ids = wine_ids[is_type1_submit]

# Fresh scalers fitted on the full per-type training data, then applied to the test rows
scaler0, scaler1 = StandardScaler(), StandardScaler()
X0_submit_train, X0_submit = scaler0.fit_transform(X0_submit_train), scaler0.transform(X0_submit)
X1_submit_train, X1_submit = scaler1.fit_transform(X1_submit_train), scaler1.transform(X1_submit)


# Fit the selected model of each wine type and predict its share of the test set
model0.fit(X0_submit_train, y0_submit_train)
model1.fit(X1_submit_train, y1_submit_train)
y0_submit = model0.predict(X0_submit)
y1_submit = model1.predict(X1_submit)




In [20]:
filename = 'submits/wine_submit_bytype.csv'

# to_csv does not create missing folders, so make sure the output folder exists
Path(filename).parent.mkdir(parents=True, exist_ok=True)

# IDs and predictions are stacked in the same order: wine type 0 first, then type 1
submission = pd.DataFrame({
    'wine_ID': np.concatenate((wine0_ids, wine1_ids)),
    'target': np.concatenate((y0_submit, y1_submit)),
})
submission.to_csv(filename, index=False)