In [1]:
import os

In [11]:
from src.download_data import download_data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer

In [3]:
# Load the white-wine table through the project's helper.
# NOTE(review): presumably returns a pandas DataFrame with a 'quality' column
# (later cells drop/select that column) — confirm against src.download_data.
white_wine_df = download_data('wine-quality', 'winequality-white.csv')

# Data splitting

In [4]:
# Hold out 25% of the rows for the final evaluation; the fixed seed keeps the
# partition identical across kernel restarts.
train_df, test_df = train_test_split(
    white_wine_df,
    test_size=0.25,
    random_state=123,
)

# Split each partition into the feature matrix and the 'quality' target.
X_train, y_train = train_df.drop(columns=['quality']), train_df['quality']
X_test, y_test = test_df.drop(columns=['quality']), test_df['quality']

# Preprocessing and pipeline

In [5]:
# The eleven feature columns, all treated as numeric.
numeric_features = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
]

# Kept as a one-step pipeline so further numeric steps can be appended later.
numeric_transformer = make_pipeline(StandardScaler())

# Scale the listed columns; with ColumnTransformer's default settings any
# column not listed here is dropped.
preprocessor = ColumnTransformer(
    [("num", numeric_transformer, numeric_features)]
)

# Cross-validation on different models

In [6]:
# Maps model name -> Series of fold-averaged cross-validation metrics;
# filled in by the model-comparison loop in the next cell.
results_dict = dict()

In [7]:
# Candidate estimators, all evaluated behind the same preprocessing step.
# DummyRegressor is the baseline the other two must beat.
models = {
    "dummyregressor": DummyRegressor(),
    "ridge": Ridge(),
    # Random forests are stochastic: without a fixed seed the metrics reported
    # below change on every Restart & Run All. Seeded the same way as the
    # forest used in the hyperparameter search.
    "randomforest": RandomForestRegressor(random_state=2020),
}

# Result label -> scikit-learn scorer name. The error scorers are negated by
# scikit-learn so that, for every metric, a higher value means a better model.
scoring = {
    "neg_mean_squared_error": "neg_mean_squared_error",
    "neg_root_mean_square": "neg_root_mean_squared_error",
    "neg_mean_absolute_error": "neg_mean_absolute_error",
    "r2": "r2",
}

for model_name, model in models.items():
    # A fresh pipeline per model so scaling is re-fit inside each CV fold.
    pipe = make_pipeline(preprocessor, model)
    scores = cross_validate(
        pipe,
        X_train,
        y_train,
        return_train_score=True,
        scoring=scoring,
    )
    # Average every metric (plus fit/score time) over the folds.
    scores_df = pd.DataFrame(scores).mean()
    results_dict[model_name] = scores_df

In [8]:
# One column per model, one row per fold-averaged metric.
pd.DataFrame(data=results_dict)

Unnamed: 0,dummyregressor,ridge,randomforest
fit_time,0.014554,0.010238,1.613766
score_time,0.007105,0.005526,0.027779
test_neg_mean_squared_error,-0.789925,-0.579467,-0.395085
train_neg_mean_squared_error,-0.789685,-0.568744,-0.05586
test_neg_root_mean_square,-0.888473,-0.761035,-0.628162
train_neg_root_mean_square,-0.888624,-0.754138,-0.23634
test_neg_mean_absolute_error,-0.676655,-0.590999,-0.457988
train_neg_mean_absolute_error,-0.676591,-0.587174,-0.170919
test_r2,-0.00076,0.2655,0.499623
train_r2,0.0,0.279763,0.929264


We chose `DummyRegressor` as the baseline here. Both `Ridge` and `RandomForestRegressor` perform better than the baseline `DummyRegressor`. Moreover, `RandomForestRegressor` appears to be the better model when compared with `Ridge`: its cross-validation scores for negative mean squared error, negative root mean squared error and negative mean absolute error are all higher (i.e. closer to zero, meaning smaller errors), and its R² score is higher as well. Note, however, that its train scores are much better than its validation scores (train R² of about 0.93 versus validation R² of about 0.50), which suggests that it is overfitting. Therefore, for predicting the test set, we will use the `RandomForestRegressor` model, but further investigation is still needed in the future.

# Hyperparameter optimization with RandomForestRegressor

In [16]:
# Pipeline to tune: the shared preprocessing followed by a seeded random forest.
# The step names are the ones make_pipeline would generate, so the
# "randomforestregressor__" prefixes in the search space resolve correctly.
pipe_randomforest = Pipeline(
    steps=[
        ("columntransformer", preprocessor),
        ("randomforestregressor", RandomForestRegressor(random_state=2020)),
    ]
)

# Search space: 7 forest sizes x 4 depth limits = 28 combinations.
param_grid = {
    "randomforestregressor__n_estimators": [10, 50, 100, 150, 200, 250, 300],
    "randomforestregressor__max_depth": [3, 5, 7, 10],
}

# n_iter equals the number of combinations above, so every candidate is tried.
# No scoring is passed, so the estimator's default score is used.
random_search = RandomizedSearchCV(
    estimator=pipe_randomforest,
    param_distributions=param_grid,
    n_iter=28,
    cv=3,
    n_jobs=-1,
    random_state=2020,
)




In [18]:
# Run the search on the training partition only; the test set stays untouched
# until the final evaluation.
random_search.fit(X=X_train, y=y_train)

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('num',
                                                                               Pipeline(steps=[('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['fixed '
                                                                                'acidity',
                                                                                'volatile '
                                                                                'acidity',
                                                                                'citric '
                                                                                'acid',
                                                  

In [19]:
# Hyperparameter combination with the highest mean cross-validation score.
random_search.best_params_

{'randomforestregressor__n_estimators': 300,
 'randomforestregressor__max_depth': 10}

# Final results by using RandomForestRegressor

In [20]:
# Mean cross-validation score of the best candidate found by the search.
cv_best_score = random_search.best_score_
# Scores of the refit best pipeline on the training data and on the held-out test data.
train_score = random_search.score(X_train, y_train)
test_score = random_search.score(X_test, y_test)

print(f"Random Search best model score: {cv_best_score!s}")
print(f"Train score on the train set: {train_score!s}")
print(f"Test score on the test set: {test_score!s}")

Random Search best model score: 0.43738839475956953
Train score on the train set: 0.7260837603778569
Test score on the test set: 0.4311472751504126


From the above, we can see that the final test score (R²) is around 0.4311, which is not very high. Therefore, `RandomForestRegressor` may not be a good model to use here. However, we can explore other models to try to improve our test score.