In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
import numpy as np
import seaborn as sns

In [2]:
# Load the dataframe produced by scrapping.ipynb and discard the "Unnamed: 0"
# column (presumably an index saved with the CSV -- confirm against the scraper).
df = pd.read_csv("baseDF.csv").drop(columns="Unnamed: 0")

In [4]:
#First test set with limited columns to establish a baseline
def Case1(regressor, dfCase1):
    """Fit ``regressor`` on a small set of directly usable features and print its R2 scores.

    Parameters
    ----------
    regressor : estimator exposing ``fit`` and ``score``
        Fitted in place on the preprocessed training split.
    dfCase1 : pandas.DataFrame
        Must contain the six feature columns listed below and the target
        column ``vintage_wine_statistics_ratings_average``.

    Returns
    -------
    None
        The R2 scores on the training and test splits are printed.
    """
    ## Features on data we can directly use
    categorical_features = ['vintage_wine_is_natural']
    numeric_features = [
        'vintage_wine_vintage_type',
        'vintage_wine_taste_structure_acidity',
        'vintage_wine_taste_structure_intensity',
        'vintage_wine_taste_structure_sweetness',
        'vintage_wine_taste_structure_tannin',
    ]
    features_list = categorical_features + numeric_features

    X = dfCase1.loc[:, features_list]
    y = dfCase1.loc[:, "vintage_wine_statistics_ratings_average"]  # target is the rating
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Columns are selected by name so the mapping feature -> transformer is explicit.
    # handle_unknown="ignore": a category present only in the test split is encoded
    # as all zeros instead of making transform() raise.
    feature_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ]
    )

    # Fit the preprocessing on the training split only, then reuse it on the test split
    X_train = feature_encoder.fit_transform(X_train)
    X_test = feature_encoder.transform(X_test)

    regressor.fit(X_train, y_train)

    # Print R^2 scores
    print("R2 score on training set : ", regressor.score(X_train, y_train))
    print("R2 score on test set : ", regressor.score(X_test, y_test))

In [5]:
# Baseline: run Case1 with an ordinary linear regression on the full dataframe.
# Note that dfCase1 is just another name for df (no copy is made).
regressor = LinearRegression()
dfCase1= df
Case1(regressor, dfCase1)

R2 score on training set :  0.2448545624042886
R2 score on test set :  0.23643589741861992


OK, so the baseline has an R2 of roughly 0.24 on the test set. Let's improve it!

In [6]:
def Cleanregions(df):
    """Return a copy of ``df`` with harmonized region names and rare regions removed.

    The input dataframe is not modified. Steps, applied to the
    ``vintage_wine_region_name`` column of the copy:

    1. Remove the most common prefixes/suffixes so that variants of the same
       region share one name.
    2. Rewrite every name containing 'Saint-Émilion' as exactly 'Saint-Émilion'.
    3. Keep only rows whose region has more than 10 non-null
       ``vintage_wine_name`` entries.

    Rows with a missing region name do not raise; they are removed by step 3
    because a missing name never appears among the counted regions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``vintage_wine_region_name`` and ``vintage_wine_name``.

    Returns
    -------
    pandas.DataFrame
        Filtered copy; the original row index labels are preserved.
    """
    region_col = 'vintage_wine_region_name'

    #Creating a second dataframe for this case
    df2 = df.copy()

    #Correcting the most common prefixes/suffixes to merge data names
    #(regex=False: the fragments are literal text, not patterns)
    for fragment in (' Villages', ' Premier Cru', ' Grand Cru', 'Castillon - ', 'Blaye - '):
        df2[region_col] = df2[region_col].str.replace(fragment, '', regex=False)

    #Saint Emilion is written in a lot of different ways, harmonizing it to use it.
    #na=False makes missing region names count as "no match" instead of failing.
    is_emilion = df2[region_col].str.contains('Saint-Émilion', regex=False, na=False)
    df2.loc[is_emilion, region_col] = 'Saint-Émilion'

    #Removing the regions appearing 10 times or fewer
    countregion = df2.groupby(region_col)["vintage_wine_name"].count()
    df2 = df2[df2[region_col].isin(countregion[countregion > 10].index)].copy()

    return df2
    

In [8]:
#Second test set which is the first set plus the region names and the ratings count
def Case2(regressor, dfCase2):
    """Fit ``regressor`` on the baseline features plus region name and ratings count.

    Parameters
    ----------
    regressor : estimator exposing ``fit`` and ``score``
        Fitted in place on the preprocessed training split.
    dfCase2 : pandas.DataFrame
        Must contain the eight feature columns listed below and the target
        column ``vintage_wine_statistics_ratings_average``.

    Returns
    -------
    None
        The R2 scores on the training and test splits are printed.
    """
    ## Baseline features, plus the region name and the number of ratings
    categorical_features = ["vintage_wine_region_name", 'vintage_wine_is_natural']
    numeric_features = [
        'vintage_wine_vintage_type',
        'vintage_wine_taste_structure_acidity',
        'vintage_wine_taste_structure_intensity',
        'vintage_wine_taste_structure_sweetness',
        'vintage_wine_taste_structure_tannin',
        "vintage_wine_statistics_ratings_count",
    ]
    features_list = categorical_features + numeric_features

    X = dfCase2.loc[:, features_list]
    y = dfCase2.loc[:, "vintage_wine_statistics_ratings_average"]  # target is the rating
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Columns are selected by name so the mapping feature -> transformer is explicit.
    # handle_unknown="ignore": a region present only in the test split is encoded
    # as all zeros instead of making transform() raise.
    feature_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ]
    )

    # Fit the preprocessing on the training split only, then reuse it on the test split
    X_train = feature_encoder.fit_transform(X_train)
    X_test = feature_encoder.transform(X_test)

    regressor.fit(X_train, y_train)

    # Print R^2 scores
    print("R2 score on training set : ", regressor.score(X_train, y_train))
    print("R2 score on test set : ", regressor.score(X_test, y_test))

In [9]:
# Case 2 with a linear regression, on the dataframe with cleaned region names.
# Cleanregions works on a copy, so df itself is left untouched.
regressor = LinearRegression()
Case2(regressor, Cleanregions(df))

R2 score on training set :  0.66721748829355
R2 score on test set :  0.6688552196124722


### R2 score goes up significantly to 0.66! Now using all columns:

In [16]:
#Test with all the features in the dataframe, auto detect if it's numerical or categorical
def CaseAll(regressor, dfCase3):
    """Fit ``regressor`` on every usable column of ``dfCase3`` and print its R2 scores.

    The target column and the free-text / identifier columns (winery name,
    region name, wine name, vintage name) are excluded. Remaining columns are
    routed by dtype: numeric columns are standardized, ``object`` columns are
    one-hot encoded.

    Parameters
    ----------
    regressor : estimator exposing ``fit`` and ``score``
        Fitted in place on the preprocessed training split.
    dfCase3 : pandas.DataFrame
        Must contain ``vintage_wine_statistics_ratings_average``.

    Returns
    -------
    None
        The R2 scores on the training and test splits are printed.
    """
    target = 'vintage_wine_statistics_ratings_average'
    ## Columns that are never used as features: the target itself and the name columns
    excluded = {
        target,
        "vintage_wine_winery_name",
        "vintage_wine_region_name",
        "vintage_wine_name",
        "vintage_name",
    }
    feature_list = [col for col in dfCase3.columns if col not in excluded]

    X = dfCase3.loc[:, feature_list]
    y = dfCase3.loc[:, target]  # target is the rating
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Positions of the numerical columns
    numeric_features = [
        X.columns.get_loc(col) for col in X.select_dtypes(include=['number']).columns
    ]
    # Positions of the categorical (object dtype) columns
    categorical_features = [
        X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns
    ]
    # NOTE(review): a column that is neither numeric nor object (e.g. bool) is
    # selected by neither list, so the ColumnTransformer drops it (default
    # remainder="drop"). Confirm this is intended for boolean columns.

    # handle_unknown="ignore": a category present only in the test split is encoded
    # as all zeros instead of making transform() raise.
    feature_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ]
    )

    # Fit the preprocessing on the training split only, then reuse it on the test split
    X_train = feature_encoder.fit_transform(X_train)
    X_test = feature_encoder.transform(X_test)

    regressor.fit(X_train, y_train)

    # Print R^2 scores
    print("R2 score on training set : ", regressor.score(X_train, y_train))
    print("R2 score on test set : ", regressor.score(X_test, y_test))

In [20]:
# All auto-detected features with a linear regression, on the cleaned-region dataframe.
regressor = LinearRegression()
CaseAll(regressor,Cleanregions(df))

R2 score on training set :  0.4521056769086279
R2 score on test set :  0.4739317659804927


The score goes down: to improve it with a linear regressor, we would need to clean the outliers and prepare the data more carefully.
Instead, we will move on to other models so that we can keep the outliers.

In [18]:
# Same features with a small random forest (hand-picked, untuned hyperparameters).
# NOTE(review): no random_state is passed, so the scores can differ between runs.
regressor = RandomForestRegressor(n_estimators=20, max_leaf_nodes=25,max_depth=9)
CaseAll(regressor,Cleanregions(df))

R2 score on training set :  0.8156302658646886
R2 score on test set :  0.8066060409995337


R2 now at 0.8, we are on the right path!

---

#### Now continuing our investigation, we will try to use the wine name, with the CountVectorizer function

In [62]:
import nltk
# Download the NLTK stop-word corpus. The French stop words are excluded from the
# wine-name vectorization below to improve the relevance of the features.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
#adding vectorized names to the dataframe
def VectorizingName(df):
    """Return a new dataframe where the wine name is replaced by n-gram count columns.

    ``vintage_wine_name`` is split into unigrams and bigrams with a
    ``CountVectorizer`` (French stop words removed). The resulting count columns
    are placed first, followed by every other column of ``df``.

    The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``vintage_wine_name`` column of strings.

    Returns
    -------
    pandas.DataFrame
        One row per input row, indexed 0..n-1.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from nltk.corpus import stopwords

    # Work on a re-indexed copy: the count matrix below gets a fresh 0..n-1 index,
    # and concat(axis=1) aligns rows on the index. The caller's frame is untouched.
    df = df.reset_index(drop=True)

    #Count vectorizing name to split it into grams and test it for ML
    #losing in R² with grams of 3 and min df higher than 6, stopwords is an improvement
    cv = CountVectorizer(ngram_range=(1,2), max_df= 1000, min_df=10, stop_words=stopwords.words('french'))
    name_counts = cv.fit_transform(df['vintage_wine_name'])
    name_features = pd.DataFrame(name_counts.toarray(), columns=cv.get_feature_names_out())

    #Adding the previous columns
    return pd.concat([name_features, df.drop('vintage_wine_name', axis=1)], axis=1)

In [64]:
# Same random forest as before, now with the vectorized wine-name columns added.
# NOTE(review): no random_state is passed, so the scores can differ between runs.
regressor = RandomForestRegressor(n_estimators=20, max_leaf_nodes=25,max_depth=9)
CaseAll(regressor,VectorizingName(Cleanregions(df)))

R2 score on training set :  0.8181499606879179
R2 score on test set :  0.8109905451668868


OK, the score is similar, but the parameters are not optimized yet, so we will now tune the models.

First without the names, then with them, to see which is better.

In [95]:
#Grid search of a random forest on all the features, auto detect if it's numerical or categorical
def GridSearch(dfCase3, params=None, cv=5, random_state=None):
    """Tune a ``RandomForestRegressor`` with ``GridSearchCV`` and print its R2 scores.

    Feature selection and preprocessing: the target and the name columns are
    excluded, numeric columns are standardized, and ``object`` columns are
    one-hot encoded.

    Parameters
    ----------
    dfCase3 : pandas.DataFrame
        Must contain ``vintage_wine_statistics_ratings_average``.
    params : dict, optional
        Parameter grid passed to ``GridSearchCV``. Defaults to the single
        combination ``n_estimators=115, max_leaf_nodes=165, max_depth=9``.
    cv : int, optional
        Number of cross-validation folds (default 5).
    random_state : int or None, optional
        Seed for the random forest. The default ``None`` leaves the forest
        unseeded; pass an integer for reproducible scores.

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        The fitted search object.
    """
    if params is None:
        params = {
            'n_estimators': [115],
            'max_leaf_nodes': [165],
            'max_depth': [9],
        }

    #Regressor is set, as it impacts the parameters
    regressor = RandomForestRegressor(random_state=random_state)

    target = 'vintage_wine_statistics_ratings_average'
    ## Columns that are never used as features: the target itself and the name columns
    excluded = {
        target,
        "vintage_wine_winery_name",
        "vintage_wine_region_name",
        "vintage_wine_name",
        "vintage_name",
    }
    feature_list = [col for col in dfCase3.columns if col not in excluded]

    X = dfCase3.loc[:, feature_list]
    y = dfCase3.loc[:, target]  # target is the rating
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Positions of the numerical columns
    numeric_features = [
        X.columns.get_loc(col) for col in X.select_dtypes(include=['number']).columns
    ]
    # Positions of the categorical (object dtype) columns
    categorical_features = [
        X.columns.get_loc(col) for col in X.select_dtypes(include=['object']).columns
    ]

    # handle_unknown="ignore": a category present only in the test split is encoded
    # as all zeros instead of making transform() raise.
    feature_encoder = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown="ignore"), categorical_features),
            ('num', StandardScaler(), numeric_features),
        ]
    )

    #preprocessing train and Test set
    # NOTE(review): the encoder/scaler are fitted on the whole training split before
    # cross-validation, so each CV fold has seen preprocessing statistics from its
    # validation part. Wrapping both steps in a Pipeline would avoid this.
    X_train = feature_encoder.fit_transform(X_train)
    X_test = feature_encoder.transform(X_test)

    gridsearch = GridSearchCV(regressor, param_grid=params, cv=cv)  # cv : the number of folds to be used for CV
    gridsearch.fit(X_train, y_train)
    print("Best hyperparameters : ", gridsearch.best_params_)
    print("R2 score on training set : ", gridsearch.score(X_train, y_train))
    print("R2 score on test set : ", gridsearch.score(X_test, y_test))
    return gridsearch


In [91]:
# Tune the random forest without the vectorized wine names.
gridsearch=GridSearch(Cleanregions(df))

Best hyperparameters :  {'max_depth': 7, 'max_leaf_nodes': 165, 'n_estimators': 115}
R2 score on training set :  0.8513333408562684
R2 score on test set :  0.829210136788451


~20 minutes \
Best hyperparameters :  {'max_depth': 7, 'max_leaf_nodes': 165, 'n_estimators': 115} \
R2 score on training set :  0.8513333408562684 \
R2 score on test set :  0.829210136788451 \

In [96]:
# Tune the random forest with the vectorized wine names added as features.
gridsearch2=GridSearch(VectorizingName(Cleanregions(df)))

Best hyperparameters :  {'max_depth': 9, 'max_leaf_nodes': 165, 'n_estimators': 115}
R2 score on training set :  0.877156418432801
R2 score on test set :  0.8450551417947416
