# Import Libraries

In [18]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score, confusion_matrix
from sklearn.utils import resample
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Load Datasets

In [2]:
data_train = pd.read_parquet('../data/processed/unbalanced_data_train.parquet')

#### Split data in 30/70 

In [3]:
label = {'Style'}
columns_set = set(data_train.columns.values)
x = data_train[list(columns_set-label)]
y = data_train[list(label)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Machine Learning
* Select technique 

In [4]:
model_params = {
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,50,100]
        }
    },
    'kneighbors_classifier':{
        'model': KNeighborsClassifier(),
        'params': {
            'algorithm': ['ball_tree', 'kd_tree']
        }
    }
}

Before we do any experiment with the classifiers, we need to treat the unbalanced classes

In [5]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


---- training kneighbors_classifier ----


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Unnamed: 0,model,best_score,best_params
0,random_forest,0.474686,{'n_estimators': 100}
1,kneighbors_classifier,0.308449,{'algorithm': 'ball_tree'}


After seing that the two algorithms that we have here, are having the best score less than 50% we decided to balanced the target column categories. 

## Resampling the Target column

In [6]:
count_df = data_train.groupby(['Style'])['Style'].count()

styles_unbalanced = [k for k, v in count_df.items() if v <= 100]

In [7]:
from sklearn.utils import resample


df_sampled =pd.DataFrame()
for j in styles_unbalanced:
    
    df_minority_j = data_train[data_train.Style==j]
    df_minority_upsampled = resample(df_minority_j, 
                                 replace=True,     
                                 n_samples=400,    
                                 stratify= df_minority_j,
                                 random_state=123)
    df_sampled = pd.concat([df_sampled, df_minority_upsampled])
     

In [8]:
data_train = pd.concat([data_train, df_sampled])


In [9]:
label = {'Style'}
columns_set = set(data_train.columns.values)
x = data_train[list(columns_set-label)]
y = data_train[list(label)]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

In [10]:
scores = []

for model_name, mp in model_params.items():
    clf1 =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=True)
    print(f'---- training {model_name} ----')
    clf1.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params': clf1.best_params_
        
    })
    
models_info_data_frame = pd.DataFrame(scores,columns=['model','best_score','best_params'])
models_info_data_frame

---- training random_forest ----


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


---- training kneighbors_classifier ----


  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Unnamed: 0,model,best_score,best_params
0,random_forest,0.7339,{'n_estimators': 100}
1,kneighbors_classifier,0.610904,{'algorithm': 'ball_tree'}


After trating the unbalanced data, we can see the algorithms score increase significantly. Also we choose the random_forest classifier.

In [11]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
print('Accuracy: {}'.format(score))
print('Precision score: ', precision_score(y_test, y_pred, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred, average='micro'))

  clf.fit(X_train, y_train)


Accuracy: 0.7514833311990439
Precision score:  0.7514833311990439
Recall score:  0.7514833311990439


# k-fold Cross Validation

Se hace seleccion de los mejores modelos usando el Training Set y k-fold Cross Validation

In [12]:
kFold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
score = (model_selection.cross_val_score(clf, X_train, y_train,  scoring = scoring, cv = kFold))
print (f"( {score.mean()}, {score.std()})")

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


( 0.7393889498719355, 0.006228471166309521)


# Final model testing

Withe the model used in **Train -set** we do the validation with **Test - set**, additionaly we compare the results  **Train -set** vs **Test - set** 

In [13]:
X = data_train.drop('Style', axis=1)
y = data_train['Style']

In [41]:
clf_final = RandomForestClassifier(n_estimators=100)
clf_final.fit(X, y)

In [99]:
data_test = pd.read_parquet('../data/interim/data_test.parquet')

In [100]:
categoric_keys = ['SugarScale', 'BrewMethod', 'Style']
data_test_dummies = pd.get_dummies(data_test, columns=categoric_keys[:2])
label_encoder = LabelEncoder()

data_test_dummies['Style'] = label_encoder.fit_transform(data_test_dummies['Style'])

In [101]:
def remove_outliers(data_frame: pd.DataFrame) -> None:
    numeric_keys = ['PrimaryTemp', 'ABV', 'FG', 'IBU', 'OG']
    for n in numeric_keys:
        description = data_frame[n].describe()
        max = description[7]
        min = description[3]
        data_frame.loc[((data_frame[n] <= min) | (data_frame[numeric_keys[0]] >= max)), [n]] = np.nan
        data_frame = data_frame.dropna()

In [102]:
data_test_dummies = data_test_dummies.drop_duplicates()
data_test_dummies.drop('Size(L)', axis=1, inplace=True)

In [103]:
categories_test = set(data_test_dummies.Style.unique())
categories_train = set(data_train.Style.unique())

In [104]:
categories_test == categories_train

True

In [105]:
label = {'Style'}
columns_set = set(data_test_dummies.columns.values)
X_test = data_test_dummies[list(columns_set-label)]
y_test = data_test_dummies[list(label)]

In [106]:
y_pred_test = clf_final.predict(X_test)
score = accuracy_score(y_test, y_pred_test)
print('Accuracy: {}'.format(score))
print('Precision score: ', precision_score(y_test, y_pred_test, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred_test, average='micro'))

Feature names must be in the same order as they were in fit.



Accuracy: 0.01742084942084942
Precision score:  0.01742084942084942
Recall score:  0.01742084942084942


Validating the model with the data test, we get low scores. We proceed to clean the data test set. 

In [108]:
remove_outliers(data_test_dummies)
data_test_dummies = data_test_dummies.dropna()

In [109]:
label = {'Style'}
columns_set = set(data_test_dummies.columns.values)
X_test = data_test_dummies[list(columns_set-label)]
y_test = data_test_dummies[list(label)]

y_pred_test = clf_final.predict(X_test)
score = accuracy_score(y_test, y_pred_test)
print('Accuracy: {}'.format(score))
print('Precision score: ', precision_score(y_test, y_pred_test, average='micro'))
print('Recall score: ', recall_score(y_test, y_pred_test, average='micro'))

Feature names must be in the same order as they were in fit.



Accuracy: 0.010465446289201078
Precision score:  0.010465446289201078
Recall score:  0.010465446289201078


| Score | Train      | Test |
| ----------- | ----------- | ----------- |
| Accuracy | 75.14%      | 1%       |
| Precision | 75.14%   | 1%        |
| Recall | 75.14%   | 1%        |


After cleaning we decided to do a cross validation test. 

In [113]:
pd.concat([data_train, data_test_dummies]).to_parquet('../data/processed/data_complete.parquet')

In [114]:
data_complete = pd.read_parquet('../data/processed/data_complete.parquet')
X = data_complete.drop('Style', axis=1)
y = data_complete['Style']

In [115]:
clf_final = RandomForestClassifier(n_estimators=100)
clf_final.fit(X, y)

Doing the cross validation with the test, we have a 73% of accuracy.

In [116]:
kFold = model_selection.KFold(n_splits=10)
scoring = 'accuracy'
score = (model_selection.cross_val_score(clf_final, X, y,  scoring = scoring, cv = kFold))
print (f"( {score.mean()}, {score.std()})")

( 0.7321866965359289, 0.17013179551547764)


| Score | Train      | Test |
| ----------- | ----------- | ----------- |
| Accuracy | 73.93%      | 73.21%       |
| STD | 0.006   | 0.170        |

# Deploying model

1. Save the model. 

In [117]:
#import pickle # Esta es una libreria de serializacion nativa de python, puede tener problemas de seguridad
from joblib import dump # libreria de serializacion

# garbar el modelo en un archivo
#dump(clf_final, '../models/random_forest_classifier.joblib')

['../models/random_forest_classifier.joblib']

# Conclusions

The dataset has a lot of beer styles, and also was unbalanced. Around 178 categories were detected. It was needed to do a resample of the classes with less than 100 elements.

The variables 'PitchRate', 'MashThickness', 'BeerID', 'StyleID', 'Name', 'PrimingAmount', 'PrimingMethod', 'UserId' were removed due to various reasons, some of the variables were having no useful information (for exapmle BeerID, StyleID and Name) to the analysis and some of them have a lot of null values (as MashThickness). The variables 'BoilSize', 'Unnamed: 0', 'Unnamed: 22', 'Unnamed: 23', 'index', 'nhbhgv' and 'ugtft' were not in the original dataset and also had a lot of inconsistency with the data, a lot of duplicated values and null values among the columns, so those columns were also removed.

After doing the first cleaning general process, we splited the dataset in two datasets, with a 30/70 proportion test and train respectively.  

In the univariate analysis, we found that the variable Size(L) had a lot of variance, and we decided to not include it in the model too. Also we found out most of the variables have a normal distribution with postive skewness and some (not in majority) with negative skewness. The variables OG and FG have a weird relationship in the way that we did the univariate analysis, that means the distributions are similar. We also found that for the categories varibles specifically SugarScale, there is problem with the class balanced, and almost 98% of the data is around the SpecificGravity SugarScale category.

For the bivariate analysis we found interesting stuffs. The variables OG and FG have almost the same behaviour with the rest of the columns. For example, both of them increases at Specific Gravity (in specific gravity units) value when the Alcohol By Volume is around 3 and 10 grades. Also these two variables has a similar behaviour with some others variables, as IBU. We expected to have a relationship between the Alcohol By Volume and the Color, however we did found a little linear relationship (found it graphically). Also We thought that the realtionship between IBU and ABV would be something, but we found that just some samples of darker beers were the most bitter ones, but in low amount of them.

For the feature engineering we did a one hot encoding for the variables SugarScale and BrewMethod, and we did a label encoding for the target class (Style), since this class has more than 170 categories. 

We select the best model using the GridSearchCV algorithm, finding that the best algorithm to use for this dataset was the RandomForestClassifier with the param of 100 estimators. However we did needed to balanced the classes, because the best score we have in the part was around 40% (accuracy). After resampling the categories with less than 100 elements, we get an accuracy around 73%. Testing it with the test dataset we had an accuracy of 73%. 



# Ayudas Y Referencias

- [Home_Brewed Beer code where we reused the resampling and modeling](https://www.kaggle.com/code/samch08/home-brewed-beer)

- [Data set](https://www.kaggle.com/datasets/jtrofe/beer-recipes)