# Package Initialisation

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from category_encoders.ordinal import OrdinalEncoder
from fastapi import FastAPI
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import label_binarize
from starlette.responses import JSONResponse
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# Import Dataset

In [None]:
# Load the raw beer-review records from the CSV file under ../data/raw
df_data_raw = pd.read_csv(filepath_or_buffer='../data/raw/beer_reviews.csv')

# Dataset Filtering

In [None]:
# Keep the raw frame untouched; all filtering happens on this independent copy
df_data_clean1 = df_data_raw.copy(deep=True)

In [None]:
# Column groups used by the later steps, built up from the smallest groups
factor_cols = ['brewery_name']
numerical_cols = ['review_aroma','review_appearance','review_palate','review_taste','beer_abv']
target_col = ['beer_style']
# Model inputs: the categorical column followed by the numerical ones
independent_cols = factor_cols + numerical_cols
# Everything that is read from the raw data: inputs plus the target
col_usable = independent_cols + target_col

In [None]:
# Narrow the frame down to the columns listed in col_usable
df_data_reduced1 = df_data_clean1[col_usable]

In [None]:
# Work on a fresh copy with every row containing a null removed
df_data_reduced2 = df_data_reduced1.copy().dropna()
# Detach the target column; the frame keeps only the independent variables
target = df_data_reduced2.pop('beer_style')
# Summary statistics of the remaining columns
df_data_reduced2.describe()

# Define transformations

In [None]:
# NOTE(review): every encoder below is fitted on all rows of the frame; if the
# train/test split is made afterwards, held-out rows influence the fitted
# categories and ranges - confirm this is intended.

# Target column: string labels -> integer class ids
target_encoder = LabelEncoder()
target_out = target_encoder.fit_transform(target)

# Numerical columns: rescaled to MinMaxScaler's default range of [0.0, 1.0]
numerical_encoder = MinMaxScaler()
df_data_reduced2[numerical_cols] = numerical_encoder.fit_transform(df_data_reduced2[numerical_cols])

# Factor column: categories -> ordinal integer codes
factor_encoder = OrdinalEncoder()
df_data_reduced2[factor_cols] = factor_encoder.fit_transform(df_data_reduced2[factor_cols])

# Dataset splitting

In [None]:
# First split: 70% of all rows for training/validation, the rest held out for testing
X_train, X_test, y_train, y_test = train_test_split(
    df_data_reduced2,
    target_out,
    random_state=42,
    train_size=0.7,
)
# Second split: 70% of the remaining rows for training, the rest for validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    random_state=42,
    train_size=0.7,
)

# ML Models

## Logistic regression

In [None]:
# Logistic-regression baseline with a generous iteration cap
logreg = LogisticRegression(max_iter=10000)
model = logreg.fit(X_train,y_train)
# predict() returns class labels taken from the estimator's classes_, so the
# result is a label even when some label is absent from the training split,
# and this cell needs no helper that is defined further down the notebook.
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

In [None]:
# Accuracy of the logistic-regression predictions on each split
acc_logreg_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
acc_logreg_val = accuracy_score(y_true=y_val, y_pred=y_pred_val)
acc_logreg_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

## Neural network

In [None]:
# Multi-layer perceptron with a hidden-layer size of 512, trained with SGD and
# an adaptive learning-rate schedule
hidden_layer_sizes=512
nn_classifier_model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, solver='sgd', learning_rate='adaptive', random_state=42)
model3 = nn_classifier_model.fit(X_train,y_train)
# predict() returns class labels taken from the estimator's classes_, so the
# result is a label even when some label is absent from the training split,
# and this cell needs no helper that is defined further down the notebook.
y_pred_train_nn01 = model3.predict(X_train)
y_pred_val_nn01 = model3.predict(X_val)
y_pred_test_nn01 = model3.predict(X_test)

In [None]:
# Accuracy of the MLP predictions on each split
acc_nn01_train = accuracy_score(y_true=y_train, y_pred=y_pred_train_nn01)
acc_nn01_val = accuracy_score(y_true=y_val, y_pred=y_pred_val_nn01)
acc_nn01_test = accuracy_score(y_true=y_test, y_pred=y_pred_test_nn01)

## Random forest

In [None]:
# Shallow random forest (trees limited to depth 4) as a first tree-based baseline
randfor = RandomForestClassifier(random_state=42,max_depth=4)
model2 = randfor.fit(X_train,y_train)
# predict() returns class labels taken from the estimator's classes_, so the
# result is a label even when some label is absent from the training split,
# and this cell needs no helper that is defined further down the notebook.
y_pred_train_rf = model2.predict(X_train)
y_pred_val_rf = model2.predict(X_val)
y_pred_test_rf = model2.predict(X_test)

In [None]:
# Accuracy of the random-forest predictions on each split
acc_rf_train = accuracy_score(y_true=y_train, y_pred=y_pred_train_rf)
acc_rf_val = accuracy_score(y_true=y_val, y_pred=y_pred_val_rf)
acc_rf_test = accuracy_score(y_true=y_test, y_pred=y_pred_test_rf)

### Perform a search on maximum depth

In [None]:
# Forest whose max_depth is left for the grid search to choose
randfor2 = RandomForestClassifier(random_state=42)
# Candidate depths 2..7 (the stop value 8 is exclusive)
max_depth = np.arange(2, 8)
param_grid1 = dict(max_depth=max_depth)
# Cross-validation settings shared by the grid searches
scoring = 'accuracy'
cv = 10

In [None]:
# Cross-validated search over param_grid1 using the module-level cv/scoring settings
clf1 = GridSearchCV(
    randfor2,
    param_grid1,
    scoring=scoring,
    cv=cv,
    verbose=3,
    return_train_score=True,
)
clf1.fit(X_train, y_train)
# Best estimator found by the search
clf1.best_estimator_
# Score of the fitted search object on each split
clf1.score(X_train, y_train)
clf1.score(X_val, y_val)
clf1.score(X_test, y_test)

### Perform a search on minimum samples per node

In [None]:
# Forest with the depth fixed at 7; min_samples_split is left for the search
randfor3 = RandomForestClassifier(max_depth=7, random_state=42)
# Five evenly spaced candidates between 2 and 256, truncated to integers
min_samples_split = np.linspace(2, 256, 5).astype(int)
param_grid2 = dict(min_samples_split=min_samples_split)

In [None]:
# Cross-validated search over param_grid2 using the module-level cv/scoring settings
clf2 = GridSearchCV(
    randfor3,
    param_grid2,
    scoring=scoring,
    cv=cv,
    verbose=3,
    return_train_score=True,
)
clf2.fit(X_train, y_train)
# Best estimator found by the search
clf2.best_estimator_
# Score of the fitted search object on the validation and test splits
clf2.score(X_val, y_val)
clf2.score(X_test, y_test)

### Perform a search on number of estimators (trees)

In [None]:
# Forest with depth 7 and min_samples_split 65 fixed; n_estimators is left for the search
randfor4 = RandomForestClassifier(min_samples_split=65, max_depth=7, random_state=42)
# Five evenly spaced candidates between 2 and 256, truncated to integers
n_estimators = np.linspace(2, 256, 5).astype(int)
param_grid3 = dict(n_estimators=n_estimators)

In [None]:
# Cross-validated search over param_grid3 using the module-level cv/scoring settings
clf3 = GridSearchCV(
    randfor4,
    param_grid3,
    scoring=scoring,
    cv=cv,
    verbose=3,
    return_train_score=True,
)
clf3.fit(X_train, y_train)
# Best estimator found by the search
clf3.best_estimator_
# Score of the fitted search object on the validation and test splits
clf3.score(X_val, y_val)
clf3.score(X_test, y_test)

## PyTorch models

### Useful functions

In [None]:
def proba_to_class(probs):
    """Return, for each row of `probs`, the column index holding the largest value."""
    best_columns = np.argmax(probs, 1)
    return best_columns

def get_device():
    """Return the 'cuda:0' device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

class PytorchDataset(Dataset):
    """
    Pytorch dataset
    ...

    Attributes
    ----------
    X_tensor : Pytorch tensor
        Features tensor
    y_tensor : Pytorch tensor
        Target tensor

    Methods
    -------
    __getitem__(index)
        Return features and target for a given index
    __len__
        Return the number of observations
    to_tensor(data)
        Convert a Pandas DataFrame/Series, Numpy array or other array-like
        to a Pytorch tensor
    """

    def __init__(self, X, y):
        self.X_tensor = self.to_tensor(X)
        self.y_tensor = self.to_tensor(y)

    def __getitem__(self, index):
        return self.X_tensor[index], self.y_tensor[index]

    def __len__(self):
        return len(self.X_tensor)

    def to_tensor(self, data):
        """Convert `data` to a tensor built with `torch.Tensor`.

        Pandas objects (including subclasses) contribute their underlying
        values; anything else is passed through `np.asarray`, so Numpy
        arrays are used as-is and plain sequences are accepted as well.
        """
        if isinstance(data, (pd.DataFrame, pd.Series)):
            data_out = data.values
        else:
            # Covers ndarray (returned unchanged) and generic array-likes,
            # so data_out is always bound before it is used
            data_out = np.asarray(data)
        return torch.Tensor(data_out)

### Neural network 1

In [None]:
class LogisticRegressionModel(nn.Module):
    """Classifier with one hidden layer of 128 ReLU units.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int, default 103
        Number of output classes (size of the output layer).

    `forward` returns raw class scores (logits). No output activation is
    applied because softmax-based losses such as `nn.CrossEntropyLoss`
    normalise the scores themselves; the arg-max class is unaffected.
    """

    def __init__(self, input_dim, num_classes=103):
        super().__init__()

        self.layer1 = nn.Linear(input_dim, 128)
        self.layerout = nn.Linear(128, num_classes)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        return self.layerout(x)

### Neural network 2

In [None]:
class LogisticRegressionModel2(nn.Module):
    """Classifier with one hidden layer of 256 ReLU units.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int, default 103
        Number of output classes (size of the output layer).

    `forward` returns raw class scores (logits). No output activation is
    applied because softmax-based losses such as `nn.CrossEntropyLoss`
    normalise the scores themselves; the arg-max class is unaffected.
    """

    def __init__(self, input_dim, num_classes=103):
        super().__init__()

        self.layer1 = nn.Linear(input_dim, 256)
        self.layerout = nn.Linear(256, num_classes)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        return self.layerout(x)

### Neural network 3

In [None]:
class DeepNeuralNet1(nn.Module):
    """Classifier with two hidden layers of 512 ReLU units and dropout (p=0.2).

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_classes : int, default 103
        Number of output classes (size of the output layer).

    `forward` returns raw class scores (logits); dropout is applied after
    each hidden layer.
    """

    def __init__(self, input_dim, num_classes=103):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, num_classes)
        # Attribute name (including its spelling) is kept so existing
        # references to it continue to work
        self.droput = nn.Dropout(0.2)

    def forward(self, x):
        x = self.droput(F.relu(self.fc1(x)))
        x = self.droput(F.relu(self.fc2(x)))
        return self.fc3(x)

### Call NN ML class constructors

In [None]:
# Instantiate the three networks; the input width is the number of feature columns
model1 = LogisticRegressionModel(input_dim=X_train.shape[1])
model2 = LogisticRegressionModel2(input_dim=X_train.shape[1])
model3 = DeepNeuralNet1(input_dim=X_train.shape[1])

### Set useful constants and functions

In [None]:
# NOTE(review): `device` is selected here but the visible code never moves the
# models or tensors onto it - confirm whether that is intended.
device = get_device()

# Wrap each split so it can be fed to a DataLoader
train_dataset = PytorchDataset(X=X_train, y=y_train)
val_dataset = PytorchDataset(X=X_val, y=y_val)
test_dataset = PytorchDataset(X=X_test, y=y_test)

# Training hyper-parameters
num_epochs = 50
batch_size = 100
batch_size_test = y_test.shape[0]  # the whole test split in one batch
learning_rate = 0.01

criterion = nn.CrossEntropyLoss()

# SGD needs the parameters to optimise as its first argument. The training
# function uses this single module-level optimiser for whichever network it is
# given, so it is built over the parameters of all three networks. A plain SGD
# step leaves parameters with an unset or zero gradient unchanged, so training
# one network does not move the others.
optimizer = torch.optim.SGD(
    [param for net in (model1, model2, model3) for param in net.parameters()],
    lr=learning_rate,
)

### Define function for NN training

In [None]:
def training_loop(model,num_epochs,batch_size):
    """Train `model` for `num_epochs` epochs and print per-epoch metrics.

    Uses the module-level `train_dataset`, `val_dataset`, `criterion` and
    `optimizer`.
    NOTE(review): the module-level `optimizer` must hold the parameters of
    `model`, otherwise the optimisation step does not update it.

    Parameters
    ----------
    model : nn.Module
        Network to train (updated in place).
    num_epochs : int
        Number of passes over the training data.
    batch_size : int
        Batch size for both the training and the validation loader.

    Returns
    -------
    The same `model` object after training.
    """
    # The loaders do not change between epochs; a shuffling DataLoader draws a
    # new order every time it is iterated, so one instance is enough.
    train_data = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Order does not matter for an aggregate validation loss
    val_data = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(num_epochs):
        training_loss = 0.0
        validation_loss = 0.0
        train_size = 0
        val_size = 0
        correct = 0

        model.train()
        for data, label in train_data:
            label = label.to(torch.long)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            # Weight each batch loss by its size so the epoch figure is a true
            # per-sample average (assumes criterion returns the batch mean -
            # TODO confirm against the criterion in use)
            n_batch = label.shape[0]
            training_loss += loss.item() * n_batch
            train_size += n_batch
            correct += (output.argmax(dim=1) == label).sum().item()

        model.eval()
        # No gradients are needed for validation
        with torch.no_grad():
            for data, label in val_data:
                label = label.to(torch.long)
                loss = criterion(model(data), label)
                n_batch = label.shape[0]
                validation_loss += loss.item() * n_batch
                val_size += n_batch

        # Each loss is normalised by the size of its own split; max(..., 1)
        # guards against an empty split
        training_loss /= max(train_size, 1)
        validation_loss /= max(val_size, 1)
        accuracy = 100.0 * correct / max(train_size, 1)
        print('Epoch: %5s, Train Loss: %6f, Validation Loss: %6f, Accuracy: %6f\n' %(str(epoch), training_loss, validation_loss, accuracy))
    return model

### Define function for NN prediction generation

In [None]:
def prediction_generation(model,batch_size):
    """Predict a class index for every sample of the module-level `test_dataset`.

    Parameters
    ----------
    model : nn.Module
        Trained network; switched to evaluation mode.
    batch_size : int
        Batch size used to iterate over the test data.

    Returns
    -------
    1-D tensor of predicted class indices, one per test sample, in the same
    order as the samples in `test_dataset`.
    """
    model.eval()
    # Not shuffled: predictions must line up with the order of the test targets
    test_data = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    batch_predictions = []
    with torch.no_grad():
        for data, _ in test_data:
            batch_predictions.append(model(data).argmax(dim=1))
    if not batch_predictions:
        return torch.empty(0, dtype=torch.long)
    # Concatenate so every batch contributes, not only the last one
    return torch.cat(batch_predictions)

### Train for three defined NN models

In [None]:
# Train the first network, then predict on the test set with the test batch size
model1 = training_loop(model1, num_epochs, batch_size)
y_pred_nnlogreg1 = prediction_generation(model1, batch_size_test)

In [None]:
# Train the second network, then predict on the test set with the test batch size
model2 = training_loop(model2, num_epochs, batch_size)
y_pred_nnlogreg2 = prediction_generation(model2, batch_size_test)

In [None]:
# Train the third network, then predict on the test set with the test batch size
model3 = training_loop(model3, num_epochs, batch_size)
y_pred_nn3 = prediction_generation(model3, batch_size_test)

# Pipeline Definition for API Deployment

In [None]:
# Categorical column: categories -> ordinal integer codes
cat_var_transformer = Pipeline(
    steps=[
        ('brewery_name_encoder', OrdinalEncoder())
    ]
)

# Numerical columns: rescaled to MinMaxScaler's default range of [0.0, 1.0]
num_var_transformer = Pipeline(
    steps=[
        ('beer_measures_encoder', MinMaxScaler())
    ]
)

# Route each column group to its matching transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('fac_cols', cat_var_transformer, factor_cols),
        ('num_cols', num_var_transformer, numerical_cols)
    ]
)

model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('log_regression', LogisticRegression(max_iter=10000)) ## This line is set to the final model used for API
    ]
)