In [1]:
# Make the project-local packages under <parent>/src (e.g. src/ds/) importable
# from this notebook. Originally written for one developer machine; it assumes
# the notebook's working directory is a direct child of the project root.
import os
import sys

current_workpath = os.getcwd()
print("Current workpath:", current_workpath)

parent_folder = os.path.dirname(current_workpath)
print("Parent_folder:", parent_folder)

# os.path.join keeps the path portable, and the membership check stops
# sys.path from collecting duplicate entries when this cell is re-run in the
# same kernel.
src_folder = os.path.join(parent_folder, 'src')
if src_folder not in sys.path:
    sys.path.append(src_folder)
print(src_folder)

Current workpath: /home/tina4aiml/dev/notebooks
Parent_folder: /home/tina4aiml/dev
/home/tina4aiml/dev/src


# read in data

In [None]:
# Progress message printed before the data-loading cells.
print ('read in data...')

In [None]:
# --- Raw input ---------------------------------------------------------------
# The raw CSV location is assembled downstream as path + name + format.
rawfile_path = '../data/raw/'
rawfile_name = 'beer_reviews'
rawfile_format = '.csv'

# Data dictionary (CSV) describing the raw columns.
datadict_path = '../references/Data_Dict.csv'

# --- Output folders (all relative to the notebook's working directory) -------
interim_folder_path = '../data/interim/'
processed_folder_path = '../data/processed/'
model_folder_path = '../models/'

import pandas as pd
import numpy as np


In [2]:
# Load the raw reviews and the data dictionary.
raw_csv_path = f"{rawfile_path}{rawfile_name}{rawfile_format}"
df_raw = pd.read_csv(raw_csv_path)
data_dict = pd.read_csv(datadict_path)

# Columns whose 'API Expected Parameter' entry is 'No' are dropped from the
# raw frame.
not_api_parameter = data_dict['API Expected Parameter'] == 'No'
features_drop = data_dict.loc[not_api_parameter, 'Column'].tolist()

# The entry 'beer_style (target)' must not be dropped; list.remove raises
# ValueError if that entry is missing from the drop list.
features_drop.remove('beer_style (target)')
df_prep = df_raw.drop(columns=features_drop)

In [3]:
from ds.data.sets import pop_target

# Separate the 'beer_style' target from the feature frame. to_numpy=False is
# passed so the results can be used as pandas objects in the cells below
# (`df.columns`, `target.unique()`).
df, target = pop_target(df_prep, 'beer_style', to_numpy=False)

# Feature column labels, in frame order.
df_features = list(df.columns)

In [None]:
# Persist lookup arrays under the processed-data folder. np.save appends the
# '.npy' extension to each file name.

# Distinct target values.
beer_style_values = target.unique()
np.save(processed_folder_path + 'beer_style', beer_style_values)

# Feature column labels.
np.save(processed_folder_path + 'features_label', df_features)

# Distinct brewery names.
brewery_name_values = df['brewery_name'].unique()
np.save(processed_folder_path + 'brewery_name', brewery_name_values)

In [4]:
# Preview the first rows of the feature frame (target already removed).
df.head()

Unnamed: 0,brewery_name,review_aroma,review_appearance,review_palate,review_taste,beer_abv
0,Vecchio Birraio,2.0,2.5,1.5,1.5,5.0
1,Vecchio Birraio,2.5,3.0,3.0,3.0,6.2
2,Vecchio Birraio,2.5,3.0,3.0,3.0,6.5
3,Vecchio Birraio,3.0,3.5,2.5,3.0,5.0
4,Caldera Brewing Company,4.5,4.0,4.0,4.5,7.7


In [None]:
# Progress message printed before the preprocessing cells.
print ('Process data for training... ')

In [None]:
from ds.data.sets import DataProcessor
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Missing values are replaced with the column mean.
imputer = SimpleImputer(strategy="mean")
# Standardises features (StandardScaler defaults: zero mean, unit variance).
scaler = StandardScaler()

In [None]:
# Hand the scaler and imputer to the project's DataProcessor helper.
# NOTE(review): DataProcessor lives in ds.data.sets and is not shown here; how
# and in what order it applies the two transformers should be confirmed there.
data_processor = DataProcessor(scaler, imputer)

In [None]:
# Run the feature frame through the project's DataProcessor.
# NOTE(review): `dest` is presumably where the helper writes its interim
# artifacts and `hashbuckets` presumably sizes a hashing step for categorical
# columns; confirm both in ds.data.sets.
X_proceesed = data_processor.process_dataframe(
    df,
    dest=interim_folder_path,  # same value as the literal "../data/interim/"
    hashbuckets=1000,
)

In [None]:
# Wrap element 1 of the processor output in a DataFrame and relabel its
# columns with the original feature names (raises ValueError if the number of
# columns does not match).
# NOTE(review): what element 0 of X_proceesed holds is not visible here.
df_processed = pd.DataFrame(X_proceesed[1]).set_axis(df_features, axis=1)

In [None]:
from ds.data.sets import split_sets_random

In [None]:
# Randomly split the processed features and the target into train /
# validation / test sets (test_ratio=0.2).
# to_numpy=False: the cells that follow read `.index`, column labels and
# `.nunique()` from the returned splits, which only work on pandas objects.
# With to_numpy=True the helper presumably returns NumPy arrays (as the flag
# does for pop_target), which would make those cells fail.
# NOTE(review): confirm that save_sets_v2 accepts pandas objects.
X_train, y_train, X_val, y_val, X_test, y_test = split_sets_random(df_processed, target=target, test_ratio=0.2, to_numpy=False)

In [None]:
# Keep a copy of the original (unprocessed) feature rows that landed in the
# training split, looked up by index label, and pickle it to the interim folder.
train_snapshot_path = interim_folder_path + 'df_X_train_v3.pkl'
X_train_v3_unencoded = df.loc[X_train.index]
X_train_v3_unencoded.to_pickle(train_snapshot_path)

# Size summary of the training split.
print(f"Total of {len(X_train)} records, with {X_train['brewery_name'].nunique()} unqiue brewery_names in X_train, and {y_train.nunique()} unique beer_style in y_train.")

In [None]:
# Keep a copy of the original (unprocessed) feature rows that landed in the
# validation split, looked up by index label, and pickle it to the interim
# folder. The leftover debugging dump of the whole frame was removed so this
# cell matches the train/test snapshot cells; use
# `X_val_v3_unencoded.head()` for an ad-hoc look.
X_val_v3_unencoded = df.loc[X_val.index]
X_val_v3_unencoded.to_pickle(interim_folder_path + 'df_X_val_v3.pkl')

# Size summary of the validation split.
print(f"Total of {len(X_val)} records, with {X_val['brewery_name'].nunique()} unqiue brewery_names in X_val, and {y_val.nunique()} unique beer_style in y_val.")

In [None]:
# Keep a copy of the original (unprocessed) feature rows that landed in the
# test split, looked up by index label, and pickle it to the interim folder.
# NOTE(review): this file name carries an '_unencoded' suffix while the
# train/val snapshots ('df_X_train_v3.pkl', 'df_X_val_v3.pkl') do not; confirm
# which name downstream readers expect before harmonising.
test_snapshot_path = interim_folder_path + 'df_X_test_v3_unencoded.pkl'
X_test_v3_unencoded = df.loc[X_test.index]
X_test_v3_unencoded.to_pickle(test_snapshot_path)

# Size summary of the test split.
print(f"Total of {len(X_test)} records, with {X_test['brewery_name'].nunique()} unqiue brewery_names in X_test, and {y_test.nunique()} unique beer_style in y_test.")

In [None]:
from ds.data.sets import save_sets_v2

In [None]:
# Persist the six splits with the '_v3' suffix.
# processed_folder_path holds the same value as the literal '../data/processed/'.
save_sets_v2(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    path=processed_folder_path,
    suffix='_v3',
)

# train model

In [None]:
from ds.data.sets import load_sets_v2

In [None]:
# Reload the '_v3' splits from the processed-data folder
# (processed_folder_path holds the same value as the literal '../data/processed/').
(
    X_train_v3, y_train_v3,
    X_val_v3, y_val_v3,
    X_test_v3, y_test_v3,
) = load_sets_v2(path=processed_folder_path, suffix='_v3')

In [None]:
# Progress message printed before the training cells.
print ('Traing with input data... ')

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Learn the label -> integer mapping from the training labels only, then
# persist the fitted encoder next to the model artifacts.
label_encoder = LabelEncoder().fit(y_train_v3)
joblib.dump(label_encoder, model_folder_path+'ANN_T_label_encoder.joblib')

# Encode every split with the same fitted mapping. LabelEncoder.transform
# raises ValueError for a label that was not seen during fit.
y_train_v3_encoded, y_val_v3_encoded, y_test_v3_encoded = (
    label_encoder.transform(split_labels).astype(int)
    for split_labels in (y_train_v3, y_val_v3, y_test_v3)
)

In [None]:
# Cast all three feature splits to float before feeding them to the network.
X_train_v3, X_val_v3, X_test_v3 = (
    feature_split.astype(float)
    for feature_split in (X_train_v3, X_val_v3, X_test_v3)
)

# Feature labels saved earlier in this notebook; their count is used as the
# network's input width. allow_pickle=True is needed for object arrays and
# should only ever be used on files this project wrote itself.
features_names = np.load(f"{processed_folder_path}features_label.npy", allow_pickle=True)
features_names_n = len(features_names)

In [None]:
# Distinct target values saved earlier in this notebook; their count is used
# as the number of output classes. allow_pickle=True is needed for object
# arrays and should only ever be used on files this project wrote itself.
target_class = np.load(f"{processed_folder_path}beer_style.npy", allow_pickle=True)
target_class_n = len(target_class)

In [None]:
import tensorflow as tf
# Reset Keras' global state so models built in earlier runs of this kernel do
# not linger (e.g. auto-generated layer names start from scratch again).
tf.keras.backend.clear_session()

In [None]:
# Fix the random seed before any layer is created.
tf.keras.utils.set_random_seed(1)

# Network dimensions: one input per feature label, one output per target class.
input_shape = (features_names_n,)
num_classes = target_class_n

In [None]:
## Callback that multiplies the learning rate by 0.2 when val_loss has not
## improved for 3 consecutive epochs, never going below 1e-7.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-7,
)

In [None]:
# Fully connected classifier: four ReLU hidden layers of decreasing width
# followed by a softmax layer with one unit per class.
ann_t_hidden_units = (128, 64, 32, 16)
_Dense = tf.keras.layers.Dense

# Layers are created in network order; the first one carries the input shape.
ann_t_layers = [_Dense(ann_t_hidden_units[0], activation='relu', input_shape=input_shape)]
ann_t_layers += [_Dense(width, activation='relu') for width in ann_t_hidden_units[1:]]
ann_t_layers.append(_Dense(num_classes, activation='softmax'))

ANN_T_model = tf.keras.models.Sequential(ann_t_layers)

# Sparse categorical cross-entropy: targets are integer class ids rather than
# one-hot vectors.
ANN_T_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Training hyper-parameters.
n_epochs = 20
n_batchsize = 32
es_patience = 5
monitor_metric = 'val_loss'

# Stop once the monitored metric has not improved for `es_patience` epochs and
# roll the model back to its best weights.
early_stopping = EarlyStopping(
    monitor=monitor_metric,
    patience=es_patience,
    restore_best_weights=True,
)

# Train against the validation split; early stopping runs first, then the
# learning-rate schedule callback defined earlier in the notebook.
ANN_T_history = ANN_T_model.fit(
    X_train_v3,
    y_train_v3_encoded,
    epochs=n_epochs,
    batch_size=n_batchsize,
    validation_data=(X_val_v3, y_val_v3_encoded),
    callbacks=[early_stopping, reduce_lr],
)
# Recorded runtime of a previous run: 42m

In [None]:
from joblib import dump
# Persist the trained model with joblib (pickle-based).
# NOTE(review): Keras documents `model.save(...)` as its native persistence
# path; whether a pickled model loads across TensorFlow/Keras versions should
# be confirmed before relying on this artifact.
dump(ANN_T_model, model_folder_path+'ANN_T_model.joblib')
# Persist the object found at `ANN_T_model.history`.
# NOTE(review): this is presumably the History callback rather than the plain
# per-epoch metrics dict (`ANN_T_history.history`); confirm what the readers
# of this file expect.
dump(ANN_T_model.history, model_folder_path+'ANN_T_model_history.joblib')

In [None]:
# Progress message printed after the training and persistence cells.
print('Model training complete ...')

In [None]:
# Model Performance

In [None]:
# Progress message printed before the model summary.
print('Model structure...')

In [None]:
# Label the model for the summary output.
# NOTE(review): `_name` is a private attribute; passing `name=` when the model
# is constructed would be the supported way to set it. Confirm this assignment
# still takes effect on the Keras version in use.
ANN_T_model._name = 'ANN_T_model'
# Print the layer-by-layer structure and parameter counts.
ANN_T_model.summary()

In [None]:
print('Model performance on training data ...')

In [None]:
# Predicted class id per training row: index of the largest softmax output.
ANN_T_X_train_v3_predictions = ANN_T_model.predict(X_train_v3).argmax(axis=1)

In [None]:
# Print overall performance metrics for the training set.
# NOTE(review): print_overall_model_metric is a project helper (ds.models.performance);
# which metrics it reports is not visible here.
from ds.models.performance import print_overall_model_metric
print_overall_model_metric(y_train_v3_encoded,ANN_T_X_train_v3_predictions, 'ANN_T_model','Trainset')

In [None]:
# Progress message printed before the test-set evaluation cells.
print('Model performance on test data ...')

In [None]:
# Class probabilities for the test rows, kept for later inspection.
y_test_v3_pred_probs = ANN_T_model.predict(X_test_v3)

# Predicted class id per row: index of the largest probability.
y_test_v3_pred = np.argmax(y_test_v3_pred_probs, axis=1)

In [None]:
# Print overall performance metrics for the test set, using the helper
# imported in the training-metrics cell above.
print_overall_model_metric(y_test_v3_encoded, y_test_v3_pred, 'ANN_T_model','Testset')