This version uses grid search to find the best hyperparameters for model.fit on the split dataset (numeric features imputed by the mean).

By the end of this file
1) Target label encoder saved in "../models/" as 'ANN_GridS_label_encoder.joblib'
2) Hyperparameter grid search result saved in "../models/" as 'grid_result.pkl'
3) Hyperparameter grid search history saved in "../models/" as 'grid_result_history.pkl'
4) Model saved in "../models/" as 'ANN_GridS_model.joblib'
5) Model history saved in "../models/" as 'ANN_GridS_model_history.joblib'
6) Classification report on the train_v3 predictions saved in "../data/external/" as 'df_clr_ANN_GridS_X_train_v3.csv'
7) Classification report on the test_v3 predictions saved in "../data/external/" as 'df_clr_ANN_GridS_X_test_v3.csv'
8) CSV file with the original test_v3 DataFrame plus the predictions saved in "../data/external/" as 'df_X_test_v3_output.csv'

Custom functions used:
1) from ds.data.sets import load_sets_v2
2) from ds.models.performance import plot_model_learningcurve, plot_model_accuracy, plot_model_loss, print_overall_model_metric, df_classifcation_report

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed.
%load_ext autoreload
%autoreload 2

# Env set up

In [2]:
# Local-machine setup (Tina's machine only): make the project's src/ folder
# importable so that the .py modules under src/ds/ can be used from here.
import os
import sys

# The src/ folder is taken to sit next to the current working directory's parent.
current_workpath = os.getcwd()
parent_folder = os.path.dirname(current_workpath)
sys.path.append(f"{parent_folder}/src")

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Current workpath:", current_workpath)
print("Parent_folder:", parent_folder)
print(sys.path[-1])

Current workpath: /home/tina4aiml/dev/notebooks
Parent_folder: /home/tina4aiml/dev
/home/tina4aiml/dev/src


In [3]:
# Raw input file: base name, extension and containing folder.
rawfile_name = 'beer_reviews'
rawfile_format = '.csv'
rawfile_path = "../data/raw/"

# Data folders used by the cells below (all relative paths).
interim_folder_path = "../data/interim/"
processed_folder_path = "../data/processed/"
external_folder_path = "../data/external/"

# Folder where fitted encoders, search results and models are written.
model_folder_path = "../models/"

# Import data

In [4]:
import pandas as pd
import numpy as np
from ds.data.sets import load_sets_v2

In [5]:
# Load the pre-split "_v3" train / validation / test sets.
# The folder comes from the configuration cell (processed_folder_path) instead
# of a repeated literal, so the data location is defined in exactly one place.
X_train_v3, y_train_v3, X_val_v3, y_val_v3, X_test_v3, y_test_v3 = load_sets_v2(
    path=processed_folder_path, suffix='_v3'
)

X_train_v3 shape: (951968, 6)
y_train_v3 shape: (951968,)
X_val_v3 shape: (317323, 6)
y_val_v3 shape: (317323,)
X_test_v3 shape: (317323, 6)
y_test_v3 shape: (317323,)


# Encode Target

In [6]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Learn the label <-> integer mapping from the training targets only.
label_encoder = LabelEncoder()
label_encoder.fit(y_train_v3)

# Persist the fitted encoder so predictions can be decoded again later.
joblib.dump(label_encoder, model_folder_path+'ANN_GridS_label_encoder.joblib')

# Integer-encode the targets of every split with that same mapping.
y_train_v3_encoded, y_val_v3_encoded, y_test_v3_encoded = [
    label_encoder.transform(labels).astype(int)
    for labels in (y_train_v3, y_val_v3, y_test_v3)
]

# Cast the feature sets of every split to float.
X_train_v3, X_val_v3, X_test_v3 = [
    features.astype(float)
    for features in (X_train_v3, X_val_v3, X_test_v3)
]

# Define Neural Network Architecture

In [7]:
# Stored feature names and target class names (loaded with pickling enabled).
features_names = np.load(f"{processed_folder_path}features_label.npy", allow_pickle=True)
target_class = np.load(f"{processed_folder_path}beer_style.npy", allow_pickle=True)

# Network dimensions: number of input features and number of output classes.
features_names_n = len(features_names)
target_class_n = len(target_class)

In [8]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
# Reset the global state kept by Keras before any model is built in this notebook.
tf.keras.backend.clear_session()

2023-07-09 14:28:13.513111: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-09 14:28:13.579179: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Fix the random seed so repeated runs start from the same random state.
tf.keras.utils.set_random_seed(1)

# Shape of one input sample: one value per feature name.
input_shape = (features_names_n, )

In [10]:
# Function to create the TensorFlow model
def create_ANN_GridS_model(hidden_units=128, optimizer='adam', learning_rate=0.001):
    """Build and compile the feed-forward classifier used by the grid search.

    The network is Dense(hidden_units) -> Dense(64) -> Dense(32) -> Dense(16)
    -> Dense(target_class_n, softmax), with ReLU on every hidden layer. It is
    compiled with sparse categorical cross-entropy and an accuracy metric.
    The function reads the module-level names `input_shape` and
    `target_class_n`.

    Args:
        hidden_units: Width of the first hidden layer only; the remaining
            hidden layers have fixed widths (64, 32, 16).
        optimizer: Either 'adam' or 'sgd'.
        learning_rate: Learning rate passed to the chosen optimizer.

    Returns:
        The compiled `tf.keras.models.Sequential` model.

    Raises:
        ValueError: If `optimizer` is neither 'adam' nor 'sgd'. Previously any
            other value silently fell back to SGD.
    """
    # Validate up front so a typo in the search grid fails fast instead of
    # silently training with SGD.
    if optimizer not in ('adam', 'sgd'):
        raise ValueError(
            f"Unsupported optimizer {optimizer!r}; expected 'adam' or 'sgd'."
        )

    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(hidden_units, activation='relu', input_shape=input_shape),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(target_class_n, activation='softmax')
    ])

    # Keep the optimizer object in its own name rather than rebinding the
    # string parameter.
    if optimizer == 'adam':
        optimizer_instance = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer_instance = tf.keras.optimizers.SGD(learning_rate=learning_rate)

    model.compile(optimizer=optimizer_instance, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
# Wrap the model factory in a scikit-learn style estimator so GridSearchCV can use it.
# NOTE(review): the cell output echoes this line, which looks like a truncated
# warning - presumably a deprecation notice for this wrapper; confirm before
# upgrading TensorFlow.
wrapped_model = KerasClassifier(build_fn=create_ANN_GridS_model)

  wrapped_model = KerasClassifier(build_fn=create_ANN_GridS_model)


In [12]:
# Search space: two candidate values for each of five settings (32 combinations).
# hidden_units, optimizer and learning_rate are arguments of
# create_ANN_GridS_model; epochs and batch_size are training settings.
param_grid = dict(
    hidden_units=[128, 256],
    optimizer=['adam', 'sgd'],
    learning_rate=[0.001, 0.01],
    epochs=[20, 30],
    batch_size=[32, 64],
)

In [13]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(wrapped_model, param_grid, cv=3, scoring='accuracy')

# NOTE(review): the search is run on the validation split rather than the
# training split - presumably to limit runtime; confirm this is intended.
grid_result = grid_search.fit(X_val_v3, y_val_v3_encoded)

# best_estimator_ is the scikit-learn wrapper, not a Keras model. Later cells
# call .summary(), read .history and take the argmax of .predict() output, all
# of which need the underlying Keras model, so unwrap it here.
ANN_GridS_model = grid_result.best_estimator_.model
best_params = grid_result.best_params_

print("Best Parameters:", best_params)

2023-07-09 14:28:15.266024: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-09 14:28:15.278318: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-09 14:28:15.278688: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-09 14:28:15.286898: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-07-09 14:28:15.287326: I tensorflow/compile

Epoch 1/20


2023-07-09 14:28:18.201879: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-07-09 14:28:18.205525: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f0cfb35fbb0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-09 14:28:18.205564: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-07-09 14:28:18.211961: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-09 14:28:18.396558: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-07-09 14:28:18.520295: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifeti

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

KeyboardInterrupt: 

In [14]:
# Persist the fitted search object itself.
joblib.dump(grid_result, f"{model_folder_path}grid_result.pkl")

# Persist the cross-validation results (cv_results_) separately.
grid_result_history = grid_result.cv_results_
joblib.dump(grid_result_history, f"{model_folder_path}grid_result_history.pkl")

NameError: name 'grid_result' is not defined

In [None]:
# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Scale the learning rate by 0.2 after 3 epochs without val_loss improvement,
# never going below 1e-7.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-7,
)

In [None]:
# Final training run using the epochs / batch size selected by the grid search.
# Train on the training split and monitor the integer-encoded validation split.
# The previous version trained on the validation split and validated on that
# same data with the un-encoded labels, which makes val_loss meaningless for
# the callbacks and does not match the loss used at compile time.
ANN_GridS_history = ANN_GridS_model.fit(
    X_train_v3, y_train_v3_encoded,
    validation_data=(X_val_v3, y_val_v3_encoded),
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    callbacks=[early_stopping, reduce_lr],
)

In [None]:
from joblib import dump
# Persist the trained model.
dump(ANN_GridS_model, model_folder_path+'ANN_GridS_model.joblib')
# Persist the object returned by fit(). The reload cell binds this file to
# ANN_GridS_history and the plotting cells read its .history attribute, so the
# saved object must be that return value, not an attribute looked up on the model.
dump(ANN_GridS_history, model_folder_path+'ANN_GridS_model_history.joblib')

# Model Performance on training data

In [None]:
# Reload the saved artefacts so the evaluation below can run without retraining.
# joblib.load unpickles the files - only load artefacts written by this project.
ANN_GridS_model = joblib.load(f"{model_folder_path}ANN_GridS_model.joblib")
ANN_GridS_history = joblib.load(f"{model_folder_path}ANN_GridS_model_history.joblib")

In [None]:
# Model Summary
# Assign the model's (private) name attribute first - presumably so that the
# summary is printed under this name.
ANN_GridS_model._name = 'ANN_GridS_model'
ANN_GridS_model.summary()

In [None]:
# Learning curve of the final training run, drawn from the recorded per-epoch history
from ds.models.performance import plot_model_learningcurve
plot_model_learningcurve(ANN_GridS_history.history,'NeuralNet on train_v3 - Learning curve')

In [None]:
# Accuracy plot of the final training run, from the recorded history
from ds.models.performance import plot_model_accuracy, plot_model_loss

plot_model_accuracy(ANN_GridS_history.history,'NeuralNet on train_v3 - Accuracy')

In [None]:
# Loss plot of the final training run, from the same recorded history
plot_model_loss(ANN_GridS_history.history,'NeuralNet on train_v3 - Loss')

In [None]:
# Predict on the training split: take the highest-scoring class index per row
ANN_GridS_X_train_v3_predictions = ANN_GridS_model.predict(X_train_v3).argmax(axis=1)

In [None]:
# Print overall performance metrics comparing the encoded training labels with the training predictions
from ds.models.performance import print_overall_model_metric
print_overall_model_metric(y_train_v3_encoded,ANN_GridS_X_train_v3_predictions, 'ANN_GridS_model','Trainset')

In [None]:
# Classification report for the training split, written to the external data folder
from ds.models.performance import df_classifcation_report
df_classifcation_report(
    y_encoded=y_train_v3_encoded,
    y_prediction=ANN_GridS_X_train_v3_predictions,
    target_names=target_class,
    filename='df_clr_ANN_GridS_X_train_v3.csv',
    dest=external_folder_path,
)

# Model Performance on test data

In [None]:
# Model output for the test split, then the highest-scoring class index per row
y_test_v3_pred_probs = ANN_GridS_model.predict(X_test_v3)
y_test_v3_pred = np.argmax(y_test_v3_pred_probs, axis=1)
y_test_v3_pred

In [None]:
# Overall performance metrics on the test split
# (print_overall_model_metric is imported in the training-metrics cell above).
print_overall_model_metric(y_test_v3_encoded, y_test_v3_pred, 'ANN_GridS_model','Testset')

In [None]:
# Classification report for the test split, written to the external data folder
df_classifcation_report(
    y_encoded=y_test_v3_encoded,
    y_prediction=y_test_v3_pred,
    target_names=target_class,
    filename='df_clr_ANN_GridS_X_test_v3.csv',
    dest=external_folder_path,
)

In [None]:
# Reload the encoder that was fitted and saved at the start of the notebook
label_encoder = joblib.load(f"{model_folder_path}ANN_GridS_label_encoder.joblib")

# Map the predicted class indices back to their original labels
y_test_v3_pred_decoded = label_encoder.inverse_transform(y_test_v3_pred)
y_test_v3_pred_decoded

# Assemble back to df

In [None]:
# Load the un-encoded test DataFrame, attach the decoded predictions and export to CSV.
# NOTE(review): this assumes the pickled frame has the same row order as
# X_test_v3 - confirm against the cell that wrote the pickle.
df_X_test_v3_unencoded = pd.read_pickle(f"{interim_folder_path}df_X_test_v3_unencoded.pkl")
df_X_test_v3_unencoded['prediction'] = y_test_v3_pred_decoded
df_X_test_v3_unencoded.to_csv(f"{external_folder_path}df_X_test_v3_output.csv", index=True)