# <center><u>AdvML Mock Belt Exam
- Authored by: Eric N. Valdez
- Date: 04/10/2024

# `Part 1: NLP`
## Imports:

In [None]:
# Imports

import pandas as pd
import spacy
import matplotlib.pyplot as plt
import nltk

import tensorflow as tf
import numpy as np
import seaborn as sns

## Import Modeling Package
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from tensorflow.keras.models import Sequential



from sklearn.naive_bayes import MultinomialNB
# Set Random Seeds
# NOTE: per the Keras docs, tf.keras.utils.set_random_seed seeds Python's `random`,
# NumPy and TensorFlow in one call; the two calls below re-apply the same seed (42)
# explicitly to TensorFlow and NumPy.
tf.keras.utils.set_random_seed(42)
tf.random.set_seed(42)
np.random.seed(42)

In [None]:
# Widen text columns so long paragraphs stay readable when a frame is displayed
pd.options.display.max_colwidth = 250

## Preprocessing:

### <u>Load functions

In [None]:
# Enable IPython's autoreload extension (mode 2) so edits to imported modules,
# such as exam_functions.py, are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local helpers; later cells call fn.batch_preprocess_texts,
# fn.get_ngram_measures_finder and the fn.evaluate_classification* functions.
import exam_functions as fn

In [None]:
# Load the exam dataset; later cells read its 'paragraph' (text) and 'source' (author) columns
df = pd.read_csv('data/part1-aml-belt-exam.csv')
df.head()

In [None]:
# Loading NLP Lite: the small English spaCy pipeline with the parser and NER components disabled
nlp_lite = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
nlp_lite

## <u>Preprocess with spacy

In [None]:
# Tokens
%%time

df['tokens']= fn.batch_preprocess_texts(df['paragraph'],nlp=nlp_lite, remove_stopwords \
                                        = True, remove_punct =True, use_lemmas = False,)

In [None]:
# Lemamatized
%%time
df['lemmas']= fn.batch_preprocess_texts(df['paragraph'],nlp=nlp_lite, remove_stopwords \
                                        = True, remove_punct =True, use_lemmas = True,)

In [None]:
df.head()

In [None]:
# Joined tokens
df['joined_tokens'] = df['tokens'].map(lambda x: " ".join(x))

In [None]:
df['joined_tokens']

In [None]:
# Joined lemmas
df['joined_lemmas'] = df['lemmas'].map(lambda x: " ".join(x))

In [None]:
df['joined_lemmas']


## Perform group comparison EDA:

In [None]:
# Seperate 2 groups based on source
df['source'].value_counts()

In [None]:
grp_ConanDoyle = df.loc[df['source'] == 'Conan Doyle']

In [None]:
grp_ConanDoyle

In [None]:
grp_Christie = df.loc[df['source'] == 'Christie']

## <u>Word Clouds

In [None]:
# Concatenate every Conan Doyle paragraph into one string and preview the first 500 characters

grp_ConanDoyle_text = " ".join(grp_ConanDoyle['paragraph'])
print(grp_ConanDoyle_text[:500],"\n")

In [None]:
# Concatenate every Christie paragraph into one string and preview the first 500 characters

grp_Christie_text = " ".join(grp_Christie['paragraph'])
print(grp_Christie_text[:500],"\n")

In [None]:
# Build one word cloud per author from the raw paragraph text and show them side by side

from wordcloud import WordCloud, STOPWORDS
ConanDoyle_cloud = WordCloud(min_word_length=2).generate(grp_ConanDoyle_text)
Christie_cloud = WordCloud(min_word_length=2).generate(grp_Christie_text)

## Plot the Images: one panel per author, axes hidden because the image is the content
fig, axes = plt.subplots(ncols=2, figsize=(12, 6))
cloud_panels = [
    (ConanDoyle_cloud, 'ConanDoyle words'),
    (Christie_cloud, 'Christie Words'),
]
for panel_ax, (cloud_img, panel_title) in zip(axes, cloud_panels):
    panel_ax.imshow(cloud_img)
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')
fig.suptitle('Word Cloud - Raw text');

## <u> Word Clouds using Lemmatized Text

In [None]:
# Concatenate the lemmatized Conan Doyle paragraphs and preview the first 500 characters.
# NOTE: this reuses (overwrites) the grp_ConanDoyle_text / grp_Christie_text names
# that held the raw paragraph text in the previous section.

grp_ConanDoyle_text = " ".join(grp_ConanDoyle['joined_lemmas'])
print(grp_ConanDoyle_text[:500],"\n")

# Concatenate the lemmatized Christie paragraphs and preview the first 500 characters

grp_Christie_text = " ".join(grp_Christie['joined_lemmas'])
print(grp_Christie_text[:500],"\n")

In [None]:
# Build one word cloud per author from the lemmatized text and show them side by side

from wordcloud import WordCloud, STOPWORDS
ConanDoyle_cloud = WordCloud(min_word_length=2).generate(grp_ConanDoyle_text)
Christie_cloud = WordCloud(min_word_length=2).generate(grp_Christie_text)

## Plot the Images: one panel per author, axes hidden because the image is the content
fig, axes = plt.subplots(ncols=2, figsize=(12, 6))
cloud_panels = [
    (ConanDoyle_cloud, 'ConanDoyle words'),
    (Christie_cloud, 'Christie Words'),
]
for panel_ax, (cloud_img, panel_title) in zip(axes, cloud_panels):
    panel_ax.imshow(cloud_img)
    panel_ax.set_title(panel_title)
    panel_ax.axis('off')
fig.suptitle('Word Cloud - Lemmas');

## <u> Top 20 most frequent bigrams:

In [None]:
grp_ConanDoyle_tokens = grp_ConanDoyle['tokens'].explode().astype(str).to_list()

In [None]:
grp_Christie_tokens = grp_Christie['tokens'].explode().astype(str).to_list()

In [None]:
grp_ConanDoyle_tokens

In [None]:
grp_Christie_tokens

In [None]:
# using custom function get_ngram_measures_finder to get the bigrams
# (top_n=20 asks the helper for 20 results per author; how they are ranked is
# defined in exam_functions — presumably by frequency, TODO confirm)

bigram_grp_ConanDoyle = fn.get_ngram_measures_finder(grp_ConanDoyle_tokens,top_n=20)

bigram_grp_Christie = fn.get_ngram_measures_finder(grp_Christie_tokens,top_n=20)

In [None]:
# Display both results together (rendered as a tuple)
bigram_grp_ConanDoyle,bigram_grp_Christie

## <u>Text Classification - Machine Learning
- ### Build, fit, and evaluate a binary MultinomialNB classifier to predict the author (source) from the original raw paragraphs text.
    -  Do not remove stopwords
    -  No need to balance the dataset
    -  Use a pipeline to include the count vectorizer and classification model
    -  You do not need to tune the model
    -  Save your model pipeline to a Models folder in your repo as a joblib file.

In [None]:
# Check column dtypes and non-null counts before defining the modeling inputs
df.info()

`Defining X and y`

In [None]:
# define X and y: X is the raw paragraph text, y is the author label ('source')

X = df['paragraph']
y = df['source']

In [None]:
# Value Counts for y (shows the class balance between the authors)

y.value_counts()

`Train Test Split`

In [None]:
# Train Test Split (no test_size given, so scikit-learn's default split is used;
# random_state=42 makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

`Modeling using MultinomialNB Classifier`

In [None]:
# Modeling using raw text (paragraph) , creating pipeline with count vectorizer and classification model

## Create a model pipeline for inference.
nb_clf = MultinomialNB()

nb_pipe = Pipeline([('countvectorizer', CountVectorizer()), 
                       ('nb_classifier', nb_clf)])

In [None]:
nb_pipe.fit(X_train, y_train)

In [None]:
# Using custom classification evaluation function to evaluate
fn.evaluate_classification(nb_pipe, X_train, y_train, X_test, y_test)

## `Saving model pipeline to a Models folder in repo as a joblib file.`

In [None]:
import os

# Create the Models folder if it does not exist yet (no error when it already does)
os.makedirs('Models/' ,exist_ok =True)

In [None]:
import joblib
# Persist the fitted vectorizer + classifier pipeline as a single joblib file
fpath_model = "Models/nb_classifierexam.joblib"
joblib.dump(nb_pipe, fpath_model)

# <u>Text Classification - Deep NLP

### `Prepare tensorflow datasets for a train/val/test split`
- No need to balance classes
- `Note:` Make sure your target is encoded numerically (integers).
- Use a batch size of 32.
- Select a sequence length appropriate for the dataset's text.
    - Add a column to the dataframe that has the length of each paragraph
    - Use the maximum length as the sequence length
        - `Note:` if your computer is having memory/PC issues when fitting the model, reduce the sequence length and add a comment listing the maximum length and why you had to reduce it)

In [None]:
# Use LabelEncoder to turn the author names in y into integer class codes

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
y_encoded

In [None]:
# Use a batch size of 32
BATCH_SIZE =32

In [None]:
df['paragraph']

In [None]:
# Looking at the length of the each text, and spliting on each space, and then get the length
df['sequence_length'] =df['paragraph'].map( lambda x: len(x.split(" ")))
df.head()

In [None]:
df['sequence_length'].describe()

- ### `The range of sequence length is from 4 to 33`

- ### `Let's use 40 as the sequence length, slightly above the observed maximum`

In [None]:
SEQUENCE_LENGTH = 40

In [None]:
SEQUENCE_LENGTH = 40
# Make histogram of sequence lengths
ax = df['sequence_length'].hist(bins = 'auto')
ax.set_xlabel('Paragraph')
ax.set_ylabel('count')
ax.set_title('Distribution of Sequence Lengths');

## `Build, fit, and evaluate a binary classification sequence model`

In [None]:
# Preparing the dataset for modeling: preview the raw text once, as a short sample
X.head()

In [None]:
# Distinct encoded class labels; len(classes) sizes the model's output layer later on
classes = np.unique(y_encoded)
classes

In [None]:
# Converting to a dataset object using Dataset.from_tensor_slices()
ds = tf.data.Dataset.from_tensor_slices((X, y_encoded))

# Shuffle dataset once over the full dataset (buffer = dataset size).
# reshuffle_each_iteration=False keeps that order fixed across iterations, which the
# take/skip split below relies on so train/val/test stay disjoint.
ds = ds.shuffle(buffer_size=len(ds),reshuffle_each_iteration=False) 

In [None]:
# Train / validation / test split with a .7 / .2 / .1 ratio (take/skip approach)
import math

# Fraction of the data assigned to each split
split_train = .7
split_val =  .2
split_test =  1 -( split_train + split_val )

# Whole-sample counts; the test split receives whatever is left after truncation
n_total_samples = len(ds)
n_train_samples = int(n_total_samples * split_train)
n_val_samples = int(n_total_samples * split_val)
n_test_samples = n_total_samples - n_train_samples - n_val_samples

# Batches per split: rounded up because a final partial batch still counts as a batch
n_train_batches = math.ceil(n_train_samples / BATCH_SIZE)
n_val_batches = math.ceil(n_val_samples / BATCH_SIZE)
n_test_batches = math.ceil(n_test_samples / BATCH_SIZE)

print(f"    - train:\t{n_train_samples} samples \t({n_train_batches} batches)")
print(f"    - val:  \t{n_val_samples} samples \t({n_val_batches} batches)")
print(f"    - test: \t{n_test_samples} samples \t({n_test_batches} batches)")

In [None]:
# Use take and skip to define each set; ds is still unbatched here, so the counts are samples
train_ds = ds.take(n_train_samples).batch(batch_size=BATCH_SIZE)

# Skip over the training samples and take the validation samples
val_ds = ds.skip(n_train_samples).take(n_val_samples).batch(batch_size=BATCH_SIZE)

# Skip over the train and validation samples; the remaining samples are the test set
test_ds = ds.skip(n_train_samples + n_val_samples).batch(batch_size=BATCH_SIZE)

`Include a Keras TextVectorization as a layer in your model.`

In [None]:
SEQUENCE_LENGTH = 40
# Create text Vectorization layer
text_vectorizer = tf.keras.layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    output_mode="int",
    output_sequence_length=SEQUENCE_LENGTH
)

In [None]:
# Get just the text from ds_train
ds_texts = train_ds.map(lambda x, y_encoded: x)

# Preview the text
ds_texts.take(1).get_single_element()

In [None]:
# Train (adapt on training text data))
text_vectorizer.adapt(ds_texts)
text_vectorizer.vocabulary_size()

In [None]:
# Programmatically define size of vocab from vectorization layer
VOCAB_SIZE = text_vectorizer.vocabulary_size()
VOCAB_SIZE

`Use 100 as the embedding dimension/size`

In [None]:
# Size of each word-embedding vector (the exam asks for 100)
EMBED_DIM = 100

In [None]:
# Recap of the hyperparameters the model-building function reads
print(f'VOCAB_SIZE is {VOCAB_SIZE}')
print(f'SEQUENCE_LENGTH is {SEQUENCE_LENGTH}')
print(f'EMBED_DIM is {EMBED_DIM}')

`Use a bidirectional GRU`

In [None]:
# using custom functions

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, optimizers, regularizers
def build_gru_model(text_vectorization_layer):
    """Build and compile a bidirectional-GRU text classifier.

    Architecture: text vectorization -> embedding -> bidirectional GRU
    (returning the full sequence) -> global max pooling -> softmax output.
    Reads the notebook-level VOCAB_SIZE, EMBED_DIM, SEQUENCE_LENGTH and
    `classes` defined in earlier cells.

    Args:
        text_vectorization_layer: layer placed first in the model; the notebook
            passes its adapted Keras TextVectorization layer.

    Returns:
        The compiled Sequential model. The model summary is printed as a
        side effect.
    """
    gru_model = Sequential([
        text_vectorization_layer,
        layers.Embedding(input_dim=VOCAB_SIZE,
                         output_dim=EMBED_DIM,
                         input_length=SEQUENCE_LENGTH)])
    # The exam asks for a *bidirectional* GRU: wrap the GRU so the sequence is read
    # in both directions. return_sequences=True keeps every timestep's output so
    # the pooling layer below can reduce over the sequence.
    gru_model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    gru_model.add(layers.GlobalMaxPooling1D())
    # Output layer: one softmax unit per class
    gru_model.add(layers.Dense(len(classes),
                               activation='softmax'))

    # Integer-encoded targets -> sparse categorical cross-entropy
    optimizer = optimizers.legacy.Adam()
    gru_model.compile(optimizer=optimizer,
                      loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    gru_model.summary()
    return gru_model


# Include callbacks
def get_callbacks(patience=3, monitor='val_accuracy'):
    """Return the list of training callbacks (a single EarlyStopping).

    Args:
        patience: value passed to EarlyStopping as `patience`.
        monitor: name of the metric EarlyStopping watches.

    Returns:
        A one-element list containing the EarlyStopping callback.
    """
    return [
        tf.keras.callbacks.EarlyStopping(monitor=monitor, patience=patience),
    ]

In [None]:
# Build the gru model and specify the vectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, optimizers, regularizers

gru_model = build_gru_model(text_vectorizer)

# Define the maximum number of epochs (the early-stopping callback may end training sooner)
EPOCHS = 30

# Fit the model on the training batches, validating on val_ds after each epoch
history = gru_model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=get_callbacks(),
)

# Obtain the results on the training and test datasets with the custom helper
results = fn.evaluate_classification_network(
    gru_model, X_train=train_ds, 
    X_test=test_ds, history=history
);

### `Save your model to a Models folder in your repo in the SavedModel format (save_format='tf').`

In [None]:
# NOTE: this reuses the fpath_model name that earlier held the joblib pipeline path
fpath_model = "Models/gru"
# Save in the TensorFlow SavedModel format, as the exam instructions require
gru_model.save(fpath_model, save_format='tf')

# <u>Part 2 Deployment:
- For this part of the exam, you will create a Streamlit app that will allow users to predict the price of a home by inputting certain information about it. It will include inputs for features of the home and produce a predicted price.
- In a new notebook, load in the filepaths.json file from the config folder.