Notebook Structure for POC1
Cell 1: Import Libraries

In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Ensure NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/PremGanesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/PremGanesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Cell 2: Function Definitions

In [5]:
def clean_text(text):
    """Clean the input text, removing stopwords and non-alphanumeric characters."""
    if isinstance(text, float) and np.isnan(text):
        return ""
    words = word_tokenize(text.lower())
    words = [word for word in words if word.isalnum()]
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in words if word not in stop_words])


Cell 3: Load and Preprocess Data

In [6]:
train_file_path = '/Users/PremGanesh/Developer/AI/CyVidia/Input_Data/Training Dataset 2.xlsx'
train_df = pd.read_excel(train_file_path)
train_df['Cleaned_Description'] = train_df['Requirement Description'].apply(clean_text)


Cell 4: Tokenization and Padding

In [7]:
# Define optimal vocabulary size based on previous analysis
max_words = 10000  # Adjust this based on your analysis
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df['Cleaned_Description'])
X_train = tokenizer.texts_to_sequences(train_df['Cleaned_Description'])
X_train = pad_sequences(X_train, maxlen=100)  # Adjust maxlen based on your data


Cell 5: Label Encoding


In [8]:
area_encoder = LabelEncoder()
train_df['Requirement Area (NIST)'] = train_df['Requirement Area (NIST)'].str.lower()
y_area_train = area_encoder.fit_transform(train_df['Requirement Area (NIST)'])

bucket_encoder = LabelEncoder()
train_df['Requirement Bucket(NIST)'] = train_df['Requirement Bucket(NIST)'].str.lower()
y_bucket_train = bucket_encoder.fit_transform(train_df['Requirement Bucket(NIST)'])


Cell 7: Label Encoding

In [9]:
# Label encoding for 'Requirement Area' and 'Requirement Bucket'
area_encoder = LabelEncoder()
y_area_train = area_encoder.fit_transform(train_df['Requirement Area (NIST)'].str.lower())

bucket_encoder = LabelEncoder()
y_bucket_train = bucket_encoder.fit_transform(train_df['Requirement Bucket(NIST)'].str.lower())


Cell 9: Model Architecture


In [10]:
# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=128, input_length=100)) # Adjust as needed
model.add(Bidirectional(LSTM(128, return_sequences=True)))  # Add more LSTM layers as needed
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(area_encoder.classes_), activation='softmax', name='output_area'))
model.add(Dense(len(bucket_encoder.classes_), activation='softmax', name='output_bucket'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


Cell 10: Model Training

In [11]:
# Train the model on the entire training data
history = model.fit(X_train, {'output_area': y_area_train, 'output_bucket': y_bucket_train}, 
                    epochs=100, batch_size=32)

Epoch 1/100


ValueError: in user code:

    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/training.py", line 1377, in train_function  *
        return step_function(self, iterator)
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/training.py", line 1360, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/training.py", line 1349, in run_step  **
        outputs = model.train_step(data)
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/training.py", line 1127, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/training.py", line 1185, in compute_loss
        return self.compiled_loss(
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 248, in __call__
        y_true = self._conform_to_outputs(y_pred, y_true)
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 63, in _conform_to_outputs
        struct = map_to_output_names(outputs, self._output_names, struct)
    File "/Users/PremGanesh/Developer/AI/CyVidia/.myenvlocal/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 819, in map_to_output_names
        raise ValueError(

    ValueError: Found unexpected losses or metrics that do not correspond to any Model output: dict_keys(['output_area']). Valid mode output names: ['output_bucket']. Received struct is: {'output_area': <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=int64>}.


Cell 12: Save the Trained Model

In [None]:
# Check if the directory for the model exists, if not, create it
model_dir = 'models/'
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save the model
model.save(os.path.join(model_dir, 'trained_model_rbi_jll_nist.h5'))
print("Model saved successfully.")


Cell 13: Visualization of Training History


In [None]:
# Plot the training history for accuracy
plt.figure(figsize=(10, 5))
plt.plot(history.history['output_area_accuracy'])
plt.plot(history.history['output_bucket_accuracy'])
plt.title('Model Accuracy over Epochs')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Area', 'Bucket'], loc='upper left')
plt.show()

# Plot the training history for loss
plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'])
plt.title('Model Loss over Epochs')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training Loss'], loc='upper right')
plt.show()
