In [20]:
# Ravindra Bisram
# Deep Learning Homework 5 - AG News
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support as score
from tensorflow.python.keras.utils.np_utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from numpy import array, asarray, zeros
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, LSTM, Input, GlobalMaxPooling1D, Activation, Dropout, Dense, Embedding, TextVectorization
from tensorflow.keras import models, layers
from sklearn.model_selection import KFold, StratifiedKFold


# https://www.kaggle.com/code/keithcooper/multi-class-classification-with-transformer-models
# https://stackabuse.com/python-for-nlp-multi-label-text-classification-with-keras/

def import_data(csv_file):
    """
    Load a news CSV and reduce it to a label column plus one combined text column.

    in -> csv_file - string representing the location of the csv file; the
          file must provide 'Class Index', 'Title' and 'Description' columns
    out -> pandas dataframe in which 'Class Index' is renamed to 'label'
           (with class 4 remapped to 0), 'Title' and 'Description' are merged
           into 'text' (separated by one space) and then dropped
    """
    df = pd.read_csv(csv_file)
    df['text'] = df['Title'] + ' ' + df['Description']
    df = df.rename(columns={'Class Index': 'label'})
    # The model expects numerical categories starting from 0, not 1, so class 4
    # is remapped to 0. The result is assigned back instead of calling
    # replace(inplace=True) on the selected column: that form acts on a
    # temporary object under pandas Copy-on-Write and would leave df unchanged.
    df['label'] = df['label'].replace({4: 0})
    df = df.drop(columns=['Title', 'Description'])

    return df

def preprocess_text(sen):
    """
    Clean one sentence with three regex substitutions applied in order.

    in -> sen - string to clean
    out -> string in which every non-letter character became a space,
           single letters surrounded by whitespace were removed, and runs of
           whitespace were collapsed to one space
    """
    # (pattern, replacement) pairs; the order matters because each step works
    # on the output of the previous one.
    substitutions = (
        ('[^a-zA-Z]', ' '),           # punctuation and numbers -> space
        (r"\s+[a-zA-Z]\s+", ' '),     # isolated single characters
        (r'\s+', ' '),                # multiple spaces -> one space
    )

    cleaned = sen
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned


# Sequence length handed to pad_sequences for both splits.
MAX_LEN = 50

train_val_df = import_data("../data/train.csv")
test_df = import_data("../data/test.csv")
print("Data has been imported. Starting")

# Training labels stay as integer class ids here; main() one-hot encodes them
# separately for every fold.
y_train = train_val_df["label"].values

# Clean the raw training text before it reaches the tokenizer.
sentences = train_val_df["text"].tolist()
cleaned_train = [preprocess_text(sen) for sen in sentences]

# Tokenization: the vocabulary is fitted on the training text only and then
# reused unchanged for the test text.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(cleaned_train)
VOCAB_SIZE = len(tokenizer.word_index) + 1

x_train = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_train),
    padding='post',
    maxlen=MAX_LEN,
)
print(type(x_train))

# Same cleaning -> index sequences -> padding pipeline for the test split.
test_sentences = test_df["text"].tolist()
cleaned_test = [preprocess_text(sen) for sen in test_sentences]
x_test = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_test),
    padding='post',
    maxlen=MAX_LEN,
)

# Test labels are one-hot encoded once, up front.
y_test = tf.keras.utils.to_categorical(test_df["label"])


# Using the pretrained Glove word vectorization model
# https://stackoverflow.com/questions/50060241/how-to-use-glove-word-embeddings-file-on-google-colaboratory
# https://nlp.stanford.edu/projects/glove/

# Maps each token in the GloVe file to its float32 vector.
embeddings_dictionary = dict()

# Each line is parsed as "<token> <v1> <v2> ...". The context manager closes
# the file even if a line fails to parse.
with open('../data/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]
        vector_dimensions = asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the GloVe vector of the word whose tokenizer index is i; words
# that are absent from the GloVe file keep their all-zero row.
embedding_matrix = zeros((VOCAB_SIZE, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
        

def _build_model(num_classes=4):
    """Build and compile the LSTM classifier that is trained afresh on every fold."""
    deep_inputs = Input(shape=(MAX_LEN,))
    # The pretrained vectors are loaded as fixed weights (trainable=False).
    embedding_layer = Embedding(
        VOCAB_SIZE,
        embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )(deep_inputs)
    lstm_layer = LSTM(128)(embedding_layer)
    dense_layer_1 = Dense(6, activation='relu')(lstm_layer)
    dense_layer_2 = Dense(128, activation='relu')(dense_layer_1)
    output = Dense(num_classes, activation="softmax")(dense_layer_2)

    model = Model(inputs=deep_inputs, outputs=output)
    model.compile(
        loss=tf.keras.losses.categorical_crossentropy,
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model


def main(*, n_splits=5, epochs=50, batch_size=200):
    """
    Train one model per stratified fold of the training data and report accuracy.

    For every fold a new model is fitted on the remaining folds, with the
    held-out fold passed to fit() as validation data. The score that is
    printed and aggregated is the second value returned by
    model.evaluate(x_test, y_test), i.e. it is measured on the test set, not
    on the held-out fold. After the last fold the mean and standard deviation
    of those scores are printed.

    in -> n_splits - number of stratified folds
          epochs - training epochs per fold
          batch_size - mini-batch size passed to fit()
    out -> None (results are printed)
    """
    # Perform proper cross validation by splitting the training set into sections, where each iteration has a different section be the validation set.
    # https://setscholars.net/how-to-use-kfold-cross-validation-in-keras/
    # https://stackoverflow.com/questions/48508036/sklearn-stratifiedkfold-valueerror-supported-target-types-are-binary-mul
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

    # Matches the four-unit softmax output; passing it explicitly keeps the
    # one-hot width fixed even if a fold were to miss the highest class id.
    num_classes = 4

    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=7)
    cvscores = []

    # StratifiedKFold needs the integer class ids, so y_train is one-hot
    # encoded only after the split.
    for fold, (train_indices, val_indices) in enumerate(kfold.split(x_train, y_train), start=1):
        print(f"Training on fold {fold}/{n_splits}...")

        # Drop graph state left over from the previous fold's model so memory
        # does not grow with every fold.
        tf.keras.backend.clear_session()

        # Generate batches from indices
        xtrain, xval = x_train[train_indices], x_train[val_indices]
        ytrain = tf.keras.utils.to_categorical(y_train[train_indices], num_classes=num_classes)
        yval = tf.keras.utils.to_categorical(y_train[val_indices], num_classes=num_classes)

        model = _build_model(num_classes)

        model.fit(
            xtrain,
            ytrain,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(xval, yval),
            verbose=2,
        )

        # scores[0] is the loss, scores[1] the metric given to compile().
        scores = model.evaluate(x_test, y_test, verbose=0)
        print(f"{model.metrics_names[1]}: {scores[1] * 100:.2f}%")
        cvscores.append(scores[1] * 100)

    print(f"{np.mean(cvscores):.2f}% (+/- {np.std(cvscores):.2f}%)")

main()

Starting
<class 'numpy.ndarray'>
Training on fold 1/5...
[     1      2      3 ... 119996 119998 119999] [     0      4      7 ... 119988 119990 119997]
Epoch 1/50
480/480 - 5s - loss: 0.4777 - accuracy: 0.8418 - val_loss: 0.3398 - val_accuracy: 0.8858 - 5s/epoch - 10ms/step
Epoch 2/50
480/480 - 3s - loss: 0.3228 - accuracy: 0.8910 - val_loss: 0.3039 - val_accuracy: 0.8978 - 3s/epoch - 7ms/step
Epoch 3/50
480/480 - 3s - loss: 0.2937 - accuracy: 0.8999 - val_loss: 0.2865 - val_accuracy: 0.9013 - 3s/epoch - 6ms/step
Epoch 4/50
480/480 - 3s - loss: 0.2749 - accuracy: 0.9053 - val_loss: 0.2751 - val_accuracy: 0.9062 - 3s/epoch - 7ms/step
Epoch 5/50
480/480 - 3s - loss: 0.2580 - accuracy: 0.9097 - val_loss: 0.2755 - val_accuracy: 0.9045 - 3s/epoch - 7ms/step
Epoch 6/50
480/480 - 3s - loss: 0.2473 - accuracy: 0.9131 - val_loss: 0.2616 - val_accuracy: 0.9089 - 3s/epoch - 6ms/step
Epoch 7/50
480/480 - 3s - loss: 0.2354 - accuracy: 0.9177 - val_loss: 0.2520 - val_accuracy: 0.9142 - 3s/epoch - 6