In [None]:
import concurrent.futures
from collections import Counter
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk import sent_tokenize, word_tokenize
from itertools import chain
from nltk.util import ngrams
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [None]:
# TF-Hub handles of the two BERT components used to embed single words.
BERT_PREPROCESSOR_HANDLE = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
BERT_ENCODER_HANDLE = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-10-h-128-a-2/2"

# Converts raw strings into the inputs the encoder is called with (see get_bert_embedding).
preprocessor = hub.KerasLayer(BERT_PREPROCESSOR_HANDLE)
# The encoder is only used for inference here, so its weights are kept frozen.
encoder = hub.KerasLayer(BERT_ENCODER_HANDLE, trainable=False)

## Global variables

In [None]:
# --- Paths -------------------------------------------------------------------
TRAIN_DATA_PATH = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv"
OUTPUT_PATH = "/kaggle/working/"

# --- Model parameters ----------------------------------------------------------
# The model is kept simple: a small vocabulary, and essays cut to 400 tokens.
NUM_EPOCHS = 100            # upper bound on training epochs (early stopping may end sooner)
VOCABULARY_SIZE = 2000      # tokenizer `num_words` and embedding input size
EMBEDDING_DIM = 120         # width of the embedding layer
MAX_LENGTH = 400            # `maxlen` used when padding / truncating sequences
TRUNCATION_TYPE = "post"    # `truncating` argument of pad_sequences
PADDING_TYPE = "pre"        # `padding` argument of pad_sequences
OOV_TOKEN = "<OOV>"         # tokenizer placeholder for out-of-vocabulary words
TRAIN_SIZE = 0.75           # fraction of the samples used for training
PATIENCE = 2                # early-stopping patience, in epochs

In [None]:
# The seven essay topics, each described by its leading TF-IDF terms.
# Some entries are misspelled ('driveless', 'collage', 'ellection'); presumably these
# mirror spellings found in the essays -- TODO confirm before "correcting" them.
venus = ['venus']
mars = ['mars', 'face', 'alien', 'aliens', 'landform', 'martian', 'mesa']
cars = [
    'accidents', 'self', 'smart', 'cars', 'car', 'driverless', 'driveless', 'usage',
    'sensors', 'drive', 'driver', 'transportation', 'vehicle', 'vehicles', 'driving',
]
electoral = [
    'vote', 'electoral', 'president', 'electors', 'state', 'senator', 'collage', 'ellection',
]
technology = [
    'happy', 'huang', 'muscles', 'mona', 'lisa', 'computer', 'software', 'technology',
    'emotions', 'facial', 'emotion', 'student', 'students', 'kids', 'teachers', 'feeling',
    'teacher', 'feel', 'expressions',
]
# NOTE: 'seagoing' is listed twice; the list is reproduced unchanged.
luke = ['luke', 'cowboy', 'seagoing', 'seagoing', 'cowboys', 'program', 'animals']
pollution = ['air', 'pollution', 'smog', 'emissions', 'environment', 'city', 'paris', 'cities']

# Flat list of every topic term, in topic order.
topic_words = [*venus, *mars, *cars, *electoral, *technology, *luke, *pollution]
topic_words

## Loading data

In [None]:
# Read the training CSV into a DataFrame; the cells below read its
# 'full_text' (essay text) and 'score' (label) columns.
df_train = pd.read_csv(TRAIN_DATA_PATH)

## Finding all words similar to known topic words

In [None]:
# Build the candidate vocabulary: every token that occurs at least three times
# in the training essays, ordered from most to least frequent.
all_text = ' '.join(df_train['full_text'])
# Tokenize the text into words, convert to lowercase, and count occurrences
words = re.findall(r'\b\w+\b', all_text.lower())
word_counts = Counter(words)
# Filter words that occur at least three times (kept in first-seen order)
words_used_at_least_three_times = [word for word, count in word_counts.items() if count >= 3]
print(f'Words used at least three times: {len(words_used_at_least_three_times)}')
unique_words = set(words_used_at_least_three_times)
# Sort the first-seen-ordered list rather than the set: the Counter keys are
# already unique, and because sorted() is stable, words with equal counts keep
# their first-seen order. Sorting the set instead made the tie order (and thus
# the batch contents) depend on string-hash randomisation between kernel restarts.
vocab = sorted(words_used_at_least_three_times, key=lambda word: word_counts[word], reverse=True)

In [None]:
def get_bert_embedding(word, preprocessor, encoder):
    """Return the BERT pooled-output embedding of a single string.

    Parameters
    ----------
    word : str
        Text to embed; it is sent through the model as a batch of one.
    preprocessor : callable
        Maps a string tensor to the inputs expected by ``encoder``.
    encoder : callable
        Returns a mapping that contains a ``"pooled_output"`` tensor.

    Returns
    -------
    numpy.ndarray
        The pooled output converted with ``.numpy()``; for a batch of one this
        is a 2-D array with a single row.
    """
    text_input = tf.constant([word])
    encoder_inputs = preprocessor(text_input)
    outputs = encoder(encoder_inputs)
    pooled_output = outputs["pooled_output"]  # [batch_size, 128]
    return pooled_output.numpy()


# Get embeddings for topic words.
# This must come after the definition above: calling get_bert_embedding before
# it is defined raises NameError on a fresh kernel (Restart & Run All).
topic_embeddings = {word: get_bert_embedding(word, preprocessor, encoder) for word in topic_words}

# as the vocabulary is more than 20 K we need to calculate embeddings in batches

# Get embeddings for the vocabulary
def process_batch(words):
    """Embed every word of one vocabulary batch.

    Each word is embedded individually with ``get_bert_embedding`` using the
    notebook-level ``preprocessor`` and ``encoder``.

    Returns a dict mapping each word to its embedding array.
    """
    return {token: get_bert_embedding(token, preprocessor, encoder) for token in words}

# Split the vocabulary into batches
batch_size = 1000
ending=len(vocab)
#for testing
#ending=500
# Clip every slice to `ending`; otherwise a reduced `ending` (test mode) that is
# not a multiple of batch_size would still embed a full batch of words.
batches = [vocab[i:min(i + batch_size, ending)] for i in range(0, ending, batch_size)]

# Process batches in parallel and combine the embeddings from all batches.
# The results are consumed inside the `with` block so that each progress line is
# printed when its batch is actually available; leaving the block first would
# wait for all batches and only then print every line at once.
vocab_embeddings = {}
batch_count = 0
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(process_batch, batches)
    for result in results:
        batch_count += 1
        print(f"Processed batch {batch_count} out of {len(batches)}")
        vocab_embeddings.update(result)

def _unit_rows(embeddings):
    """Stack embeddings into a 2-D matrix with one L2-normalised row per embedding."""
    # Only the first row of each embedding is used, as in a per-pair `[0][0]` lookup.
    rows = [np.atleast_2d(np.asarray(emb, dtype=np.float64))[0] for emb in embeddings]
    matrix = np.stack(rows)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0.0] = 1.0  # all-zero rows keep similarity 0 instead of dividing by zero
    return matrix / norms


# Function to find similar words based on cosine similarity
def get_similar_words(topic_embeddings, vocab_embeddings, threshold=0.8):
    """Return the vocabulary words that are close to at least one topic word.

    A vocabulary word is selected when its cosine similarity to some topic word
    *other than itself* is strictly greater than ``threshold``.

    All similarities are computed with one normalised matrix product instead of
    one library call per (topic word, vocabulary word) pair; results agree with
    the pairwise computation up to floating-point rounding. The intermediate
    matrix has ``len(topic_embeddings) x len(vocab_embeddings)`` entries.

    Parameters
    ----------
    topic_embeddings : dict
        Maps topic word -> embedding (a 1-D vector or a 2-D array with one row).
    vocab_embeddings : dict
        Maps vocabulary word -> embedding of the same dimensionality.
    threshold : float, default 0.8
        Exclusive lower bound on the cosine similarity.

    Returns
    -------
    set
        The selected vocabulary words; empty if either mapping is empty.
    """
    if not topic_embeddings or not vocab_embeddings:
        return set()

    topic_names = list(topic_embeddings)
    vocab_names = list(vocab_embeddings)
    similarities = _unit_rows(topic_embeddings.values()) @ _unit_rows(vocab_embeddings.values()).T

    # A word is never compared with itself: knock out the (word, same word) cells.
    vocab_column = {name: col for col, name in enumerate(vocab_names)}
    for row, name in enumerate(topic_names):
        col = vocab_column.get(name)
        if col is not None:
            similarities[row, col] = -np.inf

    selected = (similarities > threshold).any(axis=0)
    return {name for name, keep in zip(vocab_names, selected) if keep}

# Find similar words (for testing different thresholds)
#for tr in np.arange(0.98,0.99,0.002):
#    related_words = get_similar_words(topic_embeddings, vocab_embeddings, threshold=tr)
#    print(f'{tr} Related Words: {len(related_words)}')
#    print (related_words)

In [None]:
# Vocabulary words whose embedding is close to at least one topic word.
# NOTE(review): 0.984 presumably comes from the threshold sweep left commented
# out in the cell above -- confirm before changing it.
related_words = get_similar_words(topic_embeddings, vocab_embeddings, threshold=0.984)
# Final exclusion list: the hand-picked topic words plus everything found similar to them
all_exclude_words = set(topic_words) | related_words

In [None]:
def exclude_words(text, words):
    """Replace every whole-word occurrence of ``words`` in ``text`` with "topic".

    Matching is case-insensitive and anchored on word boundaries, so "car"
    matches "Car" but not "carpet".

    Parameters
    ----------
    text : str
        The text to rewrite.
    words : iterable of str
        Words to replace. They are matched literally: regex metacharacters are
        escaped, and empty strings are ignored.

    Returns
    -------
    str
        The rewritten text; ``text`` unchanged when there is nothing to match.
    """
    # Escape each word so that characters such as "." or "+" cannot change the
    # pattern, and drop empty entries, which would match at every word boundary.
    alternatives = '|'.join(re.escape(word) for word in words if word)
    if not alternatives:
        # An empty alternation would insert "topic" at every word boundary.
        return text
    pattern = re.compile(r'\b(?:' + alternatives + r')\b', re.IGNORECASE)
    return pattern.sub("topic", text)

In [None]:
# Essay text with every topic-related word replaced by the placeholder "topic"
df_train['txt_wth_topic'] = df_train['full_text'].map(
    lambda essay: exclude_words(essay, all_exclude_words)
)

In [None]:
# shuffle the data (seeded, so a fresh Restart & Run All reproduces the same order)
df = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
# Select the columns with the features
df_data_x = df[['txt_wth_topic']]
# Select the column with the label:
df_data_y = df['score']

# Split the samples:
x_train, x_test, y_train, y_test = train_test_split(df_data_x,
                                                    df_data_y,
                                                    train_size=TRAIN_SIZE,
                                                    random_state=42)

# create tokenizer (fitted on the training split only)
tokenizer = Tokenizer(num_words=VOCABULARY_SIZE, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(x_train['txt_wth_topic'])
train_sequences = tokenizer.texts_to_sequences(x_train['txt_wth_topic'])

# padding / truncating
train_padded = pad_sequences(train_sequences,
                             maxlen=MAX_LENGTH,
                             padding=PADDING_TYPE,
                             truncating=TRUNCATION_TYPE)

validation_sequences = tokenizer.texts_to_sequences(x_test['txt_wth_topic'])
validation_padded = pad_sequences(validation_sequences,
                                  maxlen=MAX_LENGTH,
                                  padding=PADDING_TYPE,
                                  truncating=TRUNCATION_TYPE)

# encoding
# NOTE(review): this rebinds `encoder`, which until here referred to the BERT
# hub layer used by the embedding cells; re-running those cells after this one
# requires re-running the cell that loads the hub layers first.
encoder = LabelEncoder()
encoder.fit(y_train)
num_classes = len(np.unique(y_train))
# Pass num_classes explicitly so that the train and test one-hot matrices always
# have the same width, even if the test split happens to lack the highest class.
encoded_y_train = to_categorical(encoder.transform(y_train), num_classes=num_classes)
encoded_y_test = to_categorical(encoder.transform(y_test), num_classes=num_classes)

In [None]:
# Embedding -> two stacked LSTMs -> dense head with one softmax unit per score class
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCABULARY_SIZE, EMBEDDING_DIM),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.LSTM(128, return_sequences=True),  # full sequence is fed to the next LSTM
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(256, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation="softmax")
])

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Build the model by providing the input shape
model.build(input_shape=(None, MAX_LENGTH))

# summary and callback
model.summary()
# restore_best_weights=True: without it the model keeps the weights of the last
# epoch, i.e. PATIENCE epochs after the best validation loss was observed.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                            patience=PATIENCE,
                                            restore_best_weights=True)

# training
# NOTE(review): the held-out split is used both for early stopping and for the
# predictions below, so metrics computed on it are optimistic.
history = model.fit(train_padded,
                    encoded_y_train,
                    epochs=NUM_EPOCHS,
                    validation_data=(validation_padded, encoded_y_test),
                    callbacks=[callback]
                    )

# Class probabilities for every sample of the held-out split
predictotron = model.predict(validation_padded)