# Introduction
Hi visitor,
this is my first NLP project and my first competition on Kaggle. I am familiar with the theoretical basics of NLP but have never done a project on this topic, especially not with pretrained models. So this is it. 

In this project I tried two approaches to using pre-trained models. In the first one I load the pre-trained embeddings manually into the embedding layer and use that layer as a part of my own model (Glove). The other one is based on the Huggingface🤗 framework, where I use the from_pretrained() function, which loads the whole model (with all layers).

INFO: Due to the Kaggle dataset situation, I could not find the Deberta base model for Tensorflow there and therefore needed to choose the Roberta model. So you may still find some variable / model / checkpoint names that are named after Deberta and not Roberta. I will try to fix these one after the other.
And due to the current state of the competition I focused on the Deberta/Roberta approach only, which made me comment out the Glove model section. I will reactivate it in the final version of this notebook.

I therefore ask you to bear with me! 🤗

# Imports and Datasets

In [None]:
import sys
assert sys.version_info >= (3,5)
import os
import pathlib

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder

import nltk
from string import punctuation
from collections import Counter

from scipy.spatial.distance import cosine

import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import layers
from keras.layers import Embedding, LSTM, Dense, Dropout, CuDNNLSTM, Bidirectional
from keras.layers.merge import concatenate
from transformers import BertTokenizer, TFDebertaModel
from transformers import RobertaTokenizer, TFRobertaModel, TFRobertaForSequenceClassification

#import mlflow
#from mlflow import log_metric, log_param, log_artifacts
#import mlflow.tensorflow
#from mlflow import pyfunc

assert tf.__version__ >= "2.0"

print(f"Tensorflow Version: {tf.__version__}")
print(f"Keras Version: {keras.__version__}")

# Warn when TensorFlow sees no GPU (with a platform-specific hint); otherwise
# open a TF1-compat session configured to log device placement.
if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")
else:
    print(f'---Tensorflow is running with GPU Power now---')
    # NOTE(review): `sess` is not referenced again in the visible part of this file;
    # it appears to exist only for its device-placement logging.
    sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))
    


# One shared seed for the TensorFlow and NumPy random number generators.
random_state=42
tf.random.set_seed(random_state)
np.random.seed(random_state)

# A non-empty KAGGLE_KERNEL_RUN_TYPE environment variable is treated as
# "running on Kaggle"; the empty default selects the local layout.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE','')
#kaggle = 0 # Kaggle path active = 1

MAIN_PATH = os.getcwd()

# change your local path here
if iskaggle:
    DATA_PATH = os.path.join(MAIN_PATH, '../input')
    PHRASES_PATH = os.path.join(DATA_PATH, 'us-patent-phrase-to-phrase-matching')
else:
    DATA_PATH = os.path.join(MAIN_PATH, 'data')
    # Pass the components separately instead of hard-coding a Windows '\\'
    # separator, so the local path also resolves on Linux/macOS.
    PHRASES_PATH = os.path.join(DATA_PATH, 'input', 'us-patent-phrase-to-phrase-matching')


# Print every file found below the competition folder as a quick sanity check.
for dirname, _, filenames in os.walk(PHRASES_PATH):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        

# Get the Data

In [None]:
# Data path and file
# CSV file names plus the sub-folder names that are joined below DATA_PATH
# further down in this notebook.
CSV_FILE_TRAIN='train.csv'
CSV_FILE_TEST='test.csv'
CSV_FILE_COMF='sample_submission.csv'
CSV_FILE_CPC='titles.csv'
CPC_PATH='cpc-codes'
DEBERTA_PATH='huggingface-deberta-variants'
ROBERTA_PATH='roberta-base'

def load_csv_data(path, csv_file):
    """Read ``csv_file`` located in directory ``path`` into a pandas DataFrame.

    Parameters
    ----------
    path: str
        Directory that contains the file.
    csv_file: str
        File name of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using the ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(path, csv_file))

def load_csv_data_manuel(path, csv_file):
    """Read a file line by line without any CSV parsing.

    Parameters
    ----------
    path: str
        Directory that contains the file.
    csv_file: str
        File name of the file to read.

    Returns
    -------
    list of str
        All lines of the file, each still carrying its line ending.
    """
    csv_path = os.path.join(path, csv_file)
    # The context manager closes the handle even when reading fails.
    with open(csv_path, 'r') as handle:
        return handle.readlines()
    

# Load the competition files from PHRASES_PATH and the CPC title lookup table
# from the separate CPC folder below DATA_PATH.
train = load_csv_data(PHRASES_PATH,CSV_FILE_TRAIN)
test = load_csv_data(PHRASES_PATH,CSV_FILE_TEST)
competition_file = load_csv_data(PHRASES_PATH,CSV_FILE_COMF)
cpc_code = load_csv_data(os.path.join(DATA_PATH, CPC_PATH), CSV_FILE_CPC)


# Report the row count of every loaded table.
print(f'Length of loaded trainset: {len(train)}')
print(f'Length of loaded testset: {len(test)}')
print(f'Length of loaded competition file: {len(competition_file)}')
print(f'Length of loaded cpc_codeset: {len(cpc_code)}')

In [None]:
# Attach the CPC lookup columns (among them `title`, used below) to both
# frames by matching `context` against the CPC `code` column.
train = train.join(cpc_code.set_index('code'), on = 'context')
test = test.join(cpc_code.set_index('code'), on = 'context')

## Loading Model Files

In [None]:
# change your local path here
if iskaggle:
    path_to_glove_file = os.path.join(DATA_PATH, 'glove6b/glove.6B.300d.txt') # kaggle datasource location
else:
    # Separate components instead of a hard-coded Windows '\\' keep the local
    # path valid on every operating system.
    path_to_glove_file = os.path.join(DATA_PATH, 'glove.6B', 'glove.6B.300d.txt')

In [None]:
# Deberta source: the local Kaggle dataset folder on Kaggle, otherwise the
# Huggingface hub identifier.
if iskaggle:
    DEBERTA_BASE = os.path.join(DATA_PATH, DEBERTA_PATH + '/deberta-base/deberta-base') # kaggle datasource location
else:
    DEBERTA_BASE = 'microsoft/deberta-base'

In [None]:
# Roberta source: the local Kaggle dataset folder on Kaggle, otherwise the
# Huggingface hub identifier.
if iskaggle:
    ROBERTA_BASE = os.path.join(DATA_PATH, ROBERTA_PATH) # kaggle datasource location
else:
    ROBERTA_BASE = 'roberta-base'

# Data Understanding

## Given Attributes
- id - a unique identifier for a pair of phrases
- anchor - the first phrase
- target - the second phrase
- context - the CPC classification (version 2021.05), which indicates the subject within which the similarity is to be scored
- score - the similarity. This is sourced from a combination of one or more manual expert ratings.


## Score
The scores are in the 0-1 range with increments of 0.25 with the following meanings:

- 1.0 - Very close match. This is typically an exact match except possibly for differences in conjugation, quantity (e.g. singular vs. plural), and addition or removal of stopwords (e.g. “the”, “and”, “or”).
- 0.75 - Close synonym, e.g. “mobile phone” vs. “cellphone”. This also includes abbreviations, e.g. "TCP" -> "transmission control protocol".
- 0.5 - Synonyms which don’t have the same meaning (same function, same properties). This includes broad-narrow (hyponym) and narrow-broad (hypernym) matches.
- 0.25 - Somewhat related, e.g. the two phrases are in the same high level domain but are not synonyms. This also includes antonyms.
- 0.0 - Unrelated.

In [None]:
# Frequency of every distinct anchor phrase (missing values counted as well).
train['anchor'].value_counts(dropna=False)

The anchor column has 733 different values. Let's look at the target column.

In [None]:
# Frequency of every distinct target phrase (missing values counted as well).
train['target'].value_counts(dropna=False)

The target looks a little bit different. Here we have 29,340 different values.

In [None]:
# How often each score value occurs.
train['score'].value_counts(dropna=False)

In [None]:
# Same counts as a bar chart, ordered by score value.
train['score'].value_counts(dropna=False).sort_index().plot.bar()

In [None]:
# Non-null row counts of every column per (anchor, context) combination.
train.groupby(['anchor', 'context']).count()

# Data Preparation

#### Special Tokens
Defining the context as special token for the Tokenizer

In [None]:
# Wrap every CPC context code in square brackets, e.g. 'A47' -> '[A47]'.
train['context_token'] = '[' + train['context'] + ']'
test['context_token'] = '[' + test['context'] + ']'
# Distinct bracketed codes; collected from the training set only.
context_list = list(train['context_token'].unique())

In [None]:
# Preparing cpc text: replace the ';' separators inside the CPC titles with
# spaces. The same cleanup is applied to train and test so the model sees
# identically formatted text at training and prediction time; the vectorised
# string method also leaves missing titles as NaN instead of raising.
train['title'] = train['title'].str.replace(';', ' ', regex=False)
test['title'] = test['title'].str.replace(';', ' ', regex=False)

In [None]:
# Build the three text variants fed to the models:
#   corpus                - anchor + target
#   corpus_w_context      - bracketed context code + corpus
#   corpus_w_full_context - bracketed context code + corpus + CPC title
train['corpus'] = train['anchor'] + ' ' + train['target']
train['corpus_w_context'] = train['context_token'] + ' ' + train['corpus']
train['corpus_w_full_context'] = train['context_token'] + ' ' + train['corpus'] + ' ' + train['title']

test['corpus'] = test['anchor'] + ' ' + test['target']
test['corpus_w_context'] = test['context_token'] + ' ' + test['corpus']
# Use the test rows' own context token; taking it from `train` would pair each
# test row with the context of whichever training row shares its index.
test['corpus_w_full_context'] = test['context_token'] + ' ' + test['corpus'] + ' ' + test['title']

#### Identifying the features and the target.

In [None]:
# Independent copies of the label columns (y) and the feature columns (X);
# `id` is kept in both.
y = train[['id','score']].copy()
X = train[['id','anchor','target','context', 'corpus', 'title', 'corpus_w_context', 'corpus_w_full_context']].copy()

## Training - Validation Split

In [None]:
# 70/30 split stratified on the score; the shared seed makes the split (and
# every result that depends on it) reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, stratify=y['score'], random_state=random_state)

In [None]:
# Pull the individual text columns out of the split frames and print their
# sizes as a sanity check.
training_target = X_train['target']
print(f'Length of training_target - list: {len(training_target)}')

training_content = X_train['corpus']
print(f'Length of training_content - list: {len(training_content)}')

training_content_w_context = X_train['corpus_w_context']
print(f'Length of training_content_w_context - list: {len(training_content_w_context)}')

training_content_full = X_train['corpus_w_full_context']
print(f'Length of training_content_full - list: {len(training_content_full)}')


# The same text variants for the validation split.
validating_content = X_val['corpus']
print(f'Length of validating_content - list: {len(validating_content)}')

validating_content_w_context = X_val['corpus_w_context']
print(f'Length of validating_content_w_context - list: {len(validating_content_w_context)}')

validating_content_full = X_val['corpus_w_full_context']
print(f'Length of validating_content_full - list: {len(validating_content_full)}')


# Test-set texts.
test_content = test['corpus']
print(f'Length of test_content - list: {len(test_content)}')

test_content_full = test['corpus_w_full_context']
print(f'Length of test_content_full - list: {len(test_content_full)}')

# Raw score labels of both splits ...
training_labels = y_train['score']
validating_labels = y_val['score']

# ... converted to NumPy arrays.
training_labels = np.asarray(training_labels)
validating_labels = np.asarray(validating_labels)

#### Label Encoding

In [None]:
# Turn the score values into integer class ids; the encoder is fitted on the
# training scores only and reused later to decode predictions.
encoder = LabelEncoder()
encoder.fit(y_train['score'])

training_labels = encoder.transform(training_labels)
validating_labels = encoder.transform(validating_labels)

# Reshape to column vectors of shape (n_samples, 1).
training_labels = training_labels.reshape(-1, 1)
validating_labels = validating_labels.reshape(-1, 1)

## Data Cleaning

### Tokenization, Encoding and Padding

In [None]:
def extract_words(document, alpha=True):
    '''Extract the distinct lower-cased words of a sentence or full text.

    Parameters
    ----------
    document: str
        Text that is tokenized with nltk ``word_tokenize``.
    alpha: bool
        If true, keep only tokens that contain at least one alphabetic
        character (this drops pure numbers and punctuation, but keeps mixed
        tokens such as "3d"). If false, keep every token.

    Returns
    -------
    set
        The lower-cased tokens of the given text, without duplicates.
    '''
    # Tokenize once; the optional filter is applied lazily on top of it.
    tokens = nltk.word_tokenize(document)
    if alpha:
        tokens = (tok for tok in tokens if any(ch.isalpha() for ch in tok))
    return {tok.lower() for tok in tokens}


In [None]:
def process_docs(docs):
    """Return one word set (see ``extract_words``) per document, in input order."""
    return [extract_words(text) for text in docs]

def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    Parameters
    ----------
    lines: iterable of str
        The texts to measure.

    Returns
    -------
    int
        Word count of the longest text, or 0 when ``lines`` is empty.
    """
    # A generator avoids the temporary list; `default` covers empty input,
    # for which a bare max() would raise ValueError.
    return max((len(text.split()) for text in lines), default=0)

In [None]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings, fit it on ``lines`` and return it."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def encode_text(tokenizer, lines, length):
    """Convert ``lines`` to integer sequences and pad them to ``length``.

    Parameters
    ----------
    tokenizer:
        Fitted tokenizer providing ``texts_to_sequences``.
    lines: iterable of str
        Texts to encode.
    length: int
        Passed to ``pad_sequences`` as ``maxlen``.

    Returns
    -------
    The result of ``pad_sequences`` for the encoded texts.
    """
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length)


In [None]:
# Fit the Keras tokenizer on the training texts that carry the full CPC context.
tokenizer = create_tokenizer(training_content_full)

In [None]:
# One extra row on top of the known words, since Keras word indices start at 1
# and index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
# Longest training text, measured in whitespace-separated words.
max_line_length = max_length(training_content_full)
word_count = tokenizer.word_counts
word_index = tokenizer.word_index
# NOTE(review): `oov_tok` is never handed to the Tokenizer (create_tokenizer
# builds it without arguments), so it currently has no effect.
oov_tok = "<OOV>"


In [None]:
# Encode and pad the training and validation texts to the same length.
training_content_enc = encode_text(tokenizer, training_content_full, max_line_length)
print(f'Shape training set (encoded): {training_content_enc.shape}')

validating_content_enc = encode_text(tokenizer, validating_content_full, max_line_length)
print(f'Shape validating set (encoded): {validating_content_enc.shape}')

print(f'Vocabulary size: {vocab_size}')
print(f'Max line lenght: {max_line_length}')

# Helpers for Deep Neural Network Training

#### Params for the Glove based model

In [None]:
# Main params for the model
embedding_dim = 300 # according to the pretrained network
# Counters for vocabulary words found / not found in the Glove index; they are
# updated while the embedding matrix is built.
hits = 0
misses = 0
# NOTE(review): in the visible code `lr` only appears in plot titles; the Glove
# model is compiled with a default Adam() optimizer.
lr = 0.0000008
batch_size = 512
num_epochs = 50

#### Params for Deberta/Roberta based model

In [None]:
# NOTE(review): `lr_roberta` is read by lr_scheduler only; the callback used
# for training is built from lr_scheduler_2, which has its own base rate.
lr_roberta = 0.000006   # 0.000006 <-70
num_epochs_roberta = 5 #8 #5
batch_size_roberta = 16

In [None]:
from keras.callbacks import ReduceLROnPlateau

# Tensorboard logging structure function
# Root folder (relative path) below which every run gets its own sub-folder.
root_logdir = "../../tensorboard-logs"

def get_run_logdir(root_logdir, project):
    '''
    Build a timestamped Tensorboard log directory path for one project.

            Parameters:
                    root_logdir (str) : base directory of all Tensorboard logs
                    project (str): project name, used as sub-directory

            Returns:
                    str: <root_logdir>/<project>/run_<year>_<month>_<day>-<hour>_<min>_<sec>,
                         stamped with the local time of the call
    '''
    from time import strftime
    stamp = strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, project, stamp)


def lr_scheduler(epoch):
    """
    Return the learning rate for ``epoch`` and log it as a summary scalar.

    The rate starts at the global ``lr_roberta`` and is multiplied by
    ``1 / (1 + 0.1 * epoch)``, so it shrinks as the epochs progress. The value
    is also written under the name 'learning rate' via ``tf.summary.scalar``.
    """
    decay_rate = 0.1 #1
    current_rate = lr_roberta * (1 / (1 + decay_rate * epoch))
    tf.summary.scalar('learning rate', data=current_rate, step=epoch)
    return current_rate


def lr_scheduler_2(epoch):
    """Return the learning rate for ``epoch``.

    Epoch 0 runs at 6% of the base rate (2e-6); every later epoch uses the
    base rate multiplied by ``0.9 ** epoch``.
    """
    base_rate = 2e-6 # 0.000006
    factor = 0.06 if epoch == 0 else 0.9**epoch
    return base_rate * factor


# One Tensorboard callback per model; each log directory is timestamped at the
# moment this cell runs.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=get_run_logdir(root_logdir,"nlp_phrase2phrase"), histogram_freq=1)
tensorboard_callback_roberta = tf.keras.callbacks.TensorBoard(log_dir=get_run_logdir(root_logdir,"nlp_phrase2phrase_roberta"), histogram_freq=1)
# The Roberta training takes its learning rate from lr_scheduler_2.
lr_callback_roberta = tf.keras.callbacks.LearningRateScheduler(lr_scheduler_2)

# Save the weights of the best epoch (by validation loss) and stop once the
# validation loss has not improved for 3 epochs, restoring the best weights.
checkpoint_cb_roberta = keras.callbacks.ModelCheckpoint("trained_model_cp.h5", save_best_only=True, save_weights_only=True, monitor='val_loss', save_freq='epoch')
earlystopping_roberta = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [None]:
# Preview the lr_scheduler curve for the first 10 epochs.
# NOTE(review): the training callback is built from lr_scheduler_2, not from
# the function plotted here.
plt.plot([lr_scheduler(e) for e in range(10)])

# Model Development Based on Glove

## Pre-Trained Embeddings Load

In [None]:
# Read the Glove text file into a dict word -> vector. Every line is split
# into the word and the space-separated float coefficients that follow it.
embeddings_index = {}
with open(path_to_glove_file ,encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Preparing a corresponding embedding matrix for the Embedding layer in Keras.

According to the chosen pre-trained embedding file (glove.6B.300d), the embedding dimension needs to be set to 300.

In [None]:
# Row i of the matrix holds the Glove vector of the word with tokenizer
# index i; words without a Glove vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

print(f"Converted {hits} words ({misses} misses)")

## Model Building
### The new Embedding Layer
Now the pre-trained word embedding matrix is loaded into the embedding layer. Because the embeddings are pre-trained, the trainable param needs to be set to "False".

In [None]:
# Glove model: frozen pre-trained embeddings -> two stacked LSTMs ->
# batch norm / ReLU / dropout -> 5-way softmax.
model = tf.keras.models.Sequential(
    [
    # Embedding initialised with the Glove matrix; frozen, index 0 is masked.
    keras.layers.Embedding(    
        vocab_size,
        embedding_dim,
        input_shape = [None],
        input_length=max_line_length,
        mask_zero=True,
        weights=[embedding_matrix],
        trainable = False),
    keras.layers.SpatialDropout1D(0.3),
    # The first LSTM returns the whole sequence so the second one can consume it.
    keras.layers.LSTM(300, return_sequences=True),
    keras.layers.LSTM(300),
    keras.layers.BatchNormalization(),
    keras.layers.Activation('relu'),
    keras.layers.Dropout(0.5),
    # One output per encoded score class.
    keras.layers.Dense(5, activation='softmax' )
    ]
)


In [None]:
# The sparse loss fits the integer class ids produced by the LabelEncoder.
# NOTE(review): Adam() is created with its default learning rate; the `lr`
# hyperparameter defined above is not passed in.
model.compile(loss='sparse_categorical_crossentropy',
                #optimizer=keras.optimizers.Nadam(learning_rate=lr, beta_1=mmt),
                optimizer=keras.optimizers.Adam(),
                metrics=['accuracy']
                )

In [None]:
# Print the layer overview of the Glove model.
model.summary()

In [None]:
# Write the model graph, including tensor shapes, to 'multichannel.png'.
tf.keras.utils.plot_model(model, show_shapes=True, to_file='multichannel.png')

In [None]:
# Train the Glove model on the encoded training texts, validate on the encoded
# validation texts after every epoch and log to Tensorboard.
history = model.fit(
    np.asarray(training_content_enc),
    np.asarray(training_labels),
    batch_size=batch_size,      # small batch size are better but costs a lot of time
    epochs=num_epochs,
    validation_data=(
        np.asarray(validating_content_enc),
        np.asarray(validating_labels)),
    verbose=1,
    callbacks=[tensorboard_callback])

In [None]:
#model.save("LSTM_model_label_encoding_4.h5")

# Second Model Development based on Deberta 🤗

In [None]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [None]:
# Load the tokenizer that belongs to the Roberta checkpoint and register every
# bracketed context code as an additional special token.
# NOTE(review): this enlarges the vocabulary; any model fed with these ids needs
# token embeddings sized to len(tokenizer_roberta).
tokenizer_roberta = AutoTokenizer.from_pretrained(ROBERTA_BASE)
tokenizer_roberta.add_special_tokens({'additional_special_tokens': context_list})

In [None]:
# Load the pre-trained Roberta network with a 5-label classification head and
# hidden-state output enabled.
model_roberta = TFAutoModelForSequenceClassification.from_pretrained(ROBERTA_BASE, trainable=True, return_dict=True, num_labels=5, output_hidden_states=True)
# The tokenizer was extended with the context special tokens, so the embedding
# matrix has to grow accordingly; otherwise the new token ids point past the
# end of the pre-trained embedding table.
model_roberta.resize_token_embeddings(len(tokenizer_roberta))

## Trying around Huggingfaces Model and Tokenizer Structure
The following small trial-and-error experiments for getting familiar with this framework are based on this Huggingface documentation: https://huggingface.co/docs/transformers/glossary#:~:text=token%3A%20a%20part%20of%20a,based%20deep%20learning%20model%20architecture.

And this might be interesting for the Tokenizer topic as well: https://huggingface.co/docs/transformers/preprocessing


### Experiments with Deberta Tokenizer (🤗)
Converting a test sentence with the Roberta tokenizer

In [None]:
# Tokenize a single sample sentence.
test_text_tok = tokenizer_roberta('This is a Test')

Printing the results

In [None]:
# Show everything the tokenizer returned.
test_text_tok

Printing the encoded results of the test sentence

In [None]:
# Only the token ids.
test_text_tok["input_ids"]

Decoding the encoded test sentence back to its original form

In [None]:
# Turn the token ids back into text.
tokenizer_roberta.decode(test_text_tok["input_ids"])

#### Attention Mask

In [None]:
# Two sentences of clearly different length to demonstrate padding.
sentence_a = "This is a test"
sentence_b = "This is a test as well but its longer, much longer, longer than any other test could be"

Encoding both sentences and retrieving the ids only

In [None]:
# Encode each sentence on its own, so the id lists differ in length.
encoded_sen_a = tokenizer_roberta(sentence_a)["input_ids"]
encoded_sen_b = tokenizer_roberta(sentence_b)["input_ids"]

print(f'sentence a encoded: {encoded_sen_a}')
print(f'sentence b encoded: {encoded_sen_b}')

Once again tokenizing the sentences but with padding activated

In [None]:
# Encode both sentences as one batch with padding switched on and print the
# ids together with the attention mask.
padded_sentences = tokenizer_roberta([sentence_a, sentence_b], padding=True)

print(f'Sentences encoded: {padded_sentences["input_ids"]}')
print(f'Sentences att.msk: {padded_sentences["attention_mask"]}')

### Processing the Data for Roberta Model

In [None]:
# Tokenize all training texts padded to the longest one; the width of the
# resulting id matrix is the maximum sequence length. Reading it from
# "input_ids" works for every tokenizer type, whereas indexing the result with
# an integer relies on a fast tokenizer backend.
MAX_LINE_LENGTH_BERT = int(tokenizer_roberta(X_train['corpus_w_full_context'].tolist(), padding=True, truncation=True, return_tensors="tf")["input_ids"].shape[1])
print(f"Maximum sentence length is: {MAX_LINE_LENGTH_BERT}")

In [None]:
def preprocess_function(examples):
    """Tokenize the 'corpus_w_full_context' column of ``examples`` for Roberta.

    All texts are padded or truncated to MAX_LINE_LENGTH_BERT tokens and the
    result of ``tokenizer_roberta`` is returned with TensorFlow tensors.
    """
    texts = examples['corpus_w_full_context'].tolist()
    return tokenizer_roberta(
        texts,
        max_length=MAX_LINE_LENGTH_BERT,
        padding='max_length',
        truncation=True,
        return_tensors="tf",
    )

In [None]:
#X_train.map(preprocess_function, batched=True)
# Tokenize the training and validation split with identical settings.
train_encoded = preprocess_function(X_train)
val_encoded = preprocess_function(X_val)

# Both widths are expected to be equal, since both are padded to the same length.
print(f'Length of the train-sentences [padded]: {train_encoded["input_ids"].shape[1]}')
print(f'Length of the val-sentences [padded]: {val_encoded["input_ids"].shape[1]}')

### Model Build


In [None]:
# Check the label count the loaded model was configured with.
print(f'Number of labels, that came from deberta model: {model_roberta.num_labels}')

In [None]:
# Two integer inputs of fixed length: the token ids and the attention mask.
_input_ids_ = tf.keras.Input(shape = (MAX_LINE_LENGTH_BERT, ), dtype = tf.int32)
_attention_mask_ = tf.keras.Input(shape = (MAX_LINE_LENGTH_BERT, ), dtype = tf.int32)

# Run the pre-trained network and ask for all hidden states.
x = model_roberta(
                input_ids = _input_ids_,
                attention_mask = _attention_mask_,
                output_hidden_states=True
                )
#print(x)
#print('-----------------------------------')
#print(x.hidden_states)
#print('-----------------------------------')
#print(x[0].hidden_states[-1])
#print('-----------------------------------')
#print(x.last_hidden_state)
# Mean-pool the last hidden layer over the real tokens only. Every text is
# padded to MAX_LINE_LENGTH_BERT, so without the mask the padding positions
# would be averaged in as well and dilute short inputs.
x = tf.keras.layers.GlobalAveragePooling1D()(
    x.hidden_states[-1],
    mask=tf.cast(_attention_mask_, tf.bool),
)
x = tf.keras.layers.Dropout(0.3)(x)
#x = tf.keras.layers.Dense(32, activation='relu')(x)
# Own 5-way softmax head on top of the pooled representation.
output = tf.keras.layers.Dense(5, activation='softmax')(x)

model2 = tf.keras.Model(inputs = [_input_ids_, _attention_mask_], 
                        outputs = output
                        )

In [None]:
# Nadam is requested by name, i.e. with default settings; the learning rate is
# supplied per epoch by the LearningRateScheduler callback passed to fit().
model2.compile(loss='sparse_categorical_crossentropy', optimizer='Nadam', metrics=['accuracy'])

In [None]:
# Print the layer overview of the Roberta based model.
model2.summary()

### Model Fit

In [None]:
# Fine-tune on (input ids, attention mask) pairs. Training labels are
# flattened to 1-D, validation labels are passed as a column vector.
history_model2 = model2.fit(x=(np.asarray(train_encoded['input_ids']),
                                np.asarray(train_encoded['attention_mask'])
                                ),
                                y=np.asarray(training_labels).ravel(),
                                validation_data=((np.asarray(val_encoded['input_ids']),
                                                  np.asarray(val_encoded['attention_mask'])),
                                                np.asarray(validating_labels)
                                                ),
                                epochs=num_epochs_roberta,
                                batch_size=batch_size_roberta,
                                callbacks =[tensorboard_callback_roberta,
                                            lr_callback_roberta,
                                            checkpoint_cb_roberta,
                                            earlystopping_roberta]) #lr_callback rlrop

In [None]:
# Persist the trained model.
# NOTE(review): the file name says "10_epochs" while num_epochs_roberta is 5.
model2.save("roberta_trained_10_epochs_specialtokens.h5")

In [None]:
#from keras.models import load_model
# Or load the saved model from the callback : deberta_trained_model.h5
# model2.load_weights('deberta_trained_model.h5') #deberta_trained_10_epochs_decay_lr_1 # deberta_trained_3_epochs_decay_lr

# Evaluation

## Test with all Validation Data [Glove]

In [None]:
# Loss and accuracy of the Glove model on the complete validation split.
evaluation_glove = model.evaluate(np.asarray(validating_content_enc),
                             np.asarray(validating_labels), verbose=0)

print(f'Models validation loss: {evaluation_glove[0]} - Models validation accuracy: {evaluation_glove[1]}')

#### Accuracy Curve

In [None]:
# Training vs. validation accuracy per epoch for the Glove model.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
# Without a legend the labels set above are never displayed.
plt.legend()
# NOTE(review): `lr` is not passed to the optimizer of this model (it is
# compiled with a default Adam()), so the title may not show the rate used.
plt.title('lrate='+str(lr), pad=-50)

#### Loss Curve

In [None]:
# Training loss followed by validation loss per epoch for the Glove model.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

## Test with all Validation Data [Roberta]

In [None]:
# Loss and accuracy of the Roberta based model on the complete validation split.
evaluation_roberta = model2.evaluate((np.asarray(val_encoded['input_ids']),
                              np.asarray(val_encoded['attention_mask']),
                             ),
                             validating_labels, verbose=0)

print(f'Models validation loss: {evaluation_roberta[0]} - Models validation accuracy: {evaluation_roberta[1]}')

#### Accuracy Curve

In [None]:
# Training vs. validation accuracy per epoch for the Roberta based model.
plt.plot(history_model2.history['accuracy'], label='train')
plt.plot(history_model2.history['val_accuracy'], label='test')
# Without a legend the labels set above are never displayed.
plt.legend()
# NOTE(review): `lr` is the Glove hyperparameter; this model's rate comes from
# lr_scheduler_2 through lr_callback_roberta, so the title may be misleading.
plt.title('lrate='+str(lr), pad=-50)

#### Loss Curve

In [None]:
# Training loss followed by validation loss per epoch for the Roberta based model.
plt.plot(history_model2.history['loss'])
plt.plot(history_model2.history['val_loss'])

# Submission File

## Training on all Data

## Prediction of Test File Values

In [None]:
#competition_file = pd.DataFrame(columns=['score'])
# Reload the sample submission as the template for the result file, using the
# shared loader and file-name constant instead of a hand-concatenated path.
competition_file = load_csv_data(PHRASES_PATH, CSV_FILE_COMF)

In [None]:
# Tokenize the test texts exactly like the training data.
test_encoded = preprocess_function(test)

In [None]:
# Softmax output per test row: one probability for each of the 5 classes.
test_prediction = model2.predict((np.asarray(test_encoded['input_ids']),
                                  np.asarray(test_encoded['attention_mask']) 
                                  ))

In [None]:
# Take the most probable class per row and map the class id back to its score.
# NOTE(review): assumes the rows of the sample submission are in the same order
# as the rows of `test` - confirm.
competition_file['score'] = encoder.inverse_transform(np.argmax(test_prediction, axis=1))

In [None]:
# Distribution of the predicted scores.
competition_file['score'].hist()

In [None]:
# Write the submission file without the DataFrame index column.
competition_file.to_csv('submission.csv', index=False)