## Lexical complexity prediction using deep learning

In [None]:
import random
import numpy as np
import torch

In [None]:
# One seed for every random number generator used by the PyTorch sections.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

# cuDNN: disable benchmark mode and request deterministic behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
import pandas as pd

In [None]:
# Read lcp_single_train.tsv as a tab-separated table.
# NOTE(review): with pandas' default NA handling, literal cells such as "null"
# or "NA" are parsed as NaN, so a token spelled that way would be removed by
# the dropna step that follows — confirm whether keep_default_na=False is needed.
LP = pd.read_csv("lcp_single_train.tsv", sep='\t')

## Missing Values

In [None]:
# Discard rows where the sentence, the target token or the complexity label is missing.
LP = LP.dropna(subset=['sentence', 'token', 'complexity'])

In [None]:
# Cast both text columns to str, writing the result back through .loc.
for _text_col in ('sentence', 'token'):
    LP.loc[:, _text_col] = LP[_text_col].astype(str)

## EDA

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Histogram (with KDE overlay) of the complexity scores, in slot 1 of a 1x3 grid.
# NOTE(review): the remaining two panels are drawn in later cells; with an inline
# notebook backend each cell may render its own figure — confirm the panels
# really end up on this figure.
plt.figure(figsize=(14, 4))

hist_ax = plt.subplot(1, 3, 1)
sns.histplot(LP['complexity'], bins=30, kde=True, color='skyblue', ax=hist_ax)
hist_ax.set_title("Distribution of complexity scores")
hist_ax.set_xlabel("Complexity")
hist_ax.set_ylabel("Frequency")

In [None]:
# Box plot of the complexity scores, drawn in slot 2 of the 1x3 grid.
box_ax = plt.subplot(1, 3, 2)
sns.boxplot(x=LP['complexity'], color='orange', ax=box_ax)
box_ax.set_title("Box Plot of Complexity Scores")
box_ax.set_xlabel("Complexity")

In [None]:
# Whitespace-delimited word count per sentence, stored as a new column.
LP['sentence_length'] = LP['sentence'].apply(lambda sentence: len(sentence.split()))

# Histogram (with KDE overlay) of the word counts, drawn in slot 3 of the 1x3 grid.
length_ax = plt.subplot(1, 3, 3)
sns.histplot(LP['sentence_length'], bins=20, kde=True, color='green', ax=length_ax)
length_ax.set_title("Sentence Length Distribution")
length_ax.set_xlabel("Number of Words")
length_ax.set_ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
#Adding [TGT] tags
def mark_target(row):
    """Return the row's sentence with the target token wrapped in ``[TGT]`` tags.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with string entries
            ``'sentence'`` and ``'token'``.

    Returns:
        The sentence with one occurrence of the token replaced by
        ``"[TGT] <token> [TGT]"``. A whole-word occurrence is preferred, so a
        token such as "river" is not tagged inside "driver". If the token only
        appears as part of a longer word, the first such occurrence is tagged.
        If it does not appear at all, the tagged token is prepended to the
        sentence.
    """
    import re  # local import: the notebook does not import `re` at the top

    sentence = row['sentence']
    token = row['token']
    tagged = f"[TGT] {token} [TGT]"

    # Look for the token as a standalone word first. An empty token skips this
    # step so it keeps the plain substring behaviour below.
    if token:
        whole_word = re.search(r'(?<!\w)' + re.escape(token) + r'(?!\w)', sentence)
        if whole_word:
            # Slice instead of re.sub so backslashes in the token stay literal.
            return sentence[:whole_word.start()] + tagged + sentence[whole_word.end():]

    # Fallback: first plain substring occurrence.
    if token in sentence:
        return sentence.replace(token, tagged, 1)

    # Token not present at all: put the marker in front of the sentence.
    return f"{tagged} " + sentence

In [None]:
# Build the model input column by applying mark_target to every row.
LP.loc[:, 'input'] = LP.apply(mark_target, axis=1)

In [None]:
# Preview the first rows of the tagged input, token and label columns.
LP[['input', 'token', 'complexity']].head()

In [None]:
# Replace every tab, newline, carriage return and backslash with a space.
LP['input'] = LP['input'].str.replace(r'[\t\n\r\\]', ' ', regex=True)

In [None]:
# Preview the first rows again after the character cleanup.
LP[['input', 'token', 'complexity']].head()

In [None]:
# Preview the last rows after the character cleanup.
LP[['input', 'token', 'complexity']].tail()

## Model 1- RoBERTa Model


In [None]:
!pip install transformers datasets -q

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

## Train-Validation Split 80/20

In [None]:
# Hold out 20% of the rows for validation; SEED fixes the shuffle.
model_inputs = LP['input'].tolist()
complexity_targets = LP['complexity'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    model_inputs,
    complexity_targets,
    random_state=SEED,
    test_size=0.2,
)

In [None]:
# Wrap each split in a HuggingFace Dataset with 'text' and 'label' columns.
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
validate_dataset = Dataset.from_dict(dict(text=val_texts, label=val_labels))

## Roberta Tokenizer

In [None]:
# Load the pretrained roberta-base tokenizer; `tokenize` below reads this global.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [None]:
def tokenize(batch):
    """Tokenize ``batch['text']`` with the module-level ``tokenizer``.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``. Returns whatever the tokenizer call returns.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(batch['text'], **encode_options)

In [None]:
# Tokenize both splits in batched mode and rebind the names to the results.
_map_options = dict(batched=True)
train_dataset = train_dataset.map(tokenize, **_map_options)
validate_dataset = validate_dataset.map(tokenize, **_map_options)

In [None]:
# Expose only the model inputs and the label, as torch tensors, on both splits.
_model_columns = ['input_ids', 'attention_mask', 'label']
for _split in (train_dataset, validate_dataset):
    _split.set_format(type='torch', columns=_model_columns)

## Loading model

In [None]:
# Load pretrained roberta-base with a sequence-classification head configured
# for a single output (num_labels=1).
# NOTE(review): presumably the single output is used as a regressor for the
# complexity score — confirm which loss transformers selects for num_labels=1.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=1)

In [None]:
# Fine-tuning configuration for RoBERTa: 4 epochs, batch size 16, lr 2e-5,
# evaluation every epoch, no checkpoints and no external reporting.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
_roberta_config = dict(
    output_dir='./results',
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_steps=10,
    report_to="none",
    seed=SEED,
)
training_args = TrainingArguments(**_roberta_config)

# Trainer wired to the RoBERTa-tokenized train and validation splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=validate_dataset,
    train_dataset=train_dataset,
)

## Training

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

## Results

In [None]:
# Predict on the validation split with the fine-tuned RoBERTa model.
model.eval()
predictions = trainer.predict(validate_dataset)
# Flatten to 1-D and clip to [0, 1], exactly as the DistilBERT, BiLSTM and
# ALBERT sections do, so all four models are scored after the same
# post-processing.
preds = np.clip(predictions.predictions.flatten(), 0, 1)
true = predictions.label_ids

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import scipy.stats as stats

In [None]:
# Validation metrics for RoBERTa; `mse` is kept as a variable.
mse = mean_squared_error(true, preds)
print("Mean square error:", mse)
# pearsonr returns (statistic, p-value); only the coefficient is printed.
print("Pearson Correlation:", stats.pearsonr(preds, true)[0])
print("Mean absolute error:", mean_absolute_error(true, preds))
print("R² Score:", r2_score(true, preds))

## Model 2 - DistilBERT model

In [None]:
import random
import numpy as np
import torch

In [None]:
# Re-seed every generator before the DistilBERT section.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Torch: default generator first, then the CUDA one.
for _torch_seed in (torch.manual_seed, torch.cuda.manual_seed):
    _torch_seed(SEED)
# cuDNN: deterministic behaviour on, benchmark mode off.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

## Tokenization

In [None]:
# Load the distilbert-base-uncased tokenizer.
# This rebinds the global `tokenizer`, so the earlier `tokenize` helper (which
# reads that global) would also use the DistilBERT tokenizer from here on.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
def tokenize_data(batch):
    """Tokenize ``batch['text']`` with whatever the global ``tokenizer`` is.

    Uses truncation, ``padding='max_length'`` and ``max_length=128``, and
    returns the tokenizer's output unchanged.
    """
    texts = batch['text']
    return tokenizer(texts, max_length=128, padding='max_length', truncation=True)

In [None]:
# Run the DistilBERT tokenization over both splits in batched mode.
# The validation result is bound to a new name, `val_dataset`;
# the name `validate_dataset` is not rebound here.
train_dataset = train_dataset.map(tokenize_data, batched=True)
val_dataset = validate_dataset.map(tokenize_data, batched=True)

In [None]:
# Expose only the model inputs and the label as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Format `val_dataset` — the split produced by the DistilBERT tokenization and
# handed to the Trainer below — rather than `validate_dataset`, which is the
# RoBERTa-section split and is not used by this model.
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

## Load Model

In [None]:
# Load pretrained distilbert-base-uncased with a single-output head (num_labels=1).
# This rebinds the global `model`, replacing the RoBERTa model.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=1)

In [None]:
from transformers import TrainingArguments, Trainer

## DistilBERT training setup using trainer API

In [None]:
# Fine-tuning configuration for DistilBERT: 3 epochs, batch size 16, lr 2e-5,
# evaluation every epoch, no checkpoints and no external reporting.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
_distilbert_config = dict(
    output_dir='./distilbert_results',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_steps=10,
    report_to="none",
    seed=SEED,
)
training_args = TrainingArguments(**_distilbert_config)

# Trainer wired to the DistilBERT-tokenized train and validation splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

## Model Training

In [None]:
# Run fine-tuning for the DistilBERT model.
trainer.train()

## Results

Predictions

In [None]:
# Put the model in evaluation mode before predicting.
model.eval()

In [None]:
# Run inference on the DistilBERT validation split.
predictions = trainer.predict(val_dataset)
# Flatten the prediction array to 1-D; labels come back alongside as label_ids.
preds = predictions.predictions.flatten()
true = predictions.label_ids

In [None]:
import numpy as np

In [None]:
# Bound the predictions to the closed interval [0, 1] before scoring.
preds = np.clip(preds, 0, 1)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import scipy.stats as stats

metrics

In [None]:
# Validation metrics for DistilBERT, printed one per line as "<label> <value>".
_distilbert_scores = (
    ("MSE:", mean_squared_error(true, preds)),
    ("MAE:", mean_absolute_error(true, preds)),
    ("R² Score:", r2_score(true, preds)),
    ("Pearson Correlation:", stats.pearsonr(preds, true)[0]),
)
for _label, _score in _distilbert_scores:
    print(_label, _score)

## Model 3 - BiLSTM with Attention

In [None]:
!pip install keras tensorflow -q

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.layers import Layer
from sklearn.model_selection import train_test_split

In [None]:
import os

SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here may only affect child processes — confirm it has the
# intended effect.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Seed Python, NumPy and TensorFlow, in that order.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

In [None]:
# Tagged input text as a Python list, complexity targets as an array.
texts = LP['input'].tolist()
labels = LP['complexity'].values

## Train-Validation Split

In [None]:
# 80/20 train/validation split, seeded with the shared SEED constant (42, the
# same value as before) so it matches the split used in the transformer sections.
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=SEED)

Tokenization

In [None]:
# Keras word-level tokenizer fitted on the training texts only; words not seen
# during fitting are mapped to the "<OOV>" token. Rebinds the global `tokenizer`.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
# +1 presumably reserves index 0 for padding (word indices start at 1) — confirm.
vocab_size = len(tokenizer.word_index) + 1

Converting text to sequences

In [None]:
# Convert each text into its sequence of integer word indices.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)

Pad Sequence

In [None]:
max_len = 50
# Both splits are padded at the end ('post') to the same length.
# NOTE(review): the truncation side is left at the pad_sequences default, so
# inputs longer than max_len are cut — confirm the [TGT]-tagged word survives.
_pad_options = dict(maxlen=max_len, padding='post')
X_train_pad = pad_sequences(X_train_seq, **_pad_options)
X_val_pad = pad_sequences(X_val_seq, **_pad_options)

Attention Layer for model 3

In [None]:
class Attention(Layer):
    """Attention pooling that collapses axis 1 of the input.

    A score is computed for every position as ``tanh(x · W + b)``, the scores
    are normalised with a softmax along axis 1, and the input is summed along
    that axis weighted by them.

    NOTE(review): assumes a 3-D input laid out as (batch, steps, features) with
    a fixed number of steps, because the bias is sized from ``input_shape[1]``
    — confirm against the caller.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection weight and the per-position bias."""
        steps, features = input_shape[1], input_shape[-1]
        self.W = self.add_weight(
            name='att_weight',
            shape=(features, 1),
            initializer='normal',
            trainable=True,
        )
        self.b = self.add_weight(
            name='att_bias',
            shape=(steps, 1),
            initializer='zeros',
            trainable=True,
        )
        super().build(input_shape)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        backend = tf.keras.backend
        scores = backend.tanh(backend.dot(x, self.W) + self.b)
        attention = backend.softmax(scores, axis=1)
        return backend.sum(x * attention, axis=1)

Pretrained GloVe word embeddings

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# Parse glove.6B.100d.txt into a word -> float32 vector lookup. On every line
# the first whitespace-separated field is the word, the rest are coefficients.
embedding_index = {}
with open("glove.6B.100d.txt", encoding="utf8") as glove_file:
    for row in glove_file:
        fields = row.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

# Row i of the matrix holds the vector for the word with tokenizer index i;
# words without a GloVe vector keep an all-zero row.
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for vocab_word, row_idx in tokenizer.word_index.items():
    if vocab_word in embedding_index:
        embedding_matrix[row_idx] = embedding_index[vocab_word]

GloVe-Based BiLSTM with Attention

In [None]:
# Functional-API model: integer sequences of length max_len in, one value out.
input_layer = Input(shape=(max_len,))
# Embedding initialised from the GloVe matrix and frozen (trainable=False).
x = Embedding(input_dim=vocab_size,
              output_dim=embedding_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False)(input_layer)
# Bidirectional LSTM that keeps the full output sequence for the attention layer.
x = Bidirectional(LSTM(128, return_sequences=True))(x)
# Custom attention pooling collapses the sequence axis.
x = Attention()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
# Single linear output unit.
output_layer = Dense(1, activation='linear')(x)

model_glove = Model(inputs=input_layer, outputs=output_layer)
# Optimise mean squared error with Adam; mean absolute error is tracked as a metric.
model_glove.compile(optimizer='adam', loss='mse', metrics=['mae'])
model_glove.summary()

## Training of Model

In [None]:
# Train for 10 epochs with batch size 32, passing the padded validation arrays
# as validation_data; the returned History object is kept in `history_glove`.
history_glove = model_glove.fit(X_train_pad, y_train,
                                epochs=10,
                                batch_size=32,
                                validation_data=(X_val_pad, y_val),
                                verbose=1)

## Results

In [None]:
# Predict on the validation set, flatten to 1-D and bound to [0, 1].
_raw_glove_pred = model_glove.predict(X_val_pad).flatten()
y_pred_glove = np.clip(_raw_glove_pred, 0, 1)

# Validation metrics for the BiLSTM model, printed as "<label> <value>".
_glove_scores = (
    ("MSE:", mean_squared_error(y_val, y_pred_glove)),
    ("MAE:", mean_absolute_error(y_val, y_pred_glove)),
    ("R² Score:", r2_score(y_val, y_pred_glove)),
    ("Pearson Correlation:", stats.pearsonr(y_val, y_pred_glove)[0]),
)
for _label, _score in _glove_scores:
    print(_label, _score)

## Model 4 - ALBERT-base-v2

In [None]:
import random
import numpy as np
import torch

In [None]:
# Re-seed every generator before the ALBERT section.
SEED = 42
_seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
for _seed_fn in _seeders:
    _seed_fn(SEED)

# cuDNN: deterministic behaviour on, benchmark mode off.
torch.backends.cudnn.deterministic, torch.backends.cudnn.benchmark = True, False

In [None]:
from transformers import AlbertTokenizer, AlbertForSequenceClassification

In [None]:
# Load the albert-base-v2 tokenizer; rebinds the global `tokenizer` once more.
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

In [None]:
def tokenize_albert(batch):
    """Tokenize ``batch['text']`` using the current global ``tokenizer``.

    Truncation is on and padding is ``'max_length'`` with ``max_length=128``.
    The tokenizer's return value is passed through as-is.
    """
    settings = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    return tokenizer(batch['text'], **settings)

In [None]:
# Run the ALBERT tokenization over both splits in batched mode.
train_dataset = train_dataset.map(tokenize_albert, batched=True)
val_dataset = val_dataset.map(tokenize_albert, batched=True)

# Expose only the model inputs and the label, as torch tensors, on both splits.
for _split in (train_dataset, val_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [None]:
# Load pretrained albert-base-v2 with a single-output head (num_labels=1).
# This rebinds the global `model`, replacing the DistilBERT model.
model = AlbertForSequenceClassification.from_pretrained("albert-base-v2", num_labels=1)

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Fine-tuning configuration for ALBERT: 4 epochs, batch size 16, lr 2e-5,
# evaluation every epoch, no checkpoints and no external reporting.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
_albert_config = dict(
    output_dir='./albert_results',
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_steps=10,
    report_to="none",
    seed=SEED,
)
training_args = TrainingArguments(**_albert_config)

# Trainer wired to the ALBERT-tokenized train and validation splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Run fine-tuning for the ALBERT model.
trainer.train()

## Results

In [None]:
# Run inference on the ALBERT validation split.
predictions = trainer.predict(val_dataset)
# Flatten the prediction array to 1-D; labels come back alongside as label_ids.
preds = predictions.predictions.flatten()
true = predictions.label_ids

In [None]:
import numpy as np
# Bound the predictions to the closed interval [0, 1] before scoring.
preds = np.clip(preds, 0, 1)


In [None]:
# Validation metrics for ALBERT, printed one per line as "<label> <value>".
_albert_scores = (
    ("MSE:", mean_squared_error(true, preds)),
    ("MAE:", mean_absolute_error(true, preds)),
    ("R² Score:", r2_score(true, preds)),
    ("pearson correlation:", stats.pearsonr(preds, true)[0]),
)
for _label, _score in _albert_scores:
    print(_label, _score)

## Visualisation

In [None]:
#Model-1

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Metric table for the four models, one list entry per model in "Model" order.
# NOTE(review): these numbers are hard-coded literals rather than values read
# from the metric cells — presumably copied from an earlier run; they will not
# update if the models are retrained.
data = {
    "Model": ["Model 1", "Model 2", "Model 3", "Model 4"],
    "MSE": [0.006925962865352631, 0.007170915603637695, 0.010263519208464881, 0.008414403535425663],
    "MAE": [0.06532153487205505, 0.06614823639392853, 0.07678683813144223, 0.07092108577489853],
    "R2 Score": [0.6367985010147095, 0.6239530444145203, 0.46177511101571644, 0.5587437748908997],
    "Pearson Correlation": [0.8014024778476525, 0.7935019120340172, 0.7026229967467534, 0.7508233155005781]
}

In [None]:
# Turn the metric table into a DataFrame for plotting.
# NOTE(review): this rebinds `LP`, which held the loaded dataset up to here; any
# dataset cell re-run after this point would see the metrics table instead.
LP = pd.DataFrame(data)

In [None]:
# One bar chart per metric, with the four models along the x-axis.
metrics = ["MSE", "MAE", "R2 Score", "Pearson Correlation"]
figs = []

for metric_name in metrics:
    _, bar_ax = plt.subplots(figsize=(8, 5))
    bar_ax.bar(LP["Model"], LP[metric_name])
    bar_ax.set_title(f"{metric_name} Comparison")
    bar_ax.set_ylabel(metric_name)
    bar_ax.set_xlabel("Models")
    bar_ax.grid(axis='y')
    plt.tight_layout()
    plt.show()

#predicted vs actual plot model-01

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Aliases used by the scatter and error plots that follow.
# NOTE(review): `true` and `preds` were last assigned in the ALBERT (Model 4)
# results cells, so when the notebook runs top to bottom these plots show
# Model 4 rather than Model 1 as the surrounding labels say — confirm.
true_values = true
predicted_values = preds

In [None]:
# Scatter of predicted against actual complexity, with the y = x diagonal
# drawn from (0, 0) to (1, 1) as the reference for a perfect prediction.
_scatter_fig, scatter_ax = plt.subplots(figsize=(7, 5))
scatter_ax.scatter(true_values, predicted_values, color='blue', alpha=0.7, label="Predicted vs Actual")
scatter_ax.plot([0, 1], [0, 1], color='red', linestyle='--', label="Perfect Prediction (y = x)")
scatter_ax.set_xlabel("Actual Complexity")
scatter_ax.set_ylabel("Predicted Complexity")
scatter_ax.set_title("Predicted vs Actual Complexity")
scatter_ax.legend()
scatter_ax.grid(True)
plt.tight_layout()
plt.show()

error distribution plot

In [None]:
# Signed error per example (positive means the model over-predicted).
errors = predicted_values - true_values

# Histogram of the errors with a KDE overlay.
_error_fig, error_ax = plt.subplots(figsize=(7, 5))
sns.histplot(errors, bins=10, kde=True, color='purple', ax=error_ax)
error_ax.set_title("Prediction Error Distribution (Predicted - Actual)")
error_ax.set_xlabel("Prediction Error")
error_ax.set_ylabel("Frequency")
error_ax.grid(True)
plt.tight_layout()
plt.show()