In [1]:
import torch
from transformers import BertModel, BertTokenizerFast
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
import pandas as pd
import requests
import textstat
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# Step 1: Loading dataset


In [2]:
# Download the UD English-EWT training split (CoNLL-U format) as one string.
# NOTE(review): the URL tracks the `master` branch, so the corpus contents can
# change between runs — consider pinning a release tag or commit hash.
url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
# Fail loudly on 4xx/5xx instead of silently parsing an error page as corpus text.
response.raise_for_status()
conllu_text = response.text

In [3]:
# Collect the raw text of every sentence in the downloaded CoNLL-U string.
# Each "# text = ..." comment line contributes exactly one entry, in file order.
# Reading those lines directly (rather than buffering until the next blank
# line) keeps the final sentence even when the text does not end with a blank
# line. Slicing off the prefix, instead of str.replace, leaves any later
# occurrence of the marker inside the sentence itself untouched.
TEXT_PREFIX = "# text = "

sentences = [
    line[len(TEXT_PREFIX):].strip()
    for line in conllu_text.splitlines()
    if line.startswith(TEXT_PREFIX)
]

# Later cells index sentences[0]; stop here with a clear message if the
# download did not contain any sentence lines.
if not sentences:
    raise ValueError(
        "No '# text = ' lines found in conllu_text; check that the download succeeded."
    )


In [4]:
print(f"Total sentences extracted: {len(sentences)}")
print("Calculating FKGL and FRE scores...")

# Parallel lists: position i holds the scores for sentences[i].
fkgl_scores, fre_scores = [], []

for text in tqdm(sentences, desc="Processing sentences"):
    # Empty or whitespace-only sentences are not passed to textstat; they get
    # 0.0 for both metrics so the lists stay aligned with `sentences`.
    has_content = bool(text and text.strip())
    fkgl_scores.append(textstat.flesch_kincaid_grade(text) if has_content else 0.0)
    fre_scores.append(textstat.flesch_reading_ease(text) if has_content else 0.0)

# Show the first sentence with its two scores as a quick sanity check.
print(f"\n Example sentence: {sentences[0]}")
print(f" FKGL: {fkgl_scores[0]}")
print(f" FRE: {fre_scores[0]}")


Total sentences extracted: 12544
Calculating FKGL and FRE scores...


Processing sentences: 100%|██████████| 12544/12544 [00:01<00:00, 6706.55it/s] 


 Example sentence: Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.
 FKGL: 10.580952380952379
 FRE: 56.6057142857143





In [5]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# output_hidden_states=True asks the model to return the hidden states of
# every layer, which the embedding-extraction function reads later.
model = BertModel.from_pretrained(model_name, output_hidden_states=True).to(device)

# Switch to evaluation mode for inference. Left as the final expression so the
# notebook displays the module structure as the cell's output.
model.eval()

Using device: cpu


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def get_sentence_embedding(sentence, max_length=128):
    """Return one mean-pooled sentence embedding per hidden-state layer.

    The sentence is tokenized with the module-level ``tokenizer``, run through
    the module-level ``model`` on ``device`` without gradient tracking, and each
    entry of ``outputs.hidden_states`` is averaged over the token positions
    whose attention-mask value is non-zero.

    Args:
        sentence: Text of a single sentence.
        max_length: Maximum number of tokens kept; longer inputs are truncated.

    Returns:
        A list of 1-D NumPy arrays, one per entry of ``outputs.hidden_states``,
        in the order the model returns them.
    """
    # No padding is requested: the sentence is encoded on its own, so padding
    # up to max_length would only add masked positions that cost compute in the
    # forward pass and are excluded from the mean anyway.
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Boolean selector over the token positions of the only sequence in the batch.
    keep = attention_mask[0].bool()

    # layer[0] is (tokens, hidden); indexing rows with `keep` drops masked
    # positions before averaging over the token axis.
    return [
        layer[0][keep].mean(dim=0).cpu().numpy()
        for layer in outputs.hidden_states
    ]


In [7]:

# One bucket per hidden-state entry returned by the model
# (num_hidden_layers + 1 of them).
layer_count = model.config.num_hidden_layers + 1
all_sentence_embeddings_by_layer = [list() for _ in range(layer_count)]

for sent in tqdm(sentences, desc="Embedding sentences"):
    # Route each layer's pooled vector for this sentence into its bucket.
    for layer_idx, emb in enumerate(get_sentence_embedding(sent)):
        all_sentence_embeddings_by_layer[layer_idx].append(emb)

# Stack each bucket into a 2-D array (one row per sentence), replacing the
# bucket in place so only one layer is held in both forms at a time.
for layer_idx in range(layer_count):
    stacked = np.vstack(all_sentence_embeddings_by_layer[layer_idx])
    all_sentence_embeddings_by_layer[layer_idx] = stacked

print("Embeddings shape per layer example:")
print(f"Layer 0 embedding shape: {all_sentence_embeddings_by_layer[0].shape}")

Embedding sentences: 100%|██████████| 12544/12544 [1:32:46<00:00,  2.25it/s]


Embeddings shape per layer example:
Layer 0 embedding shape: (12544, 768)


In [None]:
def _fit_and_score(X_train, y_train, X_test, y_test):
    """Fit a LinearRegression on the training fold; return (MAE, R2) on the test fold."""
    probe = LinearRegression()
    probe.fit(X_train, y_train)
    predictions = probe.predict(X_test)
    return mean_absolute_error(y_test, predictions), r2_score(y_test, predictions)


# 5-fold cross-validated linear probes: for every layer's sentence embeddings,
# regress the FKGL and FRE scores and record per-fold MAE and R2.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Convert the target lists once instead of rebuilding them in every fold.
fkgl_targets = np.asarray(fkgl_scores)
fre_targets = np.asarray(fre_scores)

results = []

for layer_idx, embeddings in enumerate(all_sentence_embeddings_by_layer):
    # Guard against stale kernel state: rows and targets must line up 1:1,
    # otherwise the fold indices would silently pair the wrong scores.
    if embeddings.shape[0] != len(fkgl_targets) or embeddings.shape[0] != len(fre_targets):
        raise ValueError(
            f"Layer {layer_idx}: {embeddings.shape[0]} embeddings but "
            f"{len(fkgl_targets)} FKGL / {len(fre_targets)} FRE scores"
        )

    print(f"\n Layer {layer_idx} ")

    fkgl_maes = []
    fkgl_r2s = []
    fre_maes = []
    fre_r2s = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(embeddings)):
        X_train, X_test = embeddings[train_idx], embeddings[test_idx]

        # FKGL regression
        mae_fkgl, r2_fkgl = _fit_and_score(
            X_train, fkgl_targets[train_idx], X_test, fkgl_targets[test_idx]
        )
        fkgl_maes.append(mae_fkgl)
        fkgl_r2s.append(r2_fkgl)

        # FRE regression
        mae_fre, r2_fre = _fit_and_score(
            X_train, fre_targets[train_idx], X_test, fre_targets[test_idx]
        )
        fre_maes.append(mae_fre)
        fre_r2s.append(r2_fre)

        print(f"Fold {fold+1}: FKGL MAE={mae_fkgl:.3f}, R2={r2_fkgl:.3f} | FRE MAE={mae_fre:.3f}, R2={r2_fre:.3f}")

        # saving fold results
        results.append({
            "layer": layer_idx,
            "fold": fold + 1,
            "fkgl_mae": mae_fkgl,
            "fkgl_r2": r2_fkgl,
            "fre_mae": mae_fre,
            "fre_r2": r2_fre,
        })

    # Per-layer averages over the folds, so layers can be compared at a glance.
    print(
        f"Mean: FKGL MAE={np.mean(fkgl_maes):.3f}, R2={np.mean(fkgl_r2s):.3f} | "
        f"FRE MAE={np.mean(fre_maes):.3f}, R2={np.mean(fre_r2s):.3f}"
    )


 Layer 0 
Fold 1: FKGL MAE=3.525, R2=0.584 | FRE MAE=24.852, R2=0.551
Fold 2: FKGL MAE=3.687, R2=0.546 | FRE MAE=25.813, R2=0.521
Fold 3: FKGL MAE=3.449, R2=0.558 | FRE MAE=23.927, R2=0.490
Fold 4: FKGL MAE=3.594, R2=0.489 | FRE MAE=25.151, R2=0.457
Fold 5: FKGL MAE=3.462, R2=0.497 | FRE MAE=24.233, R2=0.449

 Layer 1 
Fold 1: FKGL MAE=3.325, R2=0.642 | FRE MAE=23.239, R2=0.611
Fold 2: FKGL MAE=3.490, R2=0.606 | FRE MAE=24.316, R2=0.581
Fold 3: FKGL MAE=3.372, R2=0.588 | FRE MAE=23.441, R2=0.522
Fold 4: FKGL MAE=3.453, R2=0.550 | FRE MAE=24.143, R2=0.518
Fold 5: FKGL MAE=3.381, R2=0.539 | FRE MAE=23.797, R2=0.492

 Layer 2 
Fold 1: FKGL MAE=3.304, R2=0.660 | FRE MAE=23.136, R2=0.630
Fold 2: FKGL MAE=3.498, R2=0.623 | FRE MAE=24.330, R2=0.603
Fold 3: FKGL MAE=3.360, R2=0.589 | FRE MAE=23.401, R2=0.525
Fold 4: FKGL MAE=3.438, R2=0.578 | FRE MAE=24.038, R2=0.551
Fold 5: FKGL MAE=3.342, R2=0.565 | FRE MAE=23.290, R2=0.522

 Layer 3 
Fold 1: FKGL MAE=3.236, R2=0.686 | FRE MAE=22.907, R2=0.

In [9]:
# Tabulate the per-fold metric records (one row per layer/fold pair) and write
# them to a CSV file in the working directory, without the index column.
results_df = pd.DataFrame(data=results)
results_df.to_csv(
    path_or_buf="bert_readability_regression_results.csv",
    index=False,
)
print("\n Regression results saved to 'bert_readability_regression_results.csv'")



 Regression results saved to 'bert_readability_regression_results.csv'
