In [1]:
pip install textstat

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import textstat
import pandas as pd
import torch
import numpy as np
import pickle
from datasets import Dataset
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

print(" Step 1: Preparing the Readability Dataset ")

corpus_path = "/home/sharmajidotdev/manish/msdataset/Texts-SeparatedByReadingLevel" 

if not os.path.exists(corpus_path):
    print(f"Error: The corpus directory '{corpus_path}' does not exist.")
    print("Please download the corpus ")
    exit()

all_documents = []
all_fkgl_scores = []
all_fre_scores = []
all_levels = []

print("Loading and processing documents from the corpus...")
for level_dir in os.listdir(corpus_path):
    level_path = os.path.join(corpus_path, level_dir)
    if os.path.isdir(level_path):
        for filename in os.listdir(level_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(level_path, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
                    if text.strip():
                        fkgl = textstat.flesch_kincaid_grade(text)
                        fre = textstat.flesch_reading_ease(text)
                        
                        all_documents.append(text)
                        all_fkgl_scores.append(fkgl)
                        all_fre_scores.append(fre)
                        all_levels.append(level_dir)

readability_dataset = Dataset.from_dict({
    'text': all_documents,
    'fkgl_score': all_fkgl_scores,
    'fre_score': all_fre_scores,
    'level': all_levels
})
    
print(f"Successfully processed {len(readability_dataset)} documents.")
print(f"Example document text: {readability_dataset[0]['text'][:100]}...")
print(f"Example FRE score: {readability_dataset[0]['fre_score']:.2f}")

print("\nStep 1: Completed.")

 Step 1: Preparing the Readability Dataset 
Loading and processing documents from the corpus...
Successfully processed 567 documents.
Example document text: Intermediate 
Example FRE score: 48.10

Step 1: Completed.


In [5]:
print("\n Step 2: Fine-Tuning BERT for FRE Scores ")

model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

MODEL_DIR_READABILITY = "/home/sharmajidotdev/manish/models/readability"
os.makedirs(MODEL_DIR_READABILITY, exist_ok=True)
os.makedirs(os.path.join(MODEL_DIR_READABILITY, "Fre"), exist_ok=True)

fine_tune_results_fre = {'mae': [], 'r2': []}

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

def tokenize_and_label_function(examples, label_column_name):
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)
    tokenized_inputs['labels'] = torch.tensor(examples[label_column_name], dtype=torch.float32)
    return tokenized_inputs

def compute_metrics_regression(eval_pred):
    predictions, labels = eval_pred
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    return {"mae": mae, "r2": r2}


 Step 2: Fine-Tuning BERT for FRE Scores 
Using device: cuda


In [7]:
for fold_idx, (train_val_indices, test_indices) in enumerate(kf.split(readability_dataset)):
    
    fre_checkpoint_path = os.path.join(MODEL_DIR_READABILITY, f"Fre/best_fold_{fold_idx}")
    
    if os.path.exists(fre_checkpoint_path):
        print(f"\n--- Fold {fold_idx + 1}/{N_SPLITS} already completed. Skipping. ---")
        continue
    
    print(f"\n--- Fine-Tuning for Fold {fold_idx + 1}/{N_SPLITS} ---")

    train_val_dataset = readability_dataset.select(train_val_indices)
    test_dataset = readability_dataset.select(test_indices)
    
    train_dataset, val_dataset = train_val_dataset.train_test_split(test_size=0.2).values()
    
    print(f"\n Fine Tuning for FRE for Fold {fold_idx + 1} ---")
    model_fre = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)

    tokenized_train_fre = train_dataset.map(lambda x: tokenize_and_label_function(x, 'fre_score'), batched=True)
    tokenized_val_fre = val_dataset.map(lambda x: tokenize_and_label_function(x, 'fre_score'), batched=True)
    tokenized_test_fre = test_dataset.map(lambda x: tokenize_and_label_function(x, 'fre_score'), batched=True)

    training_args_fre = TrainingArguments(
        output_dir=os.path.join(MODEL_DIR_READABILITY, f"fre/fold_{fold_idx}"),
        run_name=f"fre-fold-{fold_idx}",
        num_train_epochs=3, per_device_train_batch_size=8, per_device_eval_batch_size=8,
        weight_decay=0.01, eval_strategy="epoch", logging_dir='./logs',logging_steps=10,
        save_strategy="epoch", load_best_model_at_end=True,
    )
    
    trainer_fre = Trainer(
        model=model_fre, args=training_args_fre, train_dataset=tokenized_train_fre,
        eval_dataset=tokenized_val_fre, compute_metrics=compute_metrics_regression, tokenizer=tokenizer,
    )
    
    trainer_fre.train()
    eval_results_fre = trainer_fre.evaluate(tokenized_test_fre)
    fine_tune_results_fre['mae'].append(eval_results_fre['eval_mae'])
    fine_tune_results_fre['r2'].append(eval_results_fre['eval_r2'])
    print(f"Fold {fold_idx+1} FRE Test Results: {eval_results_fre}")
    model_fre.save_pretrained(os.path.join(MODEL_DIR_READABILITY, f"Fre/best_fold_{fold_idx}"))



--- Fold 1/5 already completed. Skipping. ---

--- Fold 2/5 already completed. Skipping. ---

--- Fold 3/5 already completed. Skipping. ---

--- Fold 4/5 already completed. Skipping. ---

--- Fold 5/5 already completed. Skipping. ---


In [8]:
print("\n FRE Fine Tuning Summary ")
avg_fre_mae = np.mean(fine_tune_results_fre['mae'])
avg_fre_r2 = np.mean(fine_tune_results_fre['r2'])
print(f"Average Fine Tuning Performance (FRE) across 5 folds: MAE={avg_fre_mae:.4f}, R2={avg_fre_r2:.4f}")


 FRE Fine Tuning Summary 
Average Fine Tuning Performance (FRE) across 5 folds: MAE=46.1669, R2=-22.9090
