In [1]:
pip install textstat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import textstat
import pandas as pd
import torch
import numpy as np
import pickle
from datasets import Dataset
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, r2_score
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)

print("Step 1: Preparing the Readability Dataset")

corpus_path = "/home/sharmajidotdev/manish/msdataset/Texts-SeparatedByReadingLevel" 

if not os.path.exists(corpus_path):
    print(f"Error: The corpus directory '{corpus_path}' does not exist.")
    print("Please download the corpus ")
    exit()

all_documents = []
all_fkgl_scores = []
all_fre_scores = []
all_levels = []

print("Loading and processing documents from the corpus...")
for level_dir in os.listdir(corpus_path):
    level_path = os.path.join(corpus_path, level_dir)
    if os.path.isdir(level_path):
        for filename in os.listdir(level_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(level_path, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                    
                    if text.strip():
                        fkgl = textstat.flesch_kincaid_grade(text)
                        fre = textstat.flesch_reading_ease(text)
                        
                        all_documents.append(text)
                        all_fkgl_scores.append(fkgl)
                        all_fre_scores.append(fre)
                        all_levels.append(level_dir)

readability_dataset = Dataset.from_dict({
    'text': all_documents,
    'fkgl_score': all_fkgl_scores,
    'fre_score': all_fre_scores,
    'level': all_levels
})
    
print(f"Successfully processed {len(readability_dataset)} documents.")
print(f"Example document text: '{readability_dataset[0]['text'][:100]}...'")
print(f"Example FKGL score: {readability_dataset[0]['fkgl_score']:.2f}")

print("\nStep 1: Completed.")


  from .autonotebook import tqdm as notebook_tqdm


Step 1: Preparing the Readability Dataset
Loading and processing documents from the corpus...
Successfully processed 567 documents.
Example document text: 'Intermediate 
Example FKGL score: 11.94

Step 1: Completed.


In [3]:
print("\n Step 2: Fine-Tuning BERT for FKGL Scores ")

model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

MODEL_DIR_READABILITY = "/home/sharmajidotdev/manish/models/readability"
os.makedirs(MODEL_DIR_READABILITY, exist_ok=True)
os.makedirs(os.path.join(MODEL_DIR_READABILITY, "Fkgl"), exist_ok=True)

fine_tune_results_fkgl = {'mae': [], 'r2': []}

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

def tokenize_and_label_function(examples, label_column_name):
    tokenized_inputs = tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)
    tokenized_inputs['labels'] = torch.tensor(examples[label_column_name], dtype=torch.float32)
    return tokenized_inputs

def compute_metrics_regression(eval_pred):
    predictions, labels = eval_pred
    mae = mean_absolute_error(labels, predictions)
    r2 = r2_score(labels, predictions)
    return {"mae": mae, "r2": r2}



 Step 2: Fine-Tuning BERT for FKGL Scores 
Using device: cuda


In [4]:
for fold_idx, (train_val_indices, test_indices) in enumerate(kf.split(readability_dataset)):
    
    fkgl_checkpoint_path = os.path.join(MODEL_DIR_READABILITY, f"Fkgl/best_fold_{fold_idx}")
    
    if os.path.exists(fkgl_checkpoint_path):
        print(f"\n--- Fold {fold_idx + 1}/{N_SPLITS} already completed. Skipping. ---")
        continue
    
    print(f"\n Fine Tuning for Fold {fold_idx + 1}/{N_SPLITS} ")

    train_val_dataset = readability_dataset.select(train_val_indices)
    test_dataset = readability_dataset.select(test_indices)
    
    train_dataset, val_dataset = train_val_dataset.train_test_split(test_size=0.2).values()
    
    print(f"\n Fine Tuning for FKGL for Fold {fold_idx + 1} ")
    model_fkgl = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)

    tokenized_train_fkgl = train_dataset.map(lambda x: tokenize_and_label_function(x, 'fkgl_score'), batched=True)
    tokenized_val_fkgl = val_dataset.map(lambda x: tokenize_and_label_function(x, 'fkgl_score'), batched=True)
    tokenized_test_fkgl = test_dataset.map(lambda x: tokenize_and_label_function(x, 'fkgl_score'), batched=True)
    
    training_args_fkgl = TrainingArguments(
        output_dir=os.path.join(MODEL_DIR_READABILITY, f"Fkgl/fold_{fold_idx}"),
        run_name=f"fkgl-fold-{fold_idx}",
        num_train_epochs=3, per_device_train_batch_size=8, per_device_eval_batch_size=8,logging_steps=10,
        weight_decay=0.01, eval_strategy="epoch", logging_dir='./logs',
        save_strategy="epoch", load_best_model_at_end=True,
    )

    trainer_fkgl = Trainer(
        model=model_fkgl, args=training_args_fkgl, train_dataset=tokenized_train_fkgl,
        eval_dataset=tokenized_val_fkgl, compute_metrics=compute_metrics_regression, tokenizer=tokenizer,
    )
    
    trainer_fkgl.train()
    eval_results_fkgl = trainer_fkgl.evaluate(tokenized_test_fkgl)
    fine_tune_results_fkgl['mae'].append(eval_results_fkgl['eval_mae'])
    fine_tune_results_fkgl['r2'].append(eval_results_fkgl['eval_r2'])
    print(f"Fold {fold_idx+1} FKGL Test Results: {eval_results_fkgl}")
    model_fkgl.save_pretrained(os.path.join(MODEL_DIR_READABILITY, f"Fkgl/best_fold_{fold_idx}"))



--- Fold 1/5 already completed. Skipping. ---

 Fine Tuning for Fold 2/5 

 Fine Tuning for FKGL for Fold 2 


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 362/362 [00:00<00:00, 1612.02 examples/s]
Map: 100%|██████████| 91/91 [00:00<00:00, 1540.72 examples/s]
Map: 100%|██████████| 114/114 [00:00<00:00, 1235.22 examples/s]
  trainer_fkgl = Trainer(


Epoch,Training Loss,Validation Loss,Mae,R2
1,21.4669,11.020518,2.681038,-1.398791
2,4.7787,4.518609,1.778632,0.016453
3,4.0357,3.883676,1.657008,0.154656


Fold 2 FKGL Test Results: {'eval_loss': 3.4213476181030273, 'eval_mae': 1.5118625164031982, 'eval_r2': 0.13016116619110107, 'eval_runtime': 1.4451, 'eval_samples_per_second': 78.888, 'eval_steps_per_second': 10.38, 'epoch': 3.0}

 Fine Tuning for Fold 3/5 

 Fine Tuning for FKGL for Fold 3 


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 363/363 [00:00<00:00, 1751.22 examples/s]
Map: 100%|██████████| 91/91 [00:00<00:00, 1610.03 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 1447.24 examples/s]
  trainer_fkgl = Trainer(


Epoch,Training Loss,Validation Loss,Mae,R2
1,20.2332,11.597965,2.842669,-1.699969
2,5.0548,4.232935,1.680344,0.014586
3,3.0031,2.687883,1.274172,0.37427


Fold 3 FKGL Test Results: {'eval_loss': 2.9014599323272705, 'eval_mae': 1.3084485530853271, 'eval_r2': 0.3188062906265259, 'eval_runtime': 1.1583, 'eval_samples_per_second': 97.556, 'eval_steps_per_second': 12.95, 'epoch': 3.0}

 Fine Tuning for Fold 4/5 

 Fine Tuning for FKGL for Fold 4 


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 363/363 [00:00<00:00, 2284.77 examples/s]
Map: 100%|██████████| 91/91 [00:00<00:00, 2053.23 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 2055.48 examples/s]
  trainer_fkgl = Trainer(


Epoch,Training Loss,Validation Loss,Mae,R2
1,22.7978,12.535304,2.949249,-1.656501
2,4.93,4.679214,1.725635,0.008374
3,2.8349,3.645015,1.489598,0.227543


Fold 4 FKGL Test Results: {'eval_loss': 3.1573281288146973, 'eval_mae': 1.406311273574829, 'eval_r2': 0.2824860215187073, 'eval_runtime': 1.1876, 'eval_samples_per_second': 95.147, 'eval_steps_per_second': 12.63, 'epoch': 3.0}

 Fine Tuning for Fold 5/5 

 Fine Tuning for FKGL for Fold 5 


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 363/363 [00:00<00:00, 2050.81 examples/s]
Map: 100%|██████████| 91/91 [00:00<00:00, 2162.11 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 2200.86 examples/s]
  trainer_fkgl = Trainer(


Epoch,Training Loss,Validation Loss,Mae,R2
1,19.6739,12.253371,2.86046,-1.382176
2,3.9762,4.997544,1.841723,0.028428
3,3.4108,3.350609,1.49052,0.348609


Fold 5 FKGL Test Results: {'eval_loss': 2.42488431930542, 'eval_mae': 1.1937713623046875, 'eval_r2': 0.38742566108703613, 'eval_runtime': 1.1683, 'eval_samples_per_second': 96.72, 'eval_steps_per_second': 12.839, 'epoch': 3.0}


In [6]:
print("\n FKGL Fine Tuning Summary ")
avg_fkgl_mae = np.mean(fine_tune_results_fkgl['mae'])
avg_fkgl_r2 = np.mean(fine_tune_results_fkgl['r2'])
print(f"Average Fine Tuning Performance (FKGL) across 5 folds: MAE={avg_fkgl_mae:.4f}, R2={avg_fkgl_r2:.4f}")



 FKGL Fine Tuning Summary 
Average Fine Tuning Performance (FKGL) across 5 folds: MAE=1.3551, R2=0.2797
