**Installing and importing the necessary packages**

In [None]:
pip install rouge

In [1]:
pip show rouge

Name: rouge
Version: 1.0.1
Summary: Full Python ROUGE Score Implementation (not a wrapper)
Home-page: http://github.com/pltrdy/rouge
Author: pltrdy
Author-email: pltrdy@gmail.com
License: LICENCE.txt
Location: /opt/conda/lib/python3.10/site-packages
Requires: six
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install bert-score

In [2]:
pip show bert-score

Name: bert-score
Version: 0.3.13
Summary: PyTorch implementation of BERT score
Home-page: https://github.com/Tiiiger/bert_score
Author: Tianyi Zhang*, Varsha Kishore*, Felix Wu*, Kilian Q. Weinberger, and Yoav Artzi
Author-email: tzhang@asapp.com
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: matplotlib, numpy, packaging, pandas, requests, torch, tqdm, transformers
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install textstat

In [3]:
pip show textstat

Name: textstat
Version: 0.7.3
Summary: Calculate statistical features from text
Home-page: https://github.com/shivam5992/textstat
Author: Shivam Bansal, Chaitanya Aggarwal
Author-email: shivam5992@gmail.com
License: MIT
Location: /opt/conda/lib/python3.10/site-packages
Requires: pyphen
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import glob
from pathlib import Path
from transformers import BartTokenizer, BartForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from torch.utils.data import Dataset, DataLoader
import shutil
import csv
from rouge import Rouge
import bert_score
from textstat import gunning_fog
import zipfile

import warnings
warnings.filterwarnings("ignore", message="TypedStorage is deprecated", category=UserWarning)
warnings.filterwarnings("ignore", message="Some non-default generation parameters are set in the model config", category=UserWarning)
warnings.filterwarnings("ignore", message="huggingface/tokenizers: The current process just got forked", category=UserWarning)

# warnings.resetwarnings()

**GPU**

In [5]:
# Move tokenizer and model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Available Device is",device)

Available Device is cuda


**Loading the models**

In [6]:
# Load Legal Pegasus tokenizer and model for summarization
legal_pegasus_tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
legal_pegasus_model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
legal_pegasus_model = legal_pegasus_model.to(device)

# Load BART tokenizer
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
# Fine-tune BART model
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
bart_model.to(device)

print(legal_pegasus_model)
print("\n\n")
print(bart_model)

tokenizer_config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

**Training**

In [7]:
# Define a dataset class
class TextSimplificationDataset(Dataset):
    def __init__(self, summaries_folder, simplify_folder, tokenizer):
        self.summaries_folder = summaries_folder
        self.simplify = self.load_texts(simplify_folder)
        self.tokenizer = tokenizer
    
    def load_texts(self, folder):
        texts = []
        folder_path = Path(folder)
        for file_path in folder_path.glob("*.txt"):
            with open(file_path, "r", encoding="utf-8") as file:
                texts.append(file.read())
        return texts
    
    def __len__(self):
        return len(self.summaries_folder)
    
    def __getitem__(self, idx):
        source_text = self.load_summary(idx)
        target_text = self.simplify[idx]
        encoding = self.tokenizer(source_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
        labels = self.tokenizer(target_text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")["input_ids"]
        return {k: v.squeeze(0) for k, v in encoding.items()}, labels.squeeze(0)
    
    def load_summary(self, idx):
        file_path = self.summaries_folder[idx]
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()


# Generate summaries for Judgement_Train folder
judgement_folder = "/kaggle/input/mildsum-split-2/Judgement_Train"
judgement_files = glob.glob(os.path.join(judgement_folder, "*.txt"))

summaries_folder = "/kaggle/working/Summaries_Train"
os.makedirs(summaries_folder, exist_ok=True)

# Set the model to evaluation mode for inference
legal_pegasus_model.eval()

for file_path in judgement_files:
    file_name = os.path.basename(file_path)
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
        with torch.no_grad():
            # Encode the input text
            inputs = legal_pegasus_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
            # Generate summary on GPU
            summary = legal_pegasus_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=100, num_beams=4, early_stopping=True)
            # Save the summary with the same name as the input file
            with open(os.path.join(summaries_folder, file_name), "w", encoding="utf-8") as f:
                f.write(legal_pegasus_tokenizer.decode(summary[0], skip_special_tokens=True))



print("Training Started ...")
                
# Move the input back to CPU
inputs = inputs.to("cpu")


# Define paths to your dataset folders
simplify_folder = "/kaggle/input/mildsum-split-2/Simplify_Train"

# Define dataset and dataloader
dataset = TextSimplificationDataset(glob.glob(os.path.join(summaries_folder, "*.txt")), simplify_folder, bart_tokenizer)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

optimizer = torch.optim.AdamW(bart_model.parameters(), lr=5e-5)

# Train the model
bart_model.train()
for epoch in range(20):  # Adjust number of epochs as needed
    for batch in dataloader:
        inputs = {key: value.to(device) for key, value in batch[0].items()}
        labels = batch[1].to(device)

        outputs = bart_model(**inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"|\t Epoch {epoch+1} \t | \tLoss = {loss.item()} \t|")
    print("-" * 65)

# Save the fine-tuned BART model
output_dir = "/kaggle/working/fine_tuned_bart_model"
os.makedirs(output_dir, exist_ok=True)
bart_tokenizer.save_pretrained(output_dir)
bart_model.save_pretrained(output_dir)
print("\nFine-tuned BART model saved successfully.\n")

labels = labels.to("cpu")

shutil.rmtree(summaries_folder)
print("\nTraining Completed.\n")

|	 Epoch 1 	 | 	Loss = 2.6931285858154297 	|
-----------------------------------------------------------------
|	 Epoch 2 	 | 	Loss = 2.2824175357818604 	|
-----------------------------------------------------------------
|	 Epoch 3 	 | 	Loss = 1.8398253917694092 	|
-----------------------------------------------------------------
|	 Epoch 4 	 | 	Loss = 1.863691806793213 	|
-----------------------------------------------------------------
|	 Epoch 5 	 | 	Loss = 1.5864046812057495 	|
-----------------------------------------------------------------
|	 Epoch 6 	 | 	Loss = 1.4915056228637695 	|
-----------------------------------------------------------------
|	 Epoch 7 	 | 	Loss = 1.1613913774490356 	|
-----------------------------------------------------------------
|	 Epoch 8 	 | 	Loss = 0.9879866242408752 	|
-----------------------------------------------------------------
|	 Epoch 9 	 | 	Loss = 0.8358415961265564 	|
-----------------------------------------------------------------
|	

Non-default generation parameters: {'max_length': 142, 'min_length': 56, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


|	 Epoch 20 	 | 	Loss = 0.09266327321529388 	|
-----------------------------------------------------------------

Fine-tuned BART model saved successfully.


Training Completed.



**Testing**

In [8]:
# Define paths to test dataset folders
judgement_test_folder = "/kaggle/input/mildsum-split-2/Judgement_Test"
simplify_test_folder = "/kaggle/input/mildsum-split-2/Simplify_Test"

# Generate summaries for Judgement_Test folder using Legal Pegasus model
test_summaries_folder = "/kaggle/working/Summaries_Test"
os.makedirs(test_summaries_folder, exist_ok=True)

# Set the Legal Pegasus model to evaluation mode for inference
legal_pegasus_model.eval()

for file_path in glob.glob(os.path.join(judgement_test_folder, "*.txt")):
    file_name = os.path.basename(file_path)
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
        with torch.no_grad():
            # Encode the input text
            inputs = legal_pegasus_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
            # Generate summary on GPU
            summary = legal_pegasus_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=100, num_beams=4, early_stopping=True)
            # Save the summary with the same name as the input file
            with open(os.path.join(test_summaries_folder, file_name), "w", encoding="utf-8") as f:
                f.write(legal_pegasus_tokenizer.decode(summary[0], skip_special_tokens=True))
                
                
# Define dataset and data loader for testing
test_dataset = TextSimplificationDataset(glob.glob(os.path.join(test_summaries_folder, "*.txt")), simplify_test_folder, bart_tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Evaluate the fine-tuned BART model on the test dataset
bart_model.eval()
total_loss = 0.0
num_samples = 0

# Calculatimg loss
for batch in test_dataloader:
    
    inputs = {key: value.to(device) for key, value in batch[0].items()}
    labels = batch[1].to(device)

    with torch.no_grad():
        
        outputs = bart_model(**inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item() * labels.size(0)
        num_samples += labels.size(0)

test_summary_files = glob.glob(os.path.join(test_summaries_folder, "*.txt"))

generated_simplified_folder = "/kaggle/working/Generated_Simplified"
os.makedirs(generated_simplified_folder, exist_ok=True)


# Saving the generated summaries
for file_path in test_summary_files:
    file_name = os.path.basename(file_path)
    with open(file_path, "r", encoding="utf-8") as file:
        text = file.read()
        with torch.no_grad():
            # Encode the input text
            inputs = bart_tokenizer(text, return_tensors="pt", max_length=512, truncation=True).to(device)
            # Generate summary on GPU
            simplified = bart_model.generate(inputs.input_ids, attention_mask=inputs.attention_mask, max_length=1024, num_beams=4, early_stopping=True)
            # Save the summary with the same name as the input file
            with open(os.path.join(generated_simplified_folder, file_name), "w", encoding="utf-8") as f:
                f.write(bart_tokenizer.decode(simplified[0], skip_special_tokens=True))

# Delete the test summaries folder
shutil.rmtree(test_summaries_folder)
                
average_loss = total_loss / num_samples
print(f"Average Loss on Test Dataset: {average_loss}")

Average Loss on Test Dataset: 4.346084743738174


**Metrics Calculation**

In [9]:
# Function to calculate ROUGE scores
def calculate_rouge(reference, candidate):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)
    return scores[0]['rouge-1']['f'], scores[0]['rouge-2']['f'], scores[0]['rouge-l']['f']

# Function to calculate BERT score
def calculate_bert_score(reference, candidate):
    _, _, f1 = bert_score.score([candidate], [reference], lang='en', model_type='bert-base-uncased', rescale_with_baseline=True)
    return f1.item()

# Paths to the folders
test_folder = "/kaggle/input/mildsum-split-2/Simplify_Test"
output_folder = "/kaggle/working/Generated_Simplified"

# Get list of files in the folders
test_files = sorted(os.listdir(test_folder))
output_files = sorted(os.listdir(output_folder))

# File path for scores.csv
csv_file_path = '/kaggle/working/rouge_scores.csv'

# Initialize CSV writer
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BERT Score'])

    # Loop through files and calculate scores
    for i, (test_file, output_file) in enumerate(zip(test_files, output_files)):
        # Read content of files
        with open(os.path.join(test_folder, test_file), 'r', encoding='utf-8') as f:
            test_content = f.read()
        with open(os.path.join(output_folder, output_file), 'r', encoding='utf-8') as f:
            output_content = f.read()

        # Calculate scores
        rouge_1, rouge_2, rouge_l = calculate_rouge(test_content, output_content)
        bert_score_val = calculate_bert_score(test_content, output_content)

        # Write scores to CSV
        writer.writerow([i+1, rouge_1, rouge_2, rouge_l, bert_score_val])

print("Scores calculated and saved to scores.csv.")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Scores calculated and saved to scores.csv.


In [10]:
# Function to calculate Gunning Fox Index (GFI) score
def calculate_gfi(text):
    return gunning_fog(text)

# Paths to the folders
input_folder = "/kaggle/working/Generated_Simplified"

# Get list of files in the folder
input_files = sorted(os.listdir(input_folder))

# Initialize CSV writer
with open('gfi_scores.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ID', 'GFI Score'])

    # Loop through files and calculate GFI scores
    for i, input_file in enumerate(input_files):
        # Read content of file
        with open(os.path.join(input_folder, input_file), 'r', encoding='utf-8') as f:
            content = f.read()

        # Calculate GFI score
        gfi_score = calculate_gfi(content)

        # Write scores to CSV
        writer.writerow([i+1, gfi_score])

print("GFI scores calculated and saved to gfi_scores.csv.")

GFI scores calculated and saved to gfi_scores.csv.


In [13]:
directory_to_zip = '/kaggle/working/Generated_Simplified'
output_zip_file = '/kaggle/working/Generated_Simplified.zip'

with zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk(directory_to_zip):
        for file in files:
            zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), directory_to_zip))

**Remove directories (if needed)**

In [11]:
# !rm -r /kaggle/working/fine_tuned_bart_model
# !rm -r /kaggle/working/Generated_Simplified
# !rm -r /kaggle/working/rouge_scores.csv
# !rm -r /kaggle/working/gfi_scores.csv
# !rm -r /kaggle/working/Summaries_Train
# !rm -r /kaggle/working/Summaries_Test