In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm, tqdm_notebook
from glob import glob

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Root folder of the article files; the loading cell treats each entry
# directly under it as one news category.
root_dir = "/kaggle/input/bbc-news-summary/BBC News Summary/News Articles"

# List the entries directly under the root
print(os.listdir(root_dir))

In [None]:
# Load every article together with ITS OWN summary.
#
# Articles and summaries are paired by file name (<category>/<name>.txt under
# each root) instead of by the position in two independent glob() results:
# glob() makes no ordering guarantee, so positional pairing can silently attach
# a summary to the wrong article.
summaries_root = "/kaggle/input/bbc-news-summary/BBC News Summary/Summaries"

# Category names = sub-directories of the articles root (sorted for reproducibility)
unique_category = sorted(
    name for name in os.listdir(root_dir)
    if os.path.isdir(os.path.join(root_dir, name))
)

# Parallel lists: entry i of each list describes the same document
news_category_list = []
news_article_list = []
news_summaries_list = []

# Iterate through unique category
for category in tqdm(unique_category, colour='yellow'):
    # Absolute paths of this category's article and summary directories
    abs_category_path = root_dir + "/" + category
    abs_summary_path = summaries_root + "/" + category

    # Sorted so that the row order is identical on every run
    for file_path in sorted(glob(abs_category_path + "/*.txt")):
        summary_path = os.path.join(abs_summary_path, os.path.basename(file_path))
        if not os.path.isfile(summary_path):
            # Continuing would leave the three lists with different lengths
            raise FileNotFoundError(
                f"No summary found for article {file_path!r} (expected {summary_path!r})"
            )

        # Context managers close the files even if read() raises
        with open(file_path, 'r', encoding='latin-1') as f:
            news_article = f.read()
        with open(summary_path, 'r', encoding='utf-8') as f:
            news_summary = f.read()

        # Append article, summary and category together to keep them aligned
        news_article_list.append(news_article)
        news_summaries_list.append(news_summary)
        news_category_list.append(category)

# Print length of news articles and categories
print("Total Articles: ", len(news_article_list))
print("Total Summaries: ", len(news_summaries_list))
print("Total Categories: ", len(news_category_list), end='\n\n')

# Print a sample article
print("Sample Article: ")
print(news_article_list[0])

In [None]:
# Dictionary to create a dataframe
df_dict = {"news": news_article_list, "summaries":news_summaries_list, "labels": news_category_list}

# Convert to dataframe
df = pd.DataFrame(df_dict)


def remove_ns(text):
    """Return `text` with every newline character replaced by a single space."""
    return text.replace("\n", " ")


# Normalise the text columns. The newline clean-up is applied with a
# column-level map: the previous `df.news[i] = ...` loop was chained assignment,
# which pandas does not guarantee to write back into `df`.
df['news'] = df['news'].astype(str).map(remove_ns)
df['summaries'] = df['summaries'].astype(str)

# Save the df into system
df.to_csv('/kaggle/working/origin_dataframe.csv', index=False)

# Show the dataframe (must be the last expression of the cell to be displayed)
df.head()

In [None]:
# Reload the table written to disk by the previous cell and preview its first rows
df = pd.read_csv('/kaggle/working/origin_dataframe.csv')
df.head()
#df.summaries[3]
#df.shape

# Task B

## Prepare Dataset for training the Model A&B

In [None]:
from datasets import Dataset

# Start again from the table saved on disk
df = pd.read_csv('/kaggle/working/origin_dataframe.csv')
df_business=df[df['labels'] == 'business']
# Hold out 10% of the business rows (fixed seed) as the test set
ten_percent = int(0.1 * len(df_business))
test_data = df_business.sample(n=ten_percent, random_state=42)

# Remove the held-out rows from both pools so neither model trains on them
df = df.drop(test_data.index)
df_business = df_business.drop(test_data.index)
# Model B data: the remaining business rows only
modelB_dataset=Dataset.from_pandas(df_business)
#modelA_dataset=Dataset.from_pandas(df_business)

To ensure that model A is trained on no more business articles than model B, a random 20% of the remaining business articles is removed from model A's training data.

In [None]:
# Business rows currently present in df
business_data = df.loc[df['labels'] == 'business']

# Number of rows in the random 20% slice to discard
twenty_percent = int(len(business_data) * 0.2)

# A fixed seed keeps the discarded slice identical on every run
indices_to_remove = business_data.sample(n=twenty_percent, random_state=42).index

df = df.drop(index=indices_to_remove)
print(df[df['labels'] == 'business'].shape)

In [None]:
# Model A trains on what is left in df; test_data is the held-out business sample
modelA_dataset=Dataset.from_pandas(df)
test_dataset=Dataset.from_pandas(test_data)

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

#configuration = BartConfig()
# Load the pretrained FLAN-T5 base checkpoint and its tokenizer
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Tokenize one sample sentence with the loaded tokenizer
inputs = tokenizer("I loved reading the Hunger Games!")

In [None]:
# Token budgets: inputs/targets are truncated and padded to exactly these lengths
max_input_length = 700
max_target_length = 500

def preprocess_function(examples):
    """Tokenize the article(s) and reference summary/summaries for seq2seq training.

    Args:
        examples: a mapping with a "news" and a "summaries" entry; either a
            single example (strings, as produced by ``map(batched=False)``) or
            a batch (lists of strings).

    Returns:
        The tokenizer output for the articles with an added "labels" entry
        holding the summary token ids, where padding positions are set to -100
        so that the loss ignores them.
    """
    model_inputs = tokenizer(
        examples["news"], max_length=max_input_length, padding="max_length",
        truncation=True
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager
    labels = tokenizer(
        text_target=examples["summaries"], max_length=max_target_length,
        padding="max_length", truncation=True
    )

    # Targets are padded to max_target_length; without masking, the model would
    # be trained to emit pad tokens. -100 is the label value the loss skips.
    pad_id = tokenizer.pad_token_id
    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one id list per example
        label_ids = [[-100 if tok == pad_id else tok for tok in seq] for seq in label_ids]
    else:
        # Single example: flat id list
        label_ids = [-100 if tok == pad_id else tok for tok in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# Tokenize each dataset one example at a time (batched=False)
tokenized_modelA_datasets = modelA_dataset.map(preprocess_function, batched=False)
tokenized_modelB_datasets = modelB_dataset.map(preprocess_function, batched=False)
tokenized_test_datasets = test_dataset.map(preprocess_function, batched=False)

In [None]:
from datasets.dataset_dict import DatasetDict

# Carve a validation split out of each training pool. The seed makes the
# shuffle reproducible, matching the fixed random_state used for the test split.
tokenized_modelA_datasets_dict=tokenized_modelA_datasets.train_test_split(test_size=0.1, shuffle = True, seed=42)
tokenized_modelB_datasets_dict=tokenized_modelB_datasets.train_test_split(test_size=0.05, shuffle = True, seed=42)

# Rename the auto-generated 'test' part to 'validation' and attach the shared
# held-out test set to both dictionaries.
tokenized_modelA_datasets_dict = DatasetDict({
    'train': tokenized_modelA_datasets_dict['train'],
    'validation': tokenized_modelA_datasets_dict['test'],
    'test': tokenized_test_datasets})
tokenized_modelB_datasets_dict = DatasetDict({
    'train': tokenized_modelB_datasets_dict['train'],
    'validation': tokenized_modelB_datasets_dict['test'],
    'test': tokenized_test_datasets})

print(tokenized_modelA_datasets_dict)
print(tokenized_modelB_datasets_dict)

## Save the dataset for future training

In [None]:
# Persist both tokenized DatasetDicts; the "Read datasetDict" cell loads them back
tokenized_modelA_datasets_dict.save_to_disk("/kaggle/working/dataA") 
tokenized_modelB_datasets_dict.save_to_disk("/kaggle/working/dataB") 

## Read datasetDict

In [1]:
# Self-contained restart point: re-import everything, reload the base model and
# tokenizer, and read the tokenized datasets back from disk.
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm, tqdm_notebook
from glob import glob
import os
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

#configuration = BartConfig()
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

from datasets import load_from_disk
# Paths match the ones used by the save_to_disk cell
tokenized_modelA_datasets_dict = load_from_disk("/kaggle/working/dataA")
tokenized_modelB_datasets_dict = load_from_disk("/kaggle/working/dataB")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Display the splits and row counts of model A's DatasetDict
tokenized_modelA_datasets_dict 

DatasetDict({
    train: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1874
    })
    validation: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 209
    })
    test: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51
    })
})

In [10]:
# Display the splits and row counts of model B's DatasetDict
tokenized_modelB_datasets_dict

DatasetDict({
    train: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 436
    })
    validation: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 23
    })
    test: Dataset({
        features: ['news', 'summaries', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51
    })
})

## Train a specific model B (business data)
### Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize 

# NOTE(review): df_business is created in the dataset-preparation cell; it is
# not defined if the kernel is restarted from the "Read datasetDict" cell.
# Bar chart of how many rows each label value has in df_business
ax = df_business['labels'].value_counts().plot(kind='bar', figsize=(10,7))

#for p in ax.patches:
#    ax.annotate("{:.1f}".format(p.get_height()), (p.get_x(), p.get_height()+5))
plt.title("News categories count")
plt.show()

In [None]:
# Number of word_tokenize tokens in each business article
token_counts = [len(word_tokenize(text)) for text in df_business.news]

# Histogram (50 bins, with KDE curve) of the article lengths
plt.figure(figsize=(10, 6))
sns.histplot(token_counts, bins=50, kde=True, color='blue')
plt.title('Distribution of Token Counts in Texts')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Number of word_tokenize tokens in each business summary
token_counts_summary = [len(word_tokenize(text_sum)) for text_sum in df_business.summaries]

# Histogram (50 bins, with KDE curve) of the summary lengths
plt.figure(figsize=(10, 6))
sns.histplot(token_counts_summary, bins=50, kde=True, color='blue')
plt.title('Distribution of Token Counts in Summaries')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.show()

# Model Selection

Because larger language models were likely pre-trained on data that includes our dataset, we chose to use a smaller language model.

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

#configuration = BartConfig()
# Fresh copy of the pretrained FLAN-T5 base checkpoint; the LoRA setup cell
# for model B wraps this `model` object.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Tokenize one sample sentence with the loaded tokenizer
inputs = tokenizer("I loved reading the Hunger Games!")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Fine-tune the model B

In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from datasets import load_dataset
import time
import torch.optim as optim
#import evaluate



In [4]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where ``param`` offers ``numel()`` and ``requires_grad``.

    Returns:
        A three-line string: the trainable parameter count, the total parameter
        count, and the trainable share as a percentage with two decimals.
    """
    # (element count, is trainable) for every parameter
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]

    total = sum(count for count, _ in sizes)
    trainable = sum(count for count, needs_grad in sizes if needs_grad)

    report_lines = [
        f"trainable model parameters: {trainable}",
        f"all model parameters: {total}",
        f"percentage of trainable model parameters: {100 * trainable / total:.2f}%",
    ]
    return "\n".join(report_lines)

print(print_number_of_trainable_model_parameters(model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


## Setup the PEFT/LoRA model for Fine-Tuning

In [None]:
# peft_model_B does not exist yet at this point of a top-to-bottom run: it is
# created by get_peft_model() in the next cell, which also prints its
# parameter counts. Printing it here raised NameError on a fresh kernel.

In [7]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for model B (the business-only model)
lora_config_B = LoraConfig(
    r=8, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

#Add LoRA adapter layers/parameters to the original LLM to be trained.
peft_model_B = get_peft_model(model, 
                            lora_config_B)

# Report how many parameters are trainable after adding the adapter
print(print_number_of_trainable_model_parameters(peft_model_B))

trainable model parameters: 884736
all model parameters: 248462592
percentage of trainable model parameters: 0.36%


## Train PEFT Adapter B

In [None]:
# Timestamped output directory so repeated runs do not share a folder
output_dir = f'/kaggle/working/peft-dialogue-summary-training-{str(int(time.time()))}'

# Training hyper-parameters for adapter B
peft_training_args_B = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=10,
    save_steps=10000,
    logging_steps=100
)

# No eval_dataset is passed: the trainer only sees model B's training split
peft_trainer_B = Trainer(
    model=peft_model_B,
    args=peft_training_args_B,
    train_dataset=tokenized_modelB_datasets_dict["train"],
)

In [None]:
from safetensors.torch import load_model, save_model
# Fine-tune adapter B
peft_trainer_B.train()

# Directory the adapter-reload cell reads from
peft_model_path_B="/kaggle/working/peft-dialogue-summary-checkpoint-local-B"

# Save the trained model and the tokenizer side by side
peft_trainer_B.model.save_pretrained(peft_model_path_B)
tokenizer.save_pretrained(peft_model_path_B)

In [None]:
from peft import PeftModel, PeftConfig

# Fresh base model for the saved adapter B to be attached to
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# The reloaded model is only used for evaluation, so it is loaded with
# is_trainable=False and put in eval mode: otherwise the LoRA dropout layers
# stay active and generation becomes non-deterministic.
peft_model_B = PeftModel.from_pretrained(model, 
                                       '/kaggle/working/peft-dialogue-summary-checkpoint-local-B', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)
peft_model_B.eval()

# Unmodified base model (no adapter) kept for comparison, plus the tokenizer
peft_model_base = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

In [None]:
# Parameter counts of the reloaded adapter model B and of the plain base model
print(print_number_of_trainable_model_parameters(peft_model_B))
print(print_number_of_trainable_model_parameters(peft_model_base))

## Evaluate the Model Quantitatively (with ROUGE Metric)

In [None]:
#human_baseline_summaries = train_test_val_data['validation']['summaries']

# Generate a summary with model B for every example of its validation split.
validation_split_B = tokenized_modelB_datasets_dict["validation"]
human_baseline_summaries = validation_split_B['summaries']

peft_model_B_summaries = []

# Built once instead of once per example
generation_config_B = GenerationConfig(max_new_tokens=200, num_beams=1)
# Inputs must live on the same device as the model
device_B = next(peft_model_B.parameters()).device

# Iterating the split yields one row at a time, so the whole column is not
# re-read on every iteration.
for example in validation_split_B:
    # input_ids and attention_mask are separate (1, seq_len) tensors. Stacking
    # them into a single tensor made generate() treat the mask as a second
    # input sequence and run without an explicit attention mask.
    input_ids = torch.tensor([example["input_ids"]], device=device_B)
    attention_mask = torch.tensor([example["attention_mask"]], device=device_B)

    peft_model_B_outputs = peft_model_B.generate(input_ids=input_ids,
                                                 attention_mask=attention_mask,
                                                 generation_config=generation_config_B)
    peft_model_B_text_output = tokenizer.decode(peft_model_B_outputs[0],
                                              skip_special_tokens=True)

    peft_model_B_summaries.append(peft_model_B_text_output)

In [None]:
import evaluate

rouge = evaluate.load('rouge')

# Score model B's generated summaries against the reference summaries
# (references are sliced to the number of generated summaries)
peft_model_B_results = rouge.compute(
    predictions=peft_model_B_summaries,
    references=human_baseline_summaries[0:len(peft_model_B_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('PEFT MODEL:')
print(peft_model_B_results)

## BERTScore

In [None]:
from bert_score import BERTScorer

# Instantiate the BERTScorer object for English language
scorer = BERTScorer(lang="en")

# BERTScore of model B's generated summaries against the reference summaries.
# P2, R2, F2_2 hold one Precision, Recall and F1 value per summary.
P2, R2, F2_2 = scorer.score(peft_model_B_summaries, human_baseline_summaries[0:len(peft_model_B_summaries)])

# Report the mean F1 over all summaries; the first element alone is the score
# of a single example, not of the model.
print("PEFT Model Summaries F1 Score:", F2_2.mean().item())

# Fine-tune the model A

In [8]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for model A (trained on all categories)
lora_config_A = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

# Model A gets its own copy of the base model. get_peft_model() injects the
# adapter into the model object it is given, and `model` already carries
# adapter B: reusing it overwrote B's adapter, leaving peft_model_B and
# peft_model_A sharing the same weights.
base_model_A = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

#Add LoRA adapter layers/parameters to the original LLM to be trained.
peft_model_A = get_peft_model(base_model_A,
                            lora_config_A)
print(print_number_of_trainable_model_parameters(peft_model_A))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [None]:
# Timestamped output directory so repeated runs do not share a folder
output_dir = f'/kaggle/working/peft-dialogue-summary-training-{str(int(time.time()))}'

# Training hyper-parameters for adapter A (same values as for adapter B)
peft_training_args_A = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=10,
    save_steps=10000,
    logging_steps=100
)

# No eval_dataset is passed: the trainer only sees model A's training split
peft_trainer_A = Trainer(
    model=peft_model_A,
    args=peft_training_args_A,
    train_dataset=tokenized_modelA_datasets_dict["train"],
)

In [None]:
from safetensors.torch import load_model, save_model
# Fine-tune adapter A
peft_trainer_A.train()

# Directory the adapter-reload cell reads from
peft_model_path_A="/kaggle/working/peft-dialogue-summary-checkpoint-local-A"

# Save the trained model and the tokenizer side by side
peft_trainer_A.model.save_pretrained(peft_model_path_A)
tokenizer.save_pretrained(peft_model_path_A)

In [None]:
from peft import PeftModel, PeftConfig

# Fresh base model for the saved adapter A to be attached to
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

# The reloaded model is only used for evaluation, so it is loaded with
# is_trainable=False and put in eval mode: otherwise the LoRA dropout layers
# stay active and generation becomes non-deterministic.
peft_model_A = PeftModel.from_pretrained(model, 
                                       '/kaggle/working/peft-dialogue-summary-checkpoint-local-A', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)
peft_model_A.eval()

# Unmodified base model (no adapter) kept for comparison, plus the tokenizer
peft_model_base = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

In [None]:
# Parameter counts of the reloaded adapter model A and of the plain base model
print(print_number_of_trainable_model_parameters(peft_model_A))
print(print_number_of_trainable_model_parameters(peft_model_base))

## ROUGE

In [None]:
#human_baseline_summaries = train_test_val_data['validation']['summaries']

# Generate a summary with model A for every example of its validation split.
validation_split_A = tokenized_modelA_datasets_dict["validation"]
human_baseline_summaries = validation_split_A['summaries']

peft_model_A_summaries = []

# Built once instead of once per example
generation_config_A = GenerationConfig(max_new_tokens=200, num_beams=1)
# Inputs must live on the same device as the model
device_A = next(peft_model_A.parameters()).device

# Iterating the split yields one row at a time, so the whole column is not
# re-read on every iteration.
for example in validation_split_A:
    # input_ids and attention_mask are separate (1, seq_len) tensors. Stacking
    # them into a single tensor made generate() treat the mask as a second
    # input sequence and run without an explicit attention mask.
    input_ids = torch.tensor([example["input_ids"]], device=device_A)
    attention_mask = torch.tensor([example["attention_mask"]], device=device_A)

    peft_model_A_outputs = peft_model_A.generate(input_ids=input_ids,
                                                 attention_mask=attention_mask,
                                                 generation_config=generation_config_A)
    peft_model_A_text_output = tokenizer.decode(peft_model_A_outputs[0],
                                              skip_special_tokens=True)

    peft_model_A_summaries.append(peft_model_A_text_output)

In [None]:
import evaluate

rouge = evaluate.load('rouge')

# Score model A's generated summaries against the reference summaries
# (references are sliced to the number of generated summaries)
peft_model_A_results = rouge.compute(
    predictions=peft_model_A_summaries,
    references=human_baseline_summaries[0:len(peft_model_A_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('PEFT MODEL:')
print(peft_model_A_results)

## BERTScore

In [None]:
from bert_score import BERTScorer

# Instantiate the BERTScorer object for English language
scorer = BERTScorer(lang="en")

# BERTScore of model A's generated summaries against the reference summaries.
# P2, R2, F2_2 hold one Precision, Recall and F1 value per summary.
P2, R2, F2_2 = scorer.score(peft_model_A_summaries, human_baseline_summaries[0:len(peft_model_A_summaries)])

# Report the mean F1 over all summaries; the first element alone is the score
# of a single example, not of the model.
print("PEFT Model Summaries F1 Score:", F2_2.mean().item())

# Evaluate both models on the test set

In [None]:
# Generate a summary with model A and model B for every held-out test example.
test_split = tokenized_modelA_datasets_dict["test"]
human_baseline_summaries = test_split['summaries']

peft_model_A_summaries = []
peft_model_B_summaries = []

# Built once instead of twice per example
test_generation_config = GenerationConfig(max_new_tokens=200, num_beams=1)
# Each model may live on a different device; inputs are moved accordingly
device_A = next(peft_model_A.parameters()).device
device_B = next(peft_model_B.parameters()).device

# Iterating the split yields one row at a time, so the whole column is not
# re-read on every iteration.
for example in test_split:
    # input_ids and attention_mask are separate (1, seq_len) tensors. Stacking
    # them into a single tensor made generate() treat the mask as a second
    # input sequence and run without an explicit attention mask.
    input_ids = torch.tensor([example["input_ids"]])
    attention_mask = torch.tensor([example["attention_mask"]])

    peft_model_A_outputs = peft_model_A.generate(input_ids=input_ids.to(device_A),
                                                 attention_mask=attention_mask.to(device_A),
                                                 generation_config=test_generation_config)
    peft_model_A_text_output = tokenizer.decode(peft_model_A_outputs[0],
                                              skip_special_tokens=True)

    peft_model_B_outputs = peft_model_B.generate(input_ids=input_ids.to(device_B),
                                                 attention_mask=attention_mask.to(device_B),
                                                 generation_config=test_generation_config)
    peft_model_B_text_output = tokenizer.decode(peft_model_B_outputs[0],
                                              skip_special_tokens=True)

    peft_model_A_summaries.append(peft_model_A_text_output)
    peft_model_B_summaries.append(peft_model_B_text_output)

## ROUGE

In [None]:
import evaluate

rouge = evaluate.load('rouge')

# Score both models' test-set summaries against the same reference summaries
peft_model_A_results = rouge.compute(
    predictions=peft_model_A_summaries,
    references=human_baseline_summaries[0:len(peft_model_A_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
peft_model_B_results = rouge.compute(
    predictions=peft_model_B_summaries,
    references=human_baseline_summaries[0:len(peft_model_B_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('MODEL A:')
print(peft_model_A_results)
print('MODEL B:')
print(peft_model_B_results)

## BERTScore

In [None]:
from bert_score import BERTScorer

# Instantiate the BERTScorer object for English language
scorer = BERTScorer(lang="en")

# One Precision / Recall / F1 value per test summary, for each model
P1, R1, F2_1 = scorer.score(peft_model_A_summaries, human_baseline_summaries[0:len(peft_model_A_summaries)])
P2, R2, F2_2 = scorer.score(peft_model_B_summaries, human_baseline_summaries[0:len(peft_model_B_summaries)])

# Compare the models on the mean F1 over the whole test set; the first element
# alone is the score of a single example.
print("Model A Summaries F1 Score:", F2_1.mean().item())
print("Model B Summaries F1 Score:", F2_2.mean().item())