# AI 574 Final Project
## Article Summarization fine tuned for articles about deep learning

# Section 1:  Define and use functions to retrieve and parse data

In [1]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
# First, we need a function that walks through the full directory and finds all of the xml files.
#  This function isn't too hard.  Provided with a root directory, it creates a blank list and then 
#  walks down the subdirectories.  Every time it finds an xml file, it adds it to the list.
def list_xml_files(directory):
    """Collect the paths of all ``.xml`` files found beneath ``directory``.

    The tree is traversed with ``os.walk``; every file whose name ends in
    ``.xml`` contributes one entry, built by joining the folder that
    contains it with the file name.

    Args:
        directory: Root folder at which the walk starts.

    Returns:
        A list of path strings, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.xml')
    ]

In [3]:
# Now, we start defining functions to parse the XML file itself.  For every header at level one that we find 
#  within the XML file, we need to pull its title.  The ABSTRACT seems to pull out correctly, but for any further
#  major sections in the paper, the attribute of "title" ends up holding the title of the section.  As such, we use
#  the element tag to hold the section title if it doesn't have a specific title.  If it does, then the title is held.
#  This function creates a list of the data for the entire paper.  Specifically, if the depth of the XML tree is at 1, it
#  iterates through the children of this tree and pulls out all of the text from the file.  This is then appended to a list
#  to store all of the data.

def parse_element(element, depth = 0):
    """Recursively collect ``(header, text)`` pairs for the depth-1 elements.

    An element visited at ``depth == 1`` (a direct child of the element the
    walk started from) produces one pair, provided it has at least one child:

    * ``header`` is the ``title`` attribute for ``SECTION`` elements and the
      tag name for everything else.  A ``SECTION`` whose ``title`` is missing
      or empty falls back to the tag name, so the header is always a string
      and can safely be joined into text later.
    * ``text`` is the stripped ``.text`` of each direct child, joined with
      newlines.  Children without text contribute an empty string.

    The function then recurses into every child with ``depth + 1``.

    Args:
        element: An ``xml.etree.ElementTree.Element`` to walk.
        depth: Depth of ``element`` relative to the starting element.

    Returns:
        A list of ``(header, text)`` tuples in document order.
    """
    data = []

    if depth == 1:
        header = element.tag
        if element.tag == 'SECTION':
            # Sections keep their heading in the 'title' attribute; without
            # one, fall back to the tag instead of yielding a None header.
            header = element.attrib.get('title') or element.tag

        content = [(child.text or '').strip() for child in element]

        # Elements without children produce no entry at all.
        if content:
            data.append((header, '\n'.join(content)))

    for child in element:
        data.extend(parse_element(child, depth + 1))

    return data

In [4]:
# In order to use the parse_element function, we need to load the XML tree from the file
# and then apply the parsing function to the root element of that paper's tree

def parse_xml_file(file_path):
    """Parse the XML document at ``file_path`` and extract its sections.

    Args:
        file_path: Path of the XML file to load.

    Returns:
        Whatever ``parse_element`` returns for the document root at depth 0.
    """
    document_root = ET.parse(file_path).getroot()
    return parse_element(document_root, depth=0)

#  For each of these we also need to read a text file
def read_text_file(file_path):
    """Return the UTF-8 contents of ``file_path`` with outer whitespace removed.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as one string, stripped at both ends.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.strip()

In [5]:
# Finally, we need to join all of the section headers and text into a single text item.
#  This item will fill our dictionary text item for the articles that we are training from.

def format_to_text(data):
    """Flatten ``(header, content)`` pairs into a single block of text.

    Each pair contributes its header followed by its content; all pieces are
    separated by a blank line.

    Args:
        data: Iterable of two-item ``(header, content)`` string pairs.

    Returns:
        The joined text, or an empty string when ``data`` is empty.
    """
    pieces = [
        part
        for header, content in data
        for part in (header, content)
    ]
    return '\n\n'.join(pieces)

In [6]:
def read_files(xml_file_path):
    """Load one article and its gold summary into a training record.

    The summary is expected at ``<xml dir>/../summary/<name>.gold.txt``,
    where ``<name>`` is the XML file name without its extension.  The first
    line of the summary file is used as the title and the rest as the
    summary.

    Args:
        xml_file_path: Path of the article's XML file.

    Returns:
        A dict with the keys ``'text'`` (formatted article body),
        ``'summary'`` and ``'title'``.
    """
    base_path = os.path.dirname(xml_file_path)
    # splitext removes only the trailing extension; str.replace would also
    # strip any '.xml' occurring earlier in the file name.
    xml_file_name = os.path.splitext(os.path.basename(xml_file_path))[0]
    summary_file_path = os.path.join(base_path, '..', 'summary', f'{xml_file_name}.gold.txt')

    # Parse the file named by the argument rather than a same-named global,
    # so the function works for any caller.
    xml_data = parse_xml_file(xml_file_path)
    document_text = format_to_text(xml_data)

    summary_text = read_text_file(summary_file_path)

    # First line is the title; everything after the first newline is the summary.
    title, _, summary = summary_text.partition('\n')

    return {
        'text': document_text,
        'summary': summary.strip(),
        'title': title.strip(),
    }

## Reading the documents:
There are 1009 documents in our sample dataset, so we need to generate a list of the files that store those documents.  
This code will generate a list of the paths to the xml files.  These file names are then used to track down the summaries as well with the functions presented above.  In the end, we end up with a list of dictionary items containing the text, summary, and title for each article.

In [7]:
# Root folder of the corpus; every .xml file beneath it is collected.
directory = "top1000_complete"
xml_files = list_xml_files(directory)

In [8]:
# Sanity check: number of XML files discovered.
print(len(xml_files))

1009


In [9]:
# Show one discovered path to confirm the directory layout.
xml_files[1]

'top1000_complete\\A00-1043\\Documents_xml\\A00-1043.xml'

#### An example of a single paper in order to make sure that the above code works.

In [10]:
#read_files(xml_files[1])

###  Reading all the documents

In [11]:
# Build one record per article; each entry is the dict returned by
# read_files (keys 'text', 'summary' and 'title').
data = []
for xml_file in xml_files:
    file = read_files(xml_file)
    data.append(file)

Our data is now in a list of dictionaries which matches the billsum dataset within the tutorial provided by huggingface at https://huggingface.co/docs/transformers/v4.17.0/en/tasks/summarization

Based on this, we should be able to fine tune our model.  One quick step that we should take is to convert our dataset into the right format and then split it into training, validation, and test sets so that we can evaluate the results.

In [12]:
#!pip install datasets
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

In [13]:
# Turn the list of per-article records into column lists, one per field,
# and wrap them in a Dataset.
dataset = Dataset.from_dict({
    column: [record[column] for record in data]
    for column in ('text', 'summary', 'title')
})

In [14]:
# Hold out 15% of the articles for testing, then carve 17.6% of the
# remainder off as the validation split. Both splits use seed 42.
holdout_split = dataset.train_test_split(test_size = 0.15, seed = 42)
train_data_temp = holdout_split['train']
test_data = holdout_split['test']

validation_split = train_data_temp.train_test_split(test_size = 0.176, seed = 42)
train_data = validation_split['train']
valid_data = validation_split['test']

In [15]:
# Bundle the three splits under the names used by the rest of the notebook.
data_dict = DatasetDict(train=train_data, valid=valid_data, test=test_data)

In [16]:
# Report how many articles landed in each split (train, valid, test).
for split_name in ('train', 'valid', 'test'):
    print(len(data_dict[split_name]))

706
151
152


In [17]:
# Spot-check one training record to confirm that the text, summary and
# title fields were populated.

data_dict['train'][1]

{'text': "ABSTRACT\n\nThis paper presents and compares WordNetbased and distributional similarity approaches.\nThe strengths and weaknesses of each approach regarding similarity and relatedness tasks are discussed, and a combination is presented.\nEach of our methods independently provide the best results in their class on the RG and WordSim353 datasets, and a supervised combination of them yields the best published results on all datasets.\nFinally, we pioneer cross-lingual similarity, showing that our methods are easily adapted for a cross-lingual task with minor losses.\n\n1 Introduction\n\nMeasuring semantic similarity and relatedness between terms is an important problem in lexical semantics.\nIt has applications in many natural language processing tasks, such as Textual Entailment, Word Sense Disambiguation or Information Extraction, and other related areas like Information Retrieval.\nThe techniques used to solve this problem can be roughly classified into two main categories: t

In [18]:
# Confirm the container type of the combined splits.
type(data_dict)

datasets.dataset_dict.DatasetDict

## Explore the dataset

# Section 2:  Fine Tuning Model Code

In [19]:
import transformers
from transformers import AutoTokenizer, DataCollatorForSeq2Seq 
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import evaluate
import datasets
import rouge_score

import numpy as np

OSError: [WinError 126] The specified module could not be found. Error loading "C:\Users\asus\AppData\Local\Programs\Python\Python311\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.

In [None]:
import torch

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only ask for the GPU name when CUDA is present: get_device_name raises on
# a CPU-only install, which would defeat the fallback above.
torch.cuda.get_device_name(0) if device.type == 'cuda' else 'cpu'

In [None]:
# Name of the pretrained checkpoint; the tokenizer (here) and the model
# (in a later cell) are both loaded from it.
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Task prefix prepended to every article before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping with a ``"text"`` list (articles) and a
            ``"summary"`` list (reference summaries).

    Returns:
        The tokenizer output for the prefixed articles (truncated to 1024
        tokens), with an added ``"labels"`` entry holding the input ids of
        the summaries (truncated to 256 tokens).
    """
    prompts = []
    for article in examples["text"]:
        prompts.append(prefix + article)

    model_inputs = tokenizer(prompts, truncation = True, max_length = 1024)
    target_encoding = tokenizer(
        text_target = examples["summary"],
        truncation = True,
        max_length = 256,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split, passing examples in batches.
tokenized_data = data_dict.map(preprocess_function, batched = True)

In [None]:
# Collator that batches the tokenized examples and returns PyTorch tensors.
# NOTE(review): `model` receives the checkpoint name (a string) here, not the
# model object, which is only created in a later cell. Confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer, model=checkpoint, return_tensors = 'pt')

In [None]:
# ROUGE metric, used during evaluation and for the final test-set scoring.
rouge = evaluate.load("rouge")

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first item is
            the token-id array.

    Returns:
        A dict of the ROUGE results plus ``"gen_len"`` (average number of
        non-padding tokens per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks padding/ignored positions and is not a valid token id, so
    # map it to the pad token before decoding. Labels always need this;
    # predictions need it when batches were padded to a common length.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens = True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens = True)

    result = rouge.compute(predictions = decoded_preds, references = decoded_labels, use_stemmer = True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained sequence-to-sequence model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# Allow up to 256 new tokens per generated summary, the same limit used
# when truncating the reference summaries in preprocess_function.
model.generation_config.max_new_tokens = 256

In [None]:
# Training configuration: 3 epochs, batches of 16, evaluation after every
# epoch, and generation-based evaluation so compute_metrics receives
# generated token ids.
training_args = Seq2SeqTrainingArguments(
    num_train_epochs = 3,

    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,

    output_dir = "T5_model",
    eval_strategy = "epoch",
    learning_rate = 2e-5,
    weight_decay = 0.01,

    save_total_limit = 1,
    predict_with_generate = True,
    # Mixed precision needs a CUDA device; enable it only when one is
    # available so the notebook still runs on a CPU-only machine.
    fp16 = torch.cuda.is_available(),
    push_to_hub = False,
)

# Trainer wired to the tokenized train/valid splits and the ROUGE metrics.
trainer = Seq2SeqTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["valid"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics,
)

In [None]:
import wandb
# Initialise Weights & Biases in disabled mode, presumably so that training
# does not try to log to a W&B account.
wandb.init(mode='disabled')

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Save the tokenizer and the fine-tuned model side by side so they can be
# reloaded together from one directory.
save_directory="./T5_model"
tokenizer.save_pretrained(save_directory)
trainer.save_model(save_directory)

In [None]:
# Reload the tokenizer and model from ./T5_model (the same path used as
# save_directory), replacing the in-memory objects used during training.
tokenizer = AutoTokenizer.from_pretrained("./T5_model")
model = AutoModelForSeq2SeqLM.from_pretrained("./T5_model")

In [None]:
# Display the tokenized test split.
tokenized_data['test']

In [None]:
def generate_summaries(inputs):
    """Generate summaries for one article or a list of articles.

    The input is prepared the same way as the training data: the task
    ``prefix`` is prepended and the text is truncated to 1024 tokens.

    Args:
        inputs: A single article string, or a list of article strings.

    Returns:
        A list of decoded summaries, one per input article.
    """
    # The model was fine-tuned on prefix + text, so inference must use the
    # same prompt format.
    if isinstance(inputs, str):
        prompts = prefix + inputs
    else:
        prompts = [prefix + text for text in inputs]

    encoded = tokenizer(
        prompts,
        return_tensors='pt',
        padding = True,
        truncation = True,
        max_length = 1024,  # same input limit as preprocess_function
    )
    # Keep the tensors on the same device as the model.
    encoded = encoded.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids = encoded['input_ids'],
            # Without the mask, padding tokens in a batch would be attended to.
            attention_mask = encoded['attention_mask'],
            max_new_tokens = 256,
            min_new_tokens = 200,
            num_beams = 4,
            early_stopping = True,
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens = True)

In [None]:
# Summarize the test articles one at a time, keeping the first decoded
# string returned for each article.
generated_texts = [
    generate_summaries(example['text'])[0]
    for example in tokenized_data['test']
]

In [None]:
# Reference summaries for the test split, used as the ROUGE references.
true_summaries = [example['summary'] for example in data_dict['test']]

In [None]:
# Score the generated summaries against the references with ROUGE (stemming enabled).
result = rouge.compute(predictions = generated_texts, references = true_summaries, use_stemmer = True)

In [None]:
# Show the test-set ROUGE scores.
print(result)

In [None]:
# Qualitative check: one generated summary...
generated_texts[4]

In [None]:
# ...and the reference summary at the same index, for comparison.
true_summaries[4]