In [None]:
import os
import xml.etree.ElementTree as ET

def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    # Extract the main text
    main_text = ' '.join([elem.text for elem in root.findall('.//section') if elem.text])
    return main_text

# Example usage
file_path = 'ScisummNet/data/SomePaper/SomePaper.xml'
main_text = parse_xml(file_path)
print(main_text[:500])  # Print the first 500 characters


In [None]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

# Function to preprocess the data
def preprocess_data(text, summary):
    model_inputs = tokenizer(text, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(summary, max_length=150, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Example usage
text = "Your extracted main text here."
summary = "Corresponding summary here."
processed_data = preprocess_data(text, summary)


In [None]:
from transformers import T5ForConditionalGeneration

# Load the pre-trained T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [None]:
from transformers import Seq2SeqTrainingArguments

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True
)


In [None]:
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer, max_input_length=1024, max_output_length=150):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        model_inputs = self.tokenizer(text, max_length=self.max_input_length, truncation=True, padding="max_length", return_tensors="pt")
        labels = self.tokenizer(summary, max_length=self.max_output_length, truncation=True, padding="max_length", return_tensors="pt")
        model_inputs["labels"] = labels["input_ids"].squeeze()
        return {key: val.squeeze() for key, val in model_inputs.items()}

# Example usage
texts = ["Your extracted main text here."]
summaries = ["Corresponding summary here."]
dataset = SummarizationDataset(texts, summaries, tokenizer)


In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Initialize data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,  # Ideally, use a separate validation set
    tokenizer=tokenizer,
    data_collator=data_collator
)


In [None]:
# Train the model
trainer.train()


In [None]:
# Evaluate the model
results = trainer.evaluate()

# Print evaluation results
print(results)


In [None]:
# # Function to generate summary
# def generate_summary(text):
#     inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
#     summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)

# ::contentReference[oaicite:35]{index=35}
 


In [5]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Load saved model
model = T5ForConditionalGeneration.from_pretrained("results/checkpoint-75")
tokenizer = AutoTokenizer.from_pretrained("results/checkpoint-75")

In [6]:
# Function to generate summary
def generate_summary(text):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage
sample_paper = "Recent developments in sequence-to-sequence learning with neural networks have considerably improved the quality of automatically generated text summaries and document keywords, stipulating the need for even bigger training corpora. Metadata of research articles are usually easy to find online and can be used to perform research on various tasks. In this paper, we introduce two huge datasets for text summarization (OAGSX) and keyword generation (OAGKX) research, containing 34 million and 23 million records, respectively. The data were retrieved from the Open Academic Graph which is a network of research profiles and publications. We carefully processed each record and also tried several extractive and abstractive methods of both tasks to create performance baselines for other researchers. We further illustrate the performance of those methods previewing their outputs. In the near future, we would like to apply topic modeling on the two sets to derive subsets of research articles from more specific disciplines."
print("Original Paper:\n", sample_paper)
print("\nGenerated Summary:\n", generate_summary(sample_paper))


Original Paper:
 Recent developments in sequence-to-sequence learning with neural networks have considerably improved the quality of automatically generated text summaries and document keywords, stipulating the need for even bigger training corpora. Metadata of research articles are usually easy to find online and can be used to perform research on various tasks. In this paper, we introduce two huge datasets for text summarization (OAGSX) and keyword generation (OAGKX) research, containing 34 million and 23 million records, respectively. The data were retrieved from the Open Academic Graph which is a network of research profiles and publications. We carefully processed each record and also tried several extractive and abstractive methods of both tasks to create performance baselines for other researchers. We further illustrate the performance of those methods previewing their outputs. In the near future, we would like to apply topic modeling on the two sets to derive subsets of researc

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

# Memuat model dan tokenizer lain (misalnya, PEGASUS)
model_pegasus = T5ForConditionalGeneration.from_pretrained("google/pegasus-xsum")
tokenizer_pegasus = AutoTokenizer.from_pretrained("google/pegasus-xsum")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using a model of type pegasus to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.
Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.En

Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist Krist bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets bullets applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud applaud appl

In [9]:
# Fungsi untuk menghasilkan ringkasan menggunakan model dan tokenizer yang berbeda
def generate_summary(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Menggunakan model PEGASUS untuk menghasilkan ringkasan
sample_paper = "Recent developments in sequence-to-sequence learning with neural networks have considerably improved the quality of automatically generated text summaries and document keywords, stipulating the need for even bigger training corpora. Metadata of research articles are usually easy to find online and can be used to perform research on various tasks. In this paper, we introduce two huge datasets for text summarization (OAGSX) and keyword generation (OAGKX) research, containing 34 million and 23 million records, respectively. The data were retrieved from the Open Academic Graph which is a network of research profiles and publications. We carefully processed each record and also tried several extractive and abstractive methods of both tasks to create performance baselines for other researchers. We further illustrate the performance of those methods previewing their outputs. In the near future, we would like to apply topic modeling on the two sets to derive subsets of research articles from more specific disciplines."
print(generate_summary(sample_paper, model_pegasus, tokenizer_pegasus))


scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalability scalabilityESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESESES
