In [1]:
import pandas as pd
import os
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

In [2]:
def load_annotations(file_path):
    data = []
    
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 4:
                article_id = fields[0]
                narrative_field = fields[1]
                subnarrative_field = fields[2]
                explanation = fields[3]

                # Split narratives and sub-narratives if they contain semicolons
                narratives = narrative_field.split(';')
                subnarratives = subnarrative_field.split(';')

                # Append processed data to the list
                data.append([article_id, narratives, subnarratives, explanation])
    
    # Convert the data list into a pandas DataFrame
    return pd.DataFrame(data, columns=["article_id", "narratives", "subnarratives", "explanation"])


In [3]:
def load_all_articles(raw_documents_folder):
    articles = {}
    for filename in os.listdir(raw_documents_folder):
        if filename.endswith(".txt"):
            article_id = filename.split('.')[0]
            with open(os.path.join(raw_documents_folder, filename), 'r', encoding='utf-8') as f:
                articles[article_id] = f.read()
    return articles

In [4]:
annotations = load_annotations('./EN/subtask-3-annotations.txt')
articles = load_all_articles('./EN/raw-documents')

In [6]:
def join_articles_with_annotations(annotations, articles):
    # Convert articles dictionary to a DataFrame
    articles_df = pd.DataFrame(list(articles.items()), columns=["article_id", "article_text"])
    # Normalize article_id in both DataFrames
    annotations["article_id"] = annotations["article_id"].str.strip().str.lower().str.replace(".txt", "")
    articles_df["article_id"] = articles_df["article_id"].str.strip().str.lower()
    
    # Perform a join
    merged_df = annotations.set_index("article_id").join(articles_df.set_index("article_id"), on="article_id")
    
    # Reset the index if needed
    merged_df.reset_index(inplace=True)
    
    return merged_df


In [7]:
annotations_mapped  = join_articles_with_annotations(annotations, articles)

In [None]:
def prepare_data_for_model(annotations_mapped):
    annotations_mapped["input"] = (
        "narrative: " + annotations["narratives"].apply(lambda x: "; ".join(x)) +
        " subnarrative: " + annotations["subnarratives"].apply(lambda x: "; ".join(x)) +
        " context: " + annotations_mapped["article_text"]
    )
    annotations_mapped["output"] = annotations_mapped["explanation"]
    return annotations_mapped[["input", "output"]]

training_data = prepare_data_for_model(annotations_mapped)

<bound method NDFrame.head of                                                  input  \
0    narrative: CC: Criticism of climate movement s...   
1    narrative: CC: Questioning the measurements an...   
2    narrative: CC: Criticism of climate movement s...   
3    narrative: URW: Speculating war outcomes subna...   
4    narrative: URW: Praise of Russia subnarrative:...   
..                                                 ...   
198  narrative: CC: Criticism of climate movement s...   
199  narrative: URW: Discrediting Ukraine subnarrat...   
200  narrative: URW: Negative Consequences for the ...   
201  narrative: CC: Questioning the measurements an...   
202  narrative: URW: Blaming the war on others rath...   

                                                output  
0    The text accuses climate activist Bill Gates f...  
1    There are inconsistencies in the predictions o...  
2    The article talks about climate activists atta...  
3    The text conveys a narrative depicting n

In [9]:
# Load tokenizer and model
model = T5ForConditionalGeneration.from_pretrained("t5-small")
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def t5_summarize(article, max_length=150, min_length=40):
    inputs = tokenizer.encode("summarize: " + article, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# # Load the dataset into a pandas DataFrame
# df = pd.read_csv(dataset_path)  # Adjust to read_excel or read_json as needed
# # Summarize each article in the dataset
# # Ensure the column name 'article' matches your dataset's column for text data
# df['generated'] = df['article'].apply(lambda x: t5_summarize(x))

In [14]:
training_data['generated'] = training_data['input'].apply(lambda x: t5_summarize(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['generated'] = training_data['input'].apply(lambda x: t5_summarize(x))


In [36]:
!pip install evaluate
from evaluate import load



In [42]:
bertscore = load("bertscore")
# Function to calculate bertScore
def calculate_bert_scores(references, generations):
    bert_scores = bertscore.compute(predictions = generations, references=references, model_type="distilbert-base-uncased")
    return bert_scores['f1']

# Prepare data
references = training_data['output']
generations = training_data['generated']

# Calculate bert scores
bert_scores = calculate_bert_scores(references, generations)

# Add scores to the DataFrame
training_data['bert_score'] = bert_scores
# print(training_data)
# Calculate and print average score
avg_bert_score = sum(bert_scores) / len(bert_scores) if bert_scores else 0
print(f"Average bertScore: {avg_bert_score:.4f}")


Average bertScore: 0.7581
