## PGR

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m69.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
## imports

# data parsing and processing
import pandas as pd
import numpy as np
import os
import glob
import xml.etree.ElementTree as ET

# ML learning
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig


# metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# # source paths
# DATA_PATH = "../data/"
# CORPORA_PATH = "raw/pgr-crowd/corpora/"

# source path colab
DATA_PATH = "/content/drive/MyDrive/"

In [4]:
# functions and classes

def parse_xml_file(file_path):
    """
    Function parses an xml file given the path to the file.
    The function returns a list dicts the following keys:
    'sentence' - the text containing the entities
    'entity1' - the first entity in the sentence
    'entity2' - the second entity in the sentence
    'relation' - true if relationship exists, else false
    """
    
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []
    
    for sentence in root.findall('sentence'):
        text = sentence.attrib['text']
        pairs = []
        
        for pair in sentence.findall('pair'):
            e1_id = pair.attrib['e1']
            e2_id = pair.attrib['e2']
            relation = pair.attrib['relation'] == 'true'
            pairs.append((e1_id, e2_id, relation))
        
        entities = {entity.attrib['id']: entity.attrib['text'] for entity in sentence.findall('entity')}
        
        for e1_id, e2_id, relation in pairs:
            data.append({
                'sentence': text,
                'entity1': entities[e1_id],
                'entity2': entities[e2_id],
                'relation': relation,
            })
    
    return data

def preprocess_data(corpus_path):
    """
    Preprocess corpora of xml files for entity extraction with provided path.
    Returns a list of sentences with entity/relationship pairs
    """
    data = []
    
    for file in os.listdir(corpus_path):
        if file.endswith('.xml'):
            file_path = os.path.join(corpus_path, file)
            data.extend(parse_xml_file(file_path))
    
    return data

class REDataset(Dataset): # Relation Extraction
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        sentence = item['sentence']
        entity1 = item['entity1']
        entity2 = item['entity2']
        relation = int(item['relation'])

        # Tokenize the text, add special tokens [CLS] and [SEP]
        inputs = self.tokenizer.encode_plus(
            f"{entity1} [SEP] {entity2} [SEP] {sentence}",
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(relation, dtype=torch.long),
        }

In [5]:
# Preprocess training data
train_data = []
train_folder = DATA_PATH + "amazon_train/"
train_data = preprocess_data(train_folder)

# Preprocess test data
test_data = []
test_folder = DATA_PATH + "consensus_test/"
test_data = preprocess_data(test_folder)

# Split the data into training and validation sets (80% training, 20% validation)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
len(train_data), len(val_data), len(test_data)

(3602, 901, 1792)

In [16]:
# Compute class frequencies
class_freqs = np.bincount([example['relation'] for example in train_data])

# Compute class weights as the inverse of class frequencies
class_weights = 1 / class_freqs

# Normalize the class weights so that they sum to 1
class_weights = class_weights / class_weights.sum()

In [23]:
# Hyperparameters
batch_size = 16
num_epochs = 3
learning_rate = 2e-5
max_length = 128

# Load pre-trained BERT model, model configs, and tokenizer
config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
config.loss_function = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(device))
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create Dataset and DataLoader
train_dataset = REDataset(train_data, tokenizer, max_length)
val_dataset = REDataset(val_data, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_function = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float().to(device))

# Training loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_function(logits, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    total_eval_accuracy = 0
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        with torch.no_grad():
          outputs = model(input_ids, attention_mask=attention_mask)
          logits = outputs.logits
          loss = loss_function(logits, labels)
        total_eval_loss += loss.item()
        
        preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
        true_labels = labels.detach().cpu().numpy()
        total_eval_accuracy += (preds == true_labels).sum()
        
    avg_val_loss = total_eval_loss / len(val_dataloader)
    avg_val_accuracy = total_eval_accuracy / len(val_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}: Val Loss = {avg_val_loss:.4f}, Val Acc = {avg_val_accuracy:.4f}")

# Save the fine-tuned model
model.save_pretrained(DATA_PATH + "bert_relation_extraction")
tokenizer.save_pretrained(DATA_PATH + "bert_relation_extraction")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/3: Val Loss = 0.8005, Val Acc = 0.9301
Epoch 2/3: Val Loss = 0.7328, Val Acc = 0.8979
Epoch 3/3: Val Loss = 1.1244, Val Acc = 0.9057


TypeError: ignored

In [24]:
# Save the fine-tuned model
config = model.config
config.pop("loss", None)  # Remove the custom loss function from the config

model.save_pretrained(os.path.join(DATA_PATH, "bert_relation_extraction"), config=config)
tokenizer.save_pretrained(os.path.join(DATA_PATH, "bert_relation_extraction"))


AttributeError: ignored

In [12]:
## Test data

test_dataset = REDataset(test_data, tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [13]:
# Evaluation loop
model.eval()
all_preds = []
all_true_labels = []

for batch in test_dataloader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

    preds = torch.argmax(logits, dim=1).detach().cpu().numpy()
    true_labels = labels.detach().cpu().numpy()

    all_preds.extend(preds)
    all_true_labels.extend(true_labels)


In [14]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

print("Classification Report:")
print(classification_report(all_true_labels, all_preds, target_names=["No Relation", "Relation"]))

print("Confusion Matrix:")
print(confusion_matrix(all_true_labels, all_preds))

print("F1 Score:")
print(f1_score(all_true_labels, all_preds))


Classification Report:
              precision    recall  f1-score   support

 No Relation       0.70      0.02      0.04       613
    Relation       0.66      0.99      0.79      1179

    accuracy                           0.66      1792
   macro avg       0.68      0.51      0.42      1792
weighted avg       0.67      0.66      0.54      1792

Confusion Matrix:
[[  14  599]
 [   6 1173]]
F1 Score:
0.7949847509318876


The results show that the model performs well in detecting relations (F1-score of 0.795 for the "Relation" class) but struggles with the "No Relation" class (F1-score of 0.04). The overall accuracy is 0.66, which might be misleading because it is mainly driven by the model's good performance in the "Relation" class. The confusion matrix indicates that the model tends to predict "Relation" more often than "No Relation," resulting in a high number of false positives for the "No Relation" class.