In [1]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertModel


In [2]:
def load_annotations_with_taxonomy(file_path):
    """Parse a tab-separated annotation file into a DataFrame.

    Each usable line holds at least three tab-separated fields: an article
    id, a ';'-separated list of narratives and a ';'-separated list of
    sub-narratives. Lines with fewer than three fields are skipped.

    Parameters
    ----------
    file_path : str
        Path of the annotation file. It is read as UTF-8, matching how
        ``load_all_articles`` reads the article files.

    Returns
    -------
    pandas.DataFrame
        One row per usable line with the columns ``article_id``,
        ``taxonomy`` (text before the first ':' of the first narrative),
        ``narratives`` and ``sub_narratives`` (lists of labels with the
        text up to the first ':' removed).
    """
    def _strip_prefix(label):
        # Drop everything up to the first ':'; labels without a prefix are
        # kept, but stripped as well so both cases are whitespace-free.
        if ':' in label:
            return label.split(':', 1)[1].strip()
        return label.strip()

    rows = []
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) < 3:
                continue

            article_id = fields[0]
            taxonomy_narratives = fields[1].split(';')
            sub_narratives_full = fields[2].split(';')

            # The taxonomy is the prefix of the first narrative (e.g. "URW", "CC", or "Other")
            taxonomy = taxonomy_narratives[0].split(':')[0].strip()

            narratives = [_strip_prefix(n) for n in taxonomy_narratives]
            sub_narratives = [_strip_prefix(s) for s in sub_narratives_full]

            rows.append([article_id, taxonomy, narratives, sub_narratives])

    return pd.DataFrame(rows, columns=["article_id", "taxonomy", "narratives", "sub_narratives"])

In [3]:
def load_all_articles(raw_documents_folder):
    """Read every ``.txt`` file in a folder into a dict.

    Parameters
    ----------
    raw_documents_folder : str
        Directory whose entries are scanned (non-recursively).

    Returns
    -------
    dict
        Maps the part of each file name before its first '.' to the file's
        full text, decoded as UTF-8. Entries not ending in ".txt" are ignored.
    """
    articles = {}
    for entry in os.listdir(raw_documents_folder):
        if not entry.endswith(".txt"):
            continue
        # Key on the text before the first dot of the file name.
        key = entry.split('.')[0]
        path = os.path.join(raw_documents_folder, entry)
        with open(path, 'r', encoding='utf-8') as handle:
            articles[key] = handle.read()
    return articles

In [4]:
# Load the subtask-2 annotation table and the raw article texts from the EN folder.
annotations = load_annotations_with_taxonomy(file_path='./EN/subtask-2-annotations.txt')
articles = load_all_articles(raw_documents_folder='./EN/raw-documents')

In [5]:
# Load the tokenizer and the encoder from the same 'bert-base-uncased' checkpoint.
tokenizer, bert_model = (
    BertTokenizer.from_pretrained('bert-base-uncased'),
    BertModel.from_pretrained('bert-base-uncased'),
)
def get_bert_embeddings(texts, batch_size=1, max_length=512):
    """Embed texts with the module-level ``bert_model`` / ``tokenizer``.

    Each text is tokenized (truncated to ``max_length`` tokens) and run
    through the model without gradient tracking; the hidden state at
    position 0 (the [CLS] token) of the last layer is used as its embedding.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed. May be empty.
    batch_size : int, default 1
        Number of texts sent through the model per forward pass. The default
        processes one text at a time; larger values pad each batch to its
        longest member and reduce the number of forward passes.
    max_length : int, default 512
        Maximum number of tokens kept per text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), hidden_size)``; for empty input the
        shape is ``(0, hidden_size)`` so the result is always 2-D.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialize so generators / pandas Series can be sliced into batches.
    texts = list(texts)
    if not texts:
        # Keep the result 2-D so downstream estimators see a valid feature matrix.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model currently lives on.
    device = next(bert_model.parameters()).device

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # Tokenize and create input tensors
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Get model outputs
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Extract the [CLS] token embedding for every text in the batch;
        # explicit indexing keeps the (batch, hidden) shape without squeeze().
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(cls_embeddings.cpu().numpy())

    return np.concatenate(embeddings, axis=0)

In [18]:
# Encode the taxonomy labels as integers and store them next to the annotations
taxonomy_encoder = LabelEncoder()
annotations['taxonomy_encoded'] = taxonomy_encoder.fit_transform(annotations['taxonomy'])
taxonomies = annotations['taxonomy_encoded'].tolist()

# Look up each article's text by the part of its id before the first '.';
# ids with no matching article fall back to an empty string.
texts = [articles.get(article_id.split('.')[0], "") for article_id in annotations['article_id']]

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    taxonomies,
    test_size=0.2,
    random_state=42,
)

# Feature extraction: BERT [CLS] embeddings are the active features.
# The TF-IDF vectorizer is configured here but only used by the
# commented-out alternative at the bottom of this cell.
embedding = TfidfVectorizer(
    max_features=700,
    ngram_range=(1, 5),
    min_df=5,
    max_df=0.8,
    sublinear_tf=False,
)
X_train_embed = get_bert_embeddings(X_train)
X_test_embed = get_bert_embeddings(X_test)
# TF-IDF alternative:
#X_train_embed = embedding.fit_transform(X_train)
#X_test_embed = embedding.transform(X_test)

In [24]:
# Train a multi-class taxonomy classifier on the embedded texts and evaluate it.
# The class count comes from the fitted label encoder instead of a hard-coded 3.
n_classes = len(taxonomy_encoder.classes_)

# The objective name is 'multi:softmax' (no space); the previous
# 'multi: softmax' is not a documented XGBoost objective.
model = XGBClassifier(max_depth=6, n_estimators=100, learning_rate=0.72, subsample=0.8, colsample_bytree=0.4, objective='multi:softmax', num_class=n_classes)
# Alternative classifiers that were tried:
#model = LogisticRegression(max_iter=1603, solver = 'lbfgs', penalty='l2', C=3)
#model = SVC(kernel='linear', degree=5, C=2, probability=False)
#model = MultinomialNB() #(0 Precision For CC)
#model = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_split=5, min_samples_leaf=10, random_state=42)

model.fit(X_train_embed, y_train)
y_pred = model.predict(X_test_embed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}%")
# Pass the full label set explicitly so target_names always lines up with the
# encoded classes, even if a class is absent from this particular test split.
classification_rep = classification_report(y_test, y_pred, labels=list(range(n_classes)), target_names=taxonomy_encoder.classes_)
print("Classification Report:\n", classification_rep)

Accuracy: 75.0%
Classification Report:
               precision    recall  f1-score   support

          CC       0.40      0.29      0.33         7
       Other       0.75      0.83      0.79        18
         URW       0.87      0.87      0.87        15

    accuracy                           0.75        40
   macro avg       0.67      0.66      0.66        40
weighted avg       0.73      0.75      0.74        40



In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
# The objective name is 'multi:softmax' (no space); the previous
# 'multi: softmax' is not a documented XGBoost objective.
xgb_model = XGBClassifier(max_depth=6, n_estimators=100, learning_rate=0.72, subsample=0.8, colsample_bytree=0.4, objective='multi:softmax', num_class=3)
