In [1]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizer, BertModel


In [2]:
def load_annotations_with_taxonomy(file_path):
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) >= 3:
                article_id = fields[0]
                taxonomy_narratives = fields[1].split(';')
                sub_narratives_full = fields[2].split(';')

                # Extract taxonomy from the first narrative prefix (e.g., "URW", "CC", or "Other")
                taxonomy = taxonomy_narratives[0].split(':')[0].strip()

                # Extract narratives and sub-narratives, removing taxonomy prefix
                narratives = [n.split(':', 1)[1].strip() if ':' in n else n for n in taxonomy_narratives]
                sub_narratives = [s.split(':', 1)[1].strip() if ':' in s else s for s in sub_narratives_full]

                data.append([article_id, taxonomy, narratives, sub_narratives])

    return pd.DataFrame(data, columns=["article_id", "taxonomy", "narratives", "sub_narratives"])

In [3]:
def load_all_articles(raw_documents_folder):
    articles = {}
    for filename in os.listdir(raw_documents_folder):
        if filename.endswith(".txt"):
            article_id = filename.split('.')[0]
            with open(os.path.join(raw_documents_folder, filename), 'r', encoding='utf-8') as f:
                articles[article_id] = f.read()
    return articles

In [4]:
annotations = load_annotations_with_taxonomy('./EN/subtask-2-annotations.txt')
articles = load_all_articles('./EN/raw-documents')

In [5]:
# annotations = load_annotations_with_taxonomy('./HI/subtask-2-annotations.txt')
# articles = load_all_articles('./HI/raw-documents')

In [6]:
# annotations = load_annotations_with_taxonomy('./PT/subtask-2-annotations.txt')
# articles = load_all_articles('./PT/raw-documents')

In [7]:
# annotations = load_annotations_with_taxonomy('./BG/subtask-2-annotations.txt')
# articles = load_all_articles('./BG/raw-documents')

In [8]:
# annotations = load_annotations_with_taxonomy('./RU/subtask-2-annotations.txt')
# articles = load_all_articles('./RU/raw-documents')

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embeddings(texts):
    embeddings = []
    for text in texts:
        # Tokenize and create input tensors
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
        
        # Get model outputs
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        # Extract [CLS] token embedding
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings.append(cls_embedding)
    
    return np.array(embeddings)

In [10]:
# Prepare the taxonomy data and load corresponding text content
taxonomy_encoder = LabelEncoder()
annotations['taxonomy_encoded'] = taxonomy_encoder.fit_transform(annotations['taxonomy'])

taxon = []
taxonomies = annotations['taxonomy_encoded'].tolist()
for article_id in annotations['article_id']:
    text = articles.get(article_id.split('.')[0], "")
    taxon.append(text)


# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(taxon, taxonomies, test_size=0.5, random_state=42)

# TF-IDF Vectorization of the text data
embedding = TfidfVectorizer(max_features=700, ngram_range=(1, 5), min_df=5, max_df=0.8, sublinear_tf=False)
# X_train_embed = get_bert_embeddings(X_train)
# X_test_embed = get_bert_embeddings(X_test)
# X_train_embed = embedding.fit_transform(X_train)
# X_test_embed = embedding.transform(X_test)
X_train_tfidf = embedding.fit_transform(X_train).toarray()
X_test_tfidf = embedding.transform(X_test).toarray()
X_train_bert = get_bert_embeddings(X_train)
X_test_bert = get_bert_embeddings(X_test)
X_train_tfidf = normalize(X_train_tfidf, axis=1)
X_test_tfidf = normalize(X_test_tfidf, axis=1)
X_train_bert = normalize(X_train_bert, axis=1)
X_test_bert = normalize(X_test_bert, axis=1)
X_train_embed = np.hstack((X_train_tfidf, X_train_bert))
X_test_embed = np.hstack((X_test_tfidf, X_test_bert))

In [11]:
model = XGBClassifier(max_depth=6, n_estimators=100, learning_rate=0.72, subsample=0.8, colsample_bytree=0.4, objective='multi:softmax', num_class=3)
#model = LogisticRegression(max_iter=1603, solver = 'lbfgs', penalty='l2', C=3)
#model = SVC(kernel='linear', degree=5, C=2, probability=False)
#model = MultinomialNB() #(0 Precision For CC)
#model = RandomForestClassifier(n_estimators=200, max_depth=6, min_samples_split=5, min_samples_leaf=10, random_state=42)

model.fit(X_train_embed, y_train)
y_pred = model.predict(X_test_embed)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100}%")
classification_rep = classification_report(y_test, y_pred, target_names=taxonomy_encoder.classes_)
print("Classification Report:\n", classification_rep)

Accuracy: 73.5%
Classification Report:
               precision    recall  f1-score   support

          CC       0.79      0.84      0.82        50
       Other       0.71      0.67      0.69        87
         URW       0.72      0.75      0.73        63

    accuracy                           0.73       200
   macro avg       0.74      0.75      0.75       200
weighted avg       0.73      0.73      0.73       200



In [12]:
# Encode narratives
X_tfidf = embedding.fit_transform(articles.values())

# Encode taxonomy labels for classification
taxonomy_encoder = LabelEncoder()
y_taxonomy = taxonomy_encoder.fit_transform(annotations['taxonomy'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_taxonomy, test_size=0.2, random_state=42)

In [13]:
# Filter narratives for URW and CC (excluding Other)
narratives = annotations.loc[annotations['taxonomy'] != 'Other', 'narratives']
narratives_encoder = MultiLabelBinarizer()
y_narratives = narratives_encoder.fit_transform(narratives)
# Use the same TF-IDF features (X_tfidf) for training
X_train_narr, X_test_narr, y_train_narr, y_test_narr = train_test_split(
    X_tfidf[annotations['taxonomy'] != 'Other'], y_narratives, test_size=0.2, random_state=42
)


In [14]:
from skmultilearn.problem_transform import LabelPowerset

# Initialize and train Label Powerset classifier with XGBoost
narrative_clf = LabelPowerset(model)
narrative_clf.fit(X_train_narr, y_train_narr)

# Predict narratives
y_pred_narr = narrative_clf.predict(X_test_narr)

# Evaluate narratives
from sklearn.metrics import hamming_loss, jaccard_score
f1_micro = f1_score(y_test_narr, y_pred_narr, average='micro')
print(f"Total F1-Score (Micro): {f1_micro:.4f}")
f1_macro = f1_score(y_test_narr, y_pred_narr, average='macro')
print(f"Total F1-Score (Macro): {f1_macro:.4f}")
print("Narrative Classification Report:")
print(classification_report(y_test_narr, y_pred_narr, target_names=narratives_encoder.classes, zero_division=0))




Total F1-Score (Micro): 0.1658
Total F1-Score (Macro): 0.0725
Narrative Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         8
           2       0.00      0.00      0.00         8
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         3
           5       0.11      0.25      0.15         4
           6       0.29      0.20      0.24        10
           7       0.62      0.53      0.57        15
           8       0.29      0.22      0.25         9
           9       0.30      0.20      0.24        15
          10       0.00      0.00      0.00         5
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         8
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# # Filter annotations to exclude 'Other'
# sub_narratives_mask = annotations['taxonomy'] != 'Other'
# sub_narratives = annotations.loc[sub_narratives_mask, 'sub_narratives']
# X_sub_filtered = X_tfidf[sub_narratives_mask]

# # Encode sub-narratives as multi-label
# sub_narratives_encoder = MultiLabelBinarizer()
# y_sub_narratives = sub_narratives_encoder.fit_transform(sub_narratives)


In [16]:
# X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(X_sub_filtered, y_sub_narratives, test_size=0.2, random_state=42)
# sub_narrative_clf = LabelPowerset(XGBClassifier(eval_metric='mlogloss', random_state=42))
# sub_narrative_clf.fit(X_train_sub, y_train_sub)

# # Predict on the test set
# y_pred_sub = sub_narrative_clf.predict(X_test_sub)
# f1_micro_sub = f1_score(y_test_sub, y_pred_sub, average='micro')
# print(f"Micro F1-Score: {f1_micro_sub:.4f}")
# f1_macro_sub = f1_score(y_test_sub, y_pred_sub, average='macro')
# print(f"Macro F1-Score: {f1_macro_sub:.4f}")
# print("Sub-Narrative Classification Report:")
# print(classification_report(y_test_sub, y_pred_sub, target_names=sub_narratives_encoder.classes, zero_division=0))

In [None]:
sub_narratives = annotations.loc[annotations['taxonomy'] != 'Other', 'sub_narratives']

# Step 2: Encode sub-narratives
sub_narratives_encoder = MultiLabelBinarizer()
y_sub_narratives = sub_narratives_encoder.fit_transform(sub_narratives)

X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
    X_tfidf[annotations['taxonomy'] != 'Other'], y_sub_narratives, test_size=0.2, random_state=42
)

# Step 4: Initialize and train Label Powerset classifier for sub-narratives
from xgboost import XGBClassifier

sub_narrative_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
sub_narrative_clf = LabelPowerset(sub_narrative_model)
sub_narrative_clf.fit(X_train_sub, y_train_sub)

# Step 5: Predict sub-narratives
y_pred_sub = sub_narrative_clf.predict(X_test_sub)

# Step 6: Evaluate predictions
f1_micro_sub = f1_score(y_test_sub, y_pred_sub, average='micro')
f1_macro_sub = f1_score(y_test_sub, y_pred_sub, average='macro')
print(f"Micro F1-Score: {f1_micro_sub:.4f}")
print(f"Macro F1-Score: {f1_macro_sub:.4f}")

print("Sub-Narrative Classification Report:")
print(
    classification_report(
        y_test_sub, 
        y_pred_sub, 
        target_names=sub_narratives_encoder.classes_, 
        zero_division=0
    )
)

Micro F1-Score: 0.0846
Macro F1-Score: 0.0494
Sub-Narrative Classification Report:
                                                                                                               precision    recall  f1-score   support

                                        Amplifying Climate Fears: Amplifying existing fears of global warming       0.00      0.00      0.00         0
                                                      Amplifying Climate Fears: Doomsday scenarios for humans       0.00      0.00      0.00         0
                                                                              Amplifying Climate Fears: Other       0.00      0.00      0.00         0
                                            Amplifying war-related fears: By continuing the war we risk WWIII       0.00      0.00      0.00         4
                                            Amplifying war-related fears: NATO should/will directly intervene       0.00      0.00      0.00         0
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
