In [None]:
!pip install -U transformers datasets accelerate scikit-learn matplotlib seaborn --quiet


In [None]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns
import transformers

# Print the versions of the two core libraries so a run can be tied to its environment.
for lib_label, lib_version in (
    ("‚úÖ Transformers version:", transformers.__version__),
    ("‚úÖ PyTorch version:", torch.__version__),
):
    print(lib_label, lib_version)


# Upload files

In [None]:
# Open Colab's file-upload dialog; whatever it returns is kept in `uploaded`.
# NOTE(review): presumably one of three uploads providing eng_clean.csv / spa_clean.csv /
# deu_clean.csv, which the cells below read from the working directory — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
# Second upload prompt. Note that `uploaded` is overwritten by each upload cell,
# and no later cell in this notebook reads it.
# NOTE(review): presumably for another of the *_clean.csv files read below — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
# Third upload prompt; again rebinds `uploaded`.
# NOTE(review): presumably for the last of the *_clean.csv files read below — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
# Load the cleaned English corpus from the working directory.
df_eng = pd.read_csv("eng_clean.csv", encoding="utf-8")

# Fail fast with a readable message if the columns the split/tokenization cells rely on
# ("text" and "polarization") are absent, rather than hitting a KeyError further down.
missing_eng_cols = {"text", "polarization"} - set(df_eng.columns)
assert not missing_eng_cols, f"eng_clean.csv is missing required columns: {sorted(missing_eng_cols)}"

# Quick sanity overview: dimensions, column names, first rows.
print(df_eng.shape)
print(df_eng.columns)
df_eng.head()


In [None]:
# Load the cleaned Spanish corpus from the working directory.
df_spa = pd.read_csv("spa_clean.csv", encoding="utf-8")

# Fail fast with a readable message if the columns the split/tokenization cells rely on
# ("text" and "polarization") are absent, rather than hitting a KeyError further down.
missing_spa_cols = {"text", "polarization"} - set(df_spa.columns)
assert not missing_spa_cols, f"spa_clean.csv is missing required columns: {sorted(missing_spa_cols)}"

# Quick sanity overview: dimensions, column names, first rows.
print(df_spa.shape)
print(df_spa.columns)
df_spa.head()

In [None]:
# Load the cleaned German corpus from the working directory.
df_deu = pd.read_csv("deu_clean.csv", encoding="utf-8")

# Fail fast with a readable message if the columns the split/tokenization cells rely on
# ("text" and "polarization") are absent, rather than hitting a KeyError further down.
missing_deu_cols = {"text", "polarization"} - set(df_deu.columns)
assert not missing_deu_cols, f"deu_clean.csv is missing required columns: {sorted(missing_deu_cols)}"

# Quick sanity overview: dimensions, column names, first rows.
print(df_deu.shape)
print(df_deu.columns)
df_deu.head()

================================================================================
# English
================================================================================

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the English corpus; the fixed seed keeps it reproducible.
eng_texts = df_eng["text"].tolist()
eng_labels = df_eng["polarization"].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    eng_texts,
    eng_labels,
    test_size=0.2,
    random_state=42,
    stratify=df_eng["polarization"],
)

# Report the size of each split.
n_train, n_test = len(train_texts), len(test_texts)
print(f"‚úÖ Training samples: {n_train}")
print(f"‚úÖ Testing samples: {n_test}")


In [None]:
from datasets import Dataset
from transformers import BertTokenizer

# Tokenizer belonging to the English checkpoint fine-tuned below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize(batch):
    """Encode the ``text`` field of a batch, padding/truncating every example to 128 tokens."""
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **encode_options)


# Pair texts with labels in pandas frames, the input format for Dataset.from_pandas.
train_df = pd.DataFrame({"text": train_texts, "labels": train_labels})
test_df = pd.DataFrame({"text": test_texts, "labels": test_labels})

# Build the Hugging Face datasets and tokenize them batch-wise.
train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

# Expose only the model inputs and labels as PyTorch tensors.
torch_columns = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format("torch", columns=torch_columns)
test_dataset.set_format("torch", columns=torch_columns)

print("‚úÖ Tokenization complete! Sample columns:", train_dataset.column_names)


In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Pretrained English BERT with a 2-way classification head (polarized / non-polarized).
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return accuracy and macro precision/recall/F1 for one Trainer evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max of the logits along axis 1. Undefined precision/recall counts as 0.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    macro_p, macro_r, macro_f1, _support = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision_macro": macro_p,
        "recall_macro": macro_r,
        "f1_macro": macro_f1,
    }


In [None]:
import inspect

from transformers import Trainer, TrainingArguments

# The notebook only defines `training_args` in the cell *after* this one, so on a fresh
# kernel (Restart & Run All) this cell used to fail with a NameError. The arguments are
# therefore defined here as well, with the same values as that later cell.
training_args = TrainingArguments(
    output_dir="bert_eng_runs",
    do_train=True,
    do_eval=True,
    save_strategy="epoch",       # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    report_to=[],                # disables wandb etc.
)

# Newer transformers releases take the tokenizer as `processing_class` and deprecate the
# `tokenizer` keyword; pick whichever name the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Start training
trainer.train()


In [None]:
batch_size = 16  # you can try 32 if GPU allows

# Hyper-parameters for fine-tuning the English model.
training_args = TrainingArguments(
    output_dir="bert_eng_runs",
    do_train=True,               # train the model
    do_eval=True,                # informational flag; it does not schedule evaluation by itself
    eval_strategy="epoch",       # actually evaluate at the end of every epoch (as the Spanish/German runs do)
    save_strategy="epoch",       # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    report_to=[]                 # disables wandb etc.
)

print("‚úÖ Training arguments ready (compatible mode)")



In [None]:
from google.colab import drive

# Drive has to be mounted before anything is written below /content/drive. In this
# notebook the first mount cell comes *after* this one, so without this call the model
# would be written to the runtime's local disk instead of Google Drive.
drive.mount('/content/drive')

# Folder in Drive that receives the fine-tuned English model and its tokenizer.
save_path = "/content/drive/MyDrive/POLAR_Task_9/models/english_bert_base_uncased"

trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

print(f"‚úÖ Model and tokenizer saved to: {save_path}")

In [None]:
# Mount Google Drive at /content/drive; the model load/save paths in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the English classifier and tokenizer from the Drive folder, without contacting
# the Hugging Face Hub (local_files_only). This rebinds `model` and `tokenizer`.
english_model_dir = "/content/drive/MyDrive/POLAR_Task_9/models/english_bert_base_uncased"

model = BertForSequenceClassification.from_pretrained(english_model_dir, local_files_only=True)
tokenizer = BertTokenizer.from_pretrained(english_model_dir, local_files_only=True)

print("‚úÖ English BERT model loaded successfully from Drive!")

In [None]:
# Re-running cells to define and train the English model

batch_size = 16  # you can try 32 if GPU allows

# Hyper-parameters for fine-tuning the English model.
training_args = TrainingArguments(
    output_dir="bert_eng_runs",
    do_train=True,               # train the model
    do_eval=True,                # informational flag; it does not schedule evaluation by itself
    eval_strategy="epoch",       # actually evaluate at the end of every epoch (as the Spanish/German runs do)
    save_strategy="epoch",       # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    report_to=[]                 # disables wandb etc.
)

print("‚úÖ Training arguments ready (compatible mode)")

In [None]:
import inspect

from transformers import Trainer

# Newer transformers releases take the tokenizer as `processing_class` and deprecate the
# `tokenizer` keyword; pick whichever name the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Create Trainer object
# NOTE(review): at this point `model` is the checkpoint reloaded from Drive a few cells
# above, so train() continues fine-tuning it rather than starting from the base model —
# confirm this is intended; evaluation alone does not require calling train().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Start training
trainer.train()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Drive folder that receives the English evaluation artefacts.
save_dir = "/content/drive/MyDrive/POLAR_Task_9/models/english_bert_uncased_evaluation"
os.makedirs(save_dir, exist_ok=True)

class_names = ["Non-Polarized", "Polarized"]

# Evaluate on the test set; format only the numeric entries, once, for both console and file.
metrics = trainer.evaluate(test_dataset)
metric_lines = [f"{k}: {v:.4f}" for k, v in metrics.items() if isinstance(v, (int, float))]
print("\n=== Test Metrics ===")
for metric_line in metric_lines:
    print(metric_line)

# Predictions: true labels and arg-max class per test example.
pred_out = trainer.predict(test_dataset)
y_true = pred_out.label_ids
y_pred = np.argmax(pred_out.predictions, axis=1)

# Classification report, computed once and reused for printing and saving.
report = classification_report(y_true, y_pred, target_names=class_names, digits=4)
print("\n=== Detailed Classification Report ===")
print(report)

# Confusion matrix heatmap (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(4.5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix ‚Äì BERT Base Uncased (English)")
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"), bbox_inches='tight', dpi=300)
plt.show()

# Persist the metrics and the report; the explicit encoding keeps the files
# independent of the platform default.
with open(os.path.join(save_dir, "test_metrics.txt"), "w", encoding="utf-8") as f:
    f.writelines(metric_line + "\n" for metric_line in metric_lines)
with open(os.path.join(save_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)



================================================================================
# Spanish
================================================================================

In [None]:
# Spanish
# Step 2: stratified 80/20 train/test split with a fixed seed.
spa_texts = df_spa["text"].tolist()
spa_labels = df_spa["polarization"].tolist()
train_texts_spa, test_texts_spa, train_labels_spa, test_labels_spa = train_test_split(
    spa_texts,
    spa_labels,
    test_size=0.2,
    random_state=42,
    stratify=df_spa["polarization"],
)
print(f"Training samples: {len(train_texts_spa)} | Testing samples: {len(test_texts_spa)}")

# Step 3: tokenize with the Spanish BERT tokenizer.
# NOTE(review): this is the *cased* checkpoint, while the Drive paths and plot titles
# further down say "uncased" — confirm which variant is intended.
tokenizer_spa = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")


def tokenize_spa(batch):
    """Encode the ``text`` field of a batch, padding/truncating every example to 128 tokens."""
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer_spa(batch["text"], **encode_options)


train_df_spa = pd.DataFrame({"text": train_texts_spa, "labels": train_labels_spa})
test_df_spa = pd.DataFrame({"text": test_texts_spa, "labels": test_labels_spa})

# Tokenize batch-wise, then expose only model inputs and labels as PyTorch tensors.
spa_torch_columns = ["input_ids", "attention_mask", "labels"]
train_dataset_spa = (
    Dataset.from_pandas(train_df_spa)
    .map(tokenize_spa, batched=True)
    .with_format("torch", columns=spa_torch_columns)
)
test_dataset_spa = (
    Dataset.from_pandas(test_df_spa)
    .map(tokenize_spa, batched=True)
    .with_format("torch", columns=spa_torch_columns)
)

print("‚úÖ Spanish tokenization complete!")


In [None]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
import os
# 1. Load the Spanish BERT model with a 2-way classification head.
model_spa = BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=2)


# 2. Define metrics
def compute_metrics(eval_pred):
    """Return accuracy and macro precision/recall/F1 for one Trainer evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    arg-max of the logits along the last axis.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # zero_division=0 matches the English metric function: a class that is never
    # predicted contributes 0 instead of triggering an undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision_macro": precision, "recall_macro": recall, "f1_macro": f1}


# Turn off Weights & Biases reporting through the environment.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
# 3. Training arguments
training_args_spa = TrainingArguments(
    output_dir="bert_spanish_runs",
    eval_strategy="epoch",       # evaluate at the end of every epoch
    save_strategy="epoch",       # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    # Disable external loggers explicitly, like the English and German runs do, instead
    # of depending on the WANDB_DISABLED environment variable set in another cell.
    report_to="none",
)

# 4. Trainer
trainer_spa = Trainer(
    model=model_spa,
    args=training_args_spa,
    train_dataset=train_dataset_spa,
    eval_dataset=test_dataset_spa,
    compute_metrics=compute_metrics
)

print("üöÄ Starting training for Spanish...")
trainer_spa.train()


In [None]:
# Mount Google Drive at /content/drive; the Spanish model is loaded from it in the next cell.
from google.colab import drive
drive.mount('/content/drive')


## Load Spanish Model

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the Spanish classifier and tokenizer from Drive without contacting the
# Hugging Face Hub (local_files_only). This rebinds `model_spa` and `tokenizer_spa`.
# NOTE(review): the folder name says "uncased", whereas the training cells above use the
# "...-wwm-cased" checkpoint, and no cell in this notebook saves a Spanish model — confirm
# where this folder comes from.
spanish_model_dir = "/content/drive/MyDrive/POLAR_Task_9/models/spanish_bert_base_uncased"

model_spa = BertForSequenceClassification.from_pretrained(spanish_model_dir, local_files_only=True)
tokenizer_spa = BertTokenizer.from_pretrained(spanish_model_dir, local_files_only=True)

print("‚úÖ Spanish BERT model loaded successfully from Drive!")



In [None]:
from sklearn.model_selection import train_test_split

# Re-create the Spanish hold-out split from df_spa, with the same seed, test size and
# stratification as the earlier Spanish split.
# NOTE(review): the results are bound to the generic names train_texts/test_texts/
# train_labels/test_labels, which the English section also uses — its values are
# overwritten from here on.
spanish_split = train_test_split(
    df_spa["text"].tolist(),
    df_spa["polarization"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df_spa["polarization"],
)
train_texts, test_texts, train_labels, test_labels = spanish_split


In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch

# Tokenize the whole Spanish test split in one call: truncate to at most 128 tokens,
# pad, and return PyTorch tensors.
spa_encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer_spa(test_texts, **spa_encode_options)

# Bundle inputs, attention masks and labels so they can be sliced together.
# NOTE(review): rebinds `test_dataset`, the name the English section uses for its
# Hugging Face test dataset.
spa_label_tensor = torch.tensor(test_labels)
test_dataset = TensorDataset(
    test_encodings["input_ids"],
    test_encodings["attention_mask"],
    spa_label_tensor,
)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import torch
import os

from torch.utils.data import DataLoader

# Drive folder that receives the Spanish evaluation artefacts.
save_dir = "/content/drive/MyDrive/POLAR_Task_9/models/spanish_bert_uncased_evaluation"
os.makedirs(save_dir, exist_ok=True)

# Put the model and every batch on the same device. Using the GPU when one is available
# speeds up inference and avoids a device mismatch if the model already lives on the GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_spa.to(device)
model_spa.eval()  # set model to evaluation mode

all_preds = []
all_labels = []

# Batched inference without gradients (batch size = 16, original order preserved).
with torch.no_grad():
    for input_ids, attention_mask, labels in DataLoader(test_dataset, batch_size=16):
        outputs = model_spa(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        )
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

# Compute metrics (zero_division=0: a never-predicted class counts as 0 without a warning).
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_labels, all_preds, average='macro', zero_division=0
)

# Format the metrics once and reuse the lines for console and file.
metric_lines = [
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-score:  {f1:.4f}",
]
print("=== Test Metrics (Spanish BERT Uncased) ===")
for metric_line in metric_lines:
    print(metric_line)

# === Save metrics ===
with open(os.path.join(save_dir, "test_metrics.txt"), "w", encoding="utf-8") as f:
    f.writelines(metric_line + "\n" for metric_line in metric_lines)

# Classification report
report = classification_report(all_labels, all_preds, target_names=["Non-Polarized", "Polarized"])
print("\n=== Detailed Classification Report ===")
print(report)

# === Save classification report ===
with open(os.path.join(save_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

# Confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(all_labels, all_preds)
plt.figure(figsize=(4.5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=["Non-Polarized", "Polarized"],
            yticklabels=["Non-Polarized", "Polarized"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix ‚Äì Spanish BERT Uncased Model")
plt.tight_layout()

# === Save confusion matrix ===
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"), bbox_inches='tight', dpi=300)
plt.show()

print(f"\n‚úÖ Evaluation results saved to: {save_dir}")


==============================================================================================
# German
==============================================================================================

In [None]:
# üá©üá™ --- GERMAN DATA PREPARATION (for BERT Uncased) ---

from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer

# Step 1: stratified 80/20 train/test split with a fixed seed.
deu_texts = df_deu["text"].tolist()
deu_labels = df_deu["polarization"].tolist()
train_texts_deu, test_texts_deu, train_labels_deu, test_labels_deu = train_test_split(
    deu_texts,
    deu_labels,
    test_size=0.2,
    random_state=42,
    stratify=df_deu["polarization"],
)
print(f"Training samples: {len(train_texts_deu)} | Testing samples: {len(test_texts_deu)}")

# Step 2: tokenizer of the uncased German BERT checkpoint.
tokenizer_deu = BertTokenizer.from_pretrained("bert-base-german-dbmdz-uncased")


# Step 3: tokenization function.
def tokenize_deu(batch):
    """Encode the ``text`` field of a batch, padding/truncating every example to 128 tokens."""
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer_deu(batch["text"], **encode_options)


# Step 4: build the Hugging Face datasets.
train_df_deu = pd.DataFrame({"text": train_texts_deu, "labels": train_labels_deu})
test_df_deu = pd.DataFrame({"text": test_texts_deu, "labels": test_labels_deu})

# Tokenize batch-wise, then expose only model inputs and labels as PyTorch tensors.
deu_torch_columns = ["input_ids", "attention_mask", "labels"]
train_dataset_deu = (
    Dataset.from_pandas(train_df_deu)
    .map(tokenize_deu, batched=True)
    .with_format("torch", columns=deu_torch_columns)
)
test_dataset_deu = (
    Dataset.from_pandas(test_df_deu)
    .map(tokenize_deu, batched=True)
    .with_format("torch", columns=deu_torch_columns)
)

print("‚úÖ German tokenization complete!")


In [None]:
# üá©üá™ --- GERMAN MODEL TRAINING (BERT Uncased) ---

from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support

# Label mapping stored in the model config: class id -> name, and its inverse.
id2label = {0: "Non-Polarized", 1: "Polarized"}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Uncased German BERT with a 2-way classification head and readable label names.
german_checkpoint = "bert-base-german-dbmdz-uncased"
model_deu = BertForSequenceClassification.from_pretrained(
    german_checkpoint,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Metrics function handed to the Trainer.
def compute_metrics(pred):
    """Return accuracy and macro precision/recall/F1 for one Trainer evaluation pass.

    ``pred`` provides ``label_ids`` (true labels) and ``predictions`` (logits); the
    predicted class is the arg-max of the logits along axis 1.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 matches the English metric function: a class that is never
    # predicted contributes 0 instead of triggering an undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision_macro': precision,
        'recall_macro': recall,
        'f1_macro': f1
    }

# Training setup for German: evaluate and checkpoint every epoch, then restore the
# checkpoint with the best macro-F1 once training ends.
# NOTE(review): this rebinds `training_args`, the name the English section also uses.
batch_size = 16
german_training_config = dict(
    output_dir="bert_deu_runs",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
    report_to="none",
)
training_args = TrainingArguments(**german_training_config)

In [None]:
import inspect

# Newer transformers releases take the tokenizer as `processing_class` and deprecate the
# `tokenizer` keyword; pick whichever name the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Trainer setup
# NOTE(review): eval_dataset is the held-out test split, so any checkpoint selection that
# `training_args` drives from evaluation metrics is tuned on test data — consider a
# separate validation split.
trainer_deu = Trainer(
    model=model_deu,
    args=training_args,
    train_dataset=train_dataset_deu,
    eval_dataset=test_dataset_deu,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer_deu},
)

print("üöÄ Starting training for German...")
trainer_deu.train()


In [None]:
from google.colab import drive

# Make sure Drive is mounted before writing below /content/drive; if only the German
# section is run, no earlier cell has mounted it and the files would end up on the
# runtime's local disk instead.
drive.mount('/content/drive')

# Save the German model and tokenizer to Google Drive.
save_dir = "/content/drive/MyDrive/POLAR_Task_9/models/german_bert_base_uncased_model"

model_deu.save_pretrained(save_dir)
tokenizer_deu.save_pretrained(save_dir)

print(f"‚úÖ German BERT model and tokenizer saved to: {save_dir}")


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Reload the German classifier and tokenizer from the Drive folder written by the save
# cell, without contacting the Hugging Face Hub (local_files_only).
german_model_dir = "/content/drive/MyDrive/POLAR_Task_9/models/german_bert_base_uncased_model"

model_ger = BertForSequenceClassification.from_pretrained(german_model_dir, local_files_only=True)
tokenizer_ger = BertTokenizer.from_pretrained(german_model_dir, local_files_only=True)

print("‚úÖ German BERT model loaded successfully from Drive!")


In [None]:
from transformers import Trainer, TrainingArguments

import inspect

# Evaluation-only arguments (no training hyper-parameters needed).
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/POLAR_Task_9/models/german_bert_base_uncased_model/eval_temp",
    per_device_eval_batch_size=16,
    report_to="none",
)

# Newer transformers releases take the tokenizer as `processing_class` and deprecate the
# `tokenizer` keyword; pick whichever name the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

# Evaluate the model that was just loaded from Drive (`model_ger` / `tokenizer_ger`).
# Previously this used the in-memory training objects, which ignored the reloaded
# checkpoint and failed in a session where training was not run.
# compute_metrics is passed so evaluate() also reports accuracy / precision / recall / F1
# instead of only loss and timing figures.
trainer_deu = Trainer(
    model=model_ger,
    args=training_args,
    eval_dataset=test_dataset_deu,  # <--- important
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer_ger},
)


In [None]:
# Run evaluation and prediction on the German test split.
# NOTE(review): the evaluation cell below calls both again and rebinds these names,
# so this cell looks redundant — confirm before removing.
metrics_deu = trainer_deu.evaluate(eval_dataset=test_dataset_deu)
preds_output = trainer_deu.predict(test_dataset=test_dataset_deu)


In [None]:
# üá©üá™ --- GERMAN MODEL EVALUATION AND SAVE RESULTS ---

from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

# === Create save directory ===
save_dir = "/content/drive/MyDrive/POLAR_Task_9/models/german_bert_base_uncased_evaluation"
os.makedirs(save_dir, exist_ok=True)

class_names = ["Non-Polarized", "Polarized"]

# Evaluate on the test set. Only numeric entries are formatted (as in the English cell),
# so a non-numeric metric value cannot break the ":.4f" formatting; the lines are built
# once and reused for console and file.
print("=== Test Metrics (German) ===")
metrics_deu = trainer_deu.evaluate(eval_dataset=test_dataset_deu)  # pass eval dataset
metric_lines = [f"{k}: {v:.4f}" for k, v in metrics_deu.items() if isinstance(v, (int, float))]
for metric_line in metric_lines:
    print(metric_line)

# === Save metrics ===
with open(os.path.join(save_dir, "test_metrics.txt"), "w", encoding="utf-8") as f:
    f.writelines(metric_line + "\n" for metric_line in metric_lines)

# Generate predictions: arg-max class per example, plus the true labels.
preds_output = trainer_deu.predict(test_dataset_deu)
preds = np.argmax(preds_output.predictions, axis=-1)
labels = preds_output.label_ids

# Detailed classification report
report = classification_report(labels, preds, target_names=class_names)
print("\n=== Detailed Classification Report (German) ===")
print(report)

# === Save classification report ===
with open(os.path.join(save_dir, "classification_report.txt"), "w", encoding="utf-8") as f:
    f.write(report)

# Confusion matrix visualization (rows = true class, columns = predicted class)
cm = confusion_matrix(labels, preds)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Purples",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix ‚Äì German BERT Base Uncased")
plt.tight_layout()

# === Save confusion matrix ===
plt.savefig(os.path.join(save_dir, "confusion_matrix.png"), bbox_inches='tight', dpi=300)
plt.show()

print(f"\n‚úÖ All evaluation results saved to: {save_dir}")
