In [7]:
import wandb
wandb.login(key="c26df6b59bfb128917e73bbb00a79ca7e9324a11")




True

In [None]:
import torch
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
import logging
import shutil
import os

logging.basicConfig(level=logging.INFO)

train_path = "/kaggle/input/maqa-unbalanced-with-severity/MAQA_Severity_Train.xlsx"
test_path = "/kaggle/input/maqa-unbalanced-with-severity/MAQA_Severity_Test.xlsx"

train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)

all_data = pd.concat([train_df, test_df], ignore_index=True)[['q_body', 'severity', 'category']]

valid_categories = [
    "ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©", "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ", "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä",
    "ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©", "ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ", "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ", "ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©", "ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ", "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ"
]
all_data = all_data[all_data["category"].isin(valid_categories)].reset_index(drop=True)

valid_severity = ["ÿ≠ÿ±ÿ¨", "ÿ∫Ÿäÿ± ÿ≠ÿ±ÿ¨"]
all_data = all_data[all_data["severity"].isin(valid_severity)].reset_index(drop=True)

all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)
train_df, test_df = train_test_split(
    all_data[['q_body', 'severity']],
    test_size=0.2,
    random_state=42,
    stratify=all_data['severity']
)

print("\nüîπ Training Severity Distribution:")
print(train_df["severity"].value_counts())
print("\nüîπ Test Severity Distribution:")
print(test_df["severity"].value_counts())

severity_mapping = {sev: i for i, sev in enumerate(valid_severity)}
train_df['label'] = train_df['severity'].map(severity_mapping)
test_df['label'] = test_df['severity'].map(severity_mapping)

model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["q_body"], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_pandas(train_df[['q_body', 'label']])
test_dataset = Dataset.from_pandas(test_df[['q_body', 'label']])
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

base_model = AutoModel.from_pretrained(model_name)

class CustomModel(torch.nn.Module):
    def __init__(self, base_model, num_labels):
        super(CustomModel, self).__init__()
        self.base_model = base_model
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(base_model.config.hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        loss = self.loss_fn(logits, labels) if labels is not None else None
        return {"loss": loss, "logits": logits}

model = CustomModel(base_model, num_labels=len(valid_severity))

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=10,
    weight_decay=0.01,
    learning_rate=3e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("\nüîπ Starting Training...")
trainer.train()

test_metrics = trainer.evaluate(test_dataset)
print("\nüîπ Test Metrics:", test_metrics)

predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids

print("\nüîπ Confusion Matrix:")
print(confusion_matrix(labels, preds))

print("\nüîπ Classification Report:")
print(classification_report(labels, preds))

def save_complete_model(model, tokenizer, severity_mapping, save_path):
    model.base_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    torch.save({
        'classifier_state': model.classifier.state_dict(),
        'num_labels': model.classifier.out_features
    }, f"{save_path}/classifier_state.pt")
    with open(f"{save_path}/severity_mapping.pkl", "wb") as f:
        pickle.dump(severity_mapping, f)

save_complete_model(trainer.model, tokenizer, severity_mapping, "XLM-RoBERTa-Severity")

def load_complete_model(model_path):
    base_model = AutoModel.from_pretrained(model_path)
    classifier_state = torch.load(f"{model_path}/classifier_state.pt", map_location=torch.device('cpu'))
    model = CustomModel(base_model, classifier_state['num_labels'])
    model.classifier.load_state_dict(classifier_state['classifier_state'])
    model.eval()
    return model

tokenizer = AutoTokenizer.from_pretrained("XLM-RoBERTa-Severity")
model2 = load_complete_model("XLM-RoBERTa-Severity")
print("‚úÖ Model Loaded Successfully!")

def predict_severity(text, model, tokenizer, severity_mapping):
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
    logits = outputs["logits"]
    predicted_label = torch.argmax(logits, dim=-1).item()
    severity_mapping_reverse = {v: k for k, v in severity_mapping.items()}
    return severity_mapping_reverse[predicted_label]

test_samples = ["ÿßŸÑÿßÿπÿ±ÿßÿ∂ ÿ¥ÿØŸäÿØŸá", "ÿ™ÿ≠ÿ™ÿßÿ¨ ÿßŸÑŸâ ÿ±ÿßÿ≠Ÿá ŸÅŸÇÿ∑", "ÿ≠ÿßŸÑŸá ŸÖÿ™Ÿàÿ≥ÿ∑Ÿá", "ŸÜÿ≤ŸäŸÅ ÿ≠ÿßÿØ"]
for text in test_samples:
    predicted_severity = predict_severity(text, model2, tokenizer, severity_mapping)
    print(f"\nüîπ Input: {text}")
    print(f"Predicted Severity: {predicted_severity}")

shutil.make_archive("XLM-RoBERTa-Severity", 'zip', "XLM-RoBERTa-Severity")
print("\n‚úÖ Model Saved & Zipped for Download!")

# Download link
from IPython.display import FileLink
FileLink(r'XLM-RoBERTa-Severity.zip')



üîπ Training Severity Distribution:
severity
ÿ≠ÿ±ÿ¨        120301
ÿ∫Ÿäÿ± ÿ≠ÿ±ÿ¨     38842
Name: count, dtype: int64

üîπ Test Severity Distribution:
severity
ÿ≠ÿ±ÿ¨        30075
ÿ∫Ÿäÿ± ÿ≠ÿ±ÿ¨     9711
Name: count, dtype: int64


Map:   0%|          | 0/159143 [00:00<?, ? examples/s]

Map:   0%|          | 0/39786 [00:00<?, ? examples/s]

  trainer = Trainer(



üîπ Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3715,0.345586,0.840924,0.843063,0.840924,0.841911
2,0.3008,0.378035,0.826622,0.85516,0.826622,0.834433
3,0.2841,0.374705,0.835998,0.862674,0.835998,0.843201
4,0.234,0.356396,0.853944,0.867076,0.853944,0.858201
5,0.192,0.571796,0.803197,0.863407,0.803197,0.815488
6,0.1567,0.467305,0.834389,0.867244,0.834389,0.84251
7,0.1349,0.553272,0.829387,0.866731,0.829387,0.838259
8,0.1023,0.639973,0.82519,0.866794,0.82519,0.834718
9,0.0771,0.628205,0.842759,0.870679,0.842759,0.849906
10,0.0847,0.684867,0.838134,0.869379,0.838134,0.845901





üîπ Test Metrics: {'eval_loss': 0.35639625787734985, 'eval_accuracy': 0.8539435982506409, 'eval_precision': 0.8670760117303695, 'eval_recall': 0.8539435982506409, 'eval_f1': 0.8582014389961218, 'eval_runtime': 166.7611, 'eval_samples_per_second': 238.581, 'eval_steps_per_second': 1.247, 'epoch': 10.0}





üîπ Confusion Matrix:
[[26183  3892]
 [ 1919  7792]]

üîπ Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.87      0.90     30075
           1       0.67      0.80      0.73      9711

    accuracy                           0.85     39786
   macro avg       0.80      0.84      0.81     39786
weighted avg       0.87      0.85      0.86     39786

‚úÖ Model Loaded Successfully!

üîπ Input: ÿßŸÑÿßÿπÿ±ÿßÿ∂ ÿ¥ÿØŸäÿØŸá
Predicted Severity: ÿ≠ÿ±ÿ¨

üîπ Input: ÿ™ÿ≠ÿ™ÿßÿ¨ ÿßŸÑŸâ ÿ±ÿßÿ≠Ÿá ŸÅŸÇÿ∑
Predicted Severity: ÿ≠ÿ±ÿ¨

üîπ Input: ÿ≠ÿßŸÑŸá ŸÖÿ™Ÿàÿ≥ÿ∑Ÿá
Predicted Severity: ÿ≠ÿ±ÿ¨

üîπ Input: ŸÜÿ≤ŸäŸÅ ÿ≠ÿßÿØ
Predicted Severity: ÿ≠ÿ±ÿ¨

‚úÖ Model Saved & Zipped for Download!


In [17]:
from pprint import pprint
report = classification_report(labels, preds, output_dict=True)
print("\nüîπ Classification Report (high precision):")
print(f"Accuracy: {accuracy_score(labels, preds):.4f}")
print(f"Macro Avg F1-score: {report['macro avg']['f1-score']:.4f}")
print(f"Weighted Avg F1-score: {report['weighted avg']['f1-score']:.4f}")
print("\nüîπ Full Report Dictionary:")
pprint(report)



üîπ Classification Report (high precision):
Accuracy: 0.8539
Macro Avg F1-score: 0.8143
Weighted Avg F1-score: 0.8582

üîπ Full Report Dictionary:
{'0': {'f1-score': 0.9001151657871668,
       'precision': 0.9317130453348517,
       'recall': 0.870590191188695,
       'support': 30075},
 '1': {'f1-score': 0.7283944846926852,
       'precision': 0.6668948990071893,
       'recall': 0.8023890433528987,
       'support': 9711},
 'accuracy': 0.8539435982506409,
 'macro avg': {'f1-score': 0.814254825239926,
               'precision': 0.7993039721710204,
               'recall': 0.8364896172707968,
               'support': 39786},
 'weighted avg': {'f1-score': 0.8582014389961218,
                  'precision': 0.8670760117303695,
                  'recall': 0.8539435982506409,
                  'support': 39786}}
