In [None]:
!pip install pandas pyarrow fastparquet huggingface_hub

In [None]:
import pandas as pd

# Load the SDCNL training split directly from the Hugging Face Hub.
# The hf:// URL scheme relies on the huggingface_hub package installed in the first cell.
df = pd.read_parquet("hf://datasets/KAIST-IC-LAB721/SDCNL/data/train-00000-of-00001.parquet")

In [None]:
# Show the loaded DataFrame as the cell output.
df

In [None]:
# Preview the first rows of the 'text' column.
df['text'].head()

In [None]:
def clean_text(text):
    """Return `text` lowercased, keeping only alphanumeric characters and spaces."""
    return ''.join(ch for ch in text if ch.isalnum() or ch == ' ').lower()


# Clean every row of the 'text' column in one pass.
# Do not name a variable `str` at notebook scope: rebinding the builtin breaks
# every later `str(...)` call in the notebook.
df['text'] = df['text'].map(clean_text)

In [None]:
# Inspect the 'text' value at index 0 to check the result of the cleaning step.
df['text'][0]

In [None]:
!pip install transformers torch

In [None]:
from transformers import AutoTokenizer, XLMRobertaModel

# Tokenizer and base encoder are loaded from the same pretrained checkpoint.
checkpoint = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Dropout is raised to 0.3 for both hidden states and attention probabilities.
# NOTE(review): XLMRobertaModel is the bare encoder, and `model` is reassigned later
# in the notebook via XLMRobertaForSequenceClassification.from_pretrained without
# these dropout values -- confirm which configuration is intended for training.
model = XLMRobertaModel.from_pretrained(
    checkpoint,
    num_labels=2,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)

In [None]:
!pip install  tqdm scikit-learn

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [29]:
# Report whether PyTorch's MPS backend is available on this machine.
print("MPS available:", torch.backends.mps.is_available())

MPS available: False


In [None]:
def get_device():
    """Return the torch device to run on, preferring MPS, then CUDA, then CPU."""
    if torch.backends.mps.is_available():
        return torch.device("mps")
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

In [None]:
# Select the compute device once and report which one was chosen.
device = get_device()
print(f"Current device is: {device}")  # Prints the torch.device returned by get_device(), e.g. "cuda"
print(f"Is CUDA?: {device.type == 'cuda'}")  # True only when the selected device type is CUDA

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: indexable collection of input texts.
        labels: indexable collection of integer class labels, aligned with `texts`.
        tokenizer: callable that accepts a text plus the keyword arguments used in
            `__getitem__` and returns a mapping with 'input_ids' and 'attention_mask'.
        max_length: length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened 'input_ids' and 'attention_mask' and a long 'label' tensor."""
        sample_text = self.texts[idx]
        sample_label = self.labels[idx]

        tokenized = self.tokenizer(
            sample_text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each field is a 1-D tensor per sample.
        item = {field: tokenized[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label, dtype=torch.long)
        return item

In [None]:
# Hyperparameters for training need to be modified to prevent overfitting

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Training function
def _binary_metrics(labels, predictions, probs):
    """Compute accuracy (in percent), precision, recall, F1 and AUROC.

    `probs` are the scores of class 1 and are only used for AUROC.
    """
    return {
        'Accuracy': accuracy_score(labels, predictions) * 100,
        'Precision': precision_score(labels, predictions),
        'Recall': recall_score(labels, predictions),
        'F1-score': f1_score(labels, predictions),
        'AUROC': roc_auc_score(labels, probs),
    }


def _print_metrics(split, metrics):
    """Print one line per metric, prefixed with the split name (e.g. 'Training')."""
    print(f"{split} Accuracy: {metrics['Accuracy']:.2f}%")
    for name in ('Precision', 'Recall', 'F1-score', 'AUROC'):
        print(f'{split} {name}: {metrics[name]:.4f}')


def _evaluate(model, loader, device):
    """Run `model` over `loader` in eval mode without gradients and return its metrics."""
    model.eval()
    all_labels = []
    all_predictions = []
    all_probs = []

    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            predictions = torch.argmax(outputs.logits, dim=1)
            probs = torch.softmax(outputs.logits, dim=1)[:, 1]

            all_labels.extend(batch['label'].cpu().numpy())
            all_predictions.extend(predictions.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())

    return _binary_metrics(all_labels, all_predictions, all_probs)


def train_model(model, train_loader, val_loader, device, num_epochs=15):
    """Fine-tune `model` and save the weights of the best epoch by validation accuracy.

    Args:
        model: sequence-classification model whose forward pass accepts `input_ids`,
            `attention_mask` and (for training) `labels`, and returns an object with
            `.loss` and `.logits`.
        train_loader: iterable of batches with keys 'input_ids', 'attention_mask', 'label'.
        val_loader: iterable of batches with the same keys, used after every epoch.
        device: torch device the batches are moved to.
        num_epochs: number of passes over `train_loader`.

    Returns:
        list of dict: one entry per epoch with keys 'epoch', 'train_loss',
        'train' and 'val' (the last two are metric dicts).

    Side effects:
        Writes 'best_model.pt' whenever validation accuracy improves and prints
        the metrics of every epoch.
    """
    # torch.optim.AdamW is used instead of the AdamW class exported by transformers,
    # which is deprecated in favour of the PyTorch implementation.
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.1)

    best_val_acc = 0
    history = []

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        correct = 0
        total = 0
        all_train_labels = []
        all_train_predictions = []
        all_train_probs = []

        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

            # Running accuracy shown in the progress bar
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

            all_train_labels.extend(labels.cpu().numpy())
            all_train_predictions.extend(predictions.cpu().numpy())
            # detach() is required because these logits still carry gradient history
            all_train_probs.extend(torch.softmax(outputs.logits, dim=1)[:, 1].detach().cpu().numpy())

            progress_bar.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{(correct/total)*100:.2f}%'
            })

        # Training metrics for the whole epoch
        train_loss = total_loss / max(num_batches, 1)
        train_metrics = _binary_metrics(all_train_labels, all_train_predictions, all_train_probs)
        print(f'Training Loss: {train_loss:.4f}')
        _print_metrics('Training', train_metrics)

        # Validation
        print("\nRunning validation...")
        val_metrics = _evaluate(model, val_loader, device)
        _print_metrics('Validation', val_metrics)

        val_acc = val_metrics['Accuracy']
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Saved new best model with validation accuracy: {val_acc:.2f}%')

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train': train_metrics,
            'val': val_metrics,
        })

    return history

In [None]:
# Set device
device = get_device()
print(f"Using device: {device}")

# Initialize model with proper configuration
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=2,
    problem_type="single_label_classification"
).to(device)

# Split data 80/10/10 into train/validation/test - making sure to convert to list
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    list(df['text']),
    list(df['label']),
    test_size=0.2,
    random_state=42
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42
)

# Create datasets
train_dataset = TextClassificationDataset(
    train_texts,
    train_labels,
    tokenizer
)

val_dataset = TextClassificationDataset(
    val_texts,
    val_labels,
    tokenizer
)

# Create dataloaders with smaller batch size and no worker processes on CPU.
# `device` is a torch.device object, so the check goes through `device.type`;
# comparing the object itself to the string 'cpu' is not a reliable test.
on_cpu = device.type == 'cpu'
batch_size = 4 if on_cpu else 32
num_workers = 0 if on_cpu else 4

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)

# Print dataset sizes
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

# Train the model
try:
    train_model(model, train_loader, val_loader, device)
except KeyboardInterrupt:
    print("Training interrupted by user")
except Exception as e:
    # Format the exception through the f-string rather than calling str(), so the
    # message still works if the builtin name `str` was rebound earlier in the notebook.
    print(f"Error during training: {e}")
    raise

In [None]:
def predict(texts, model, tokenizer, device):
    """Return the predicted class index for each input text as a NumPy array.

    Args:
        texts: a single string or a list of strings, passed to `tokenizer` as-is.
        model: classifier whose output exposes `.logits`.
        tokenizer: callable producing 'input_ids' and 'attention_mask' tensors.
        device: torch device the encoded inputs are moved to.
    """
    model.eval()
    batch = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
    )
    model_inputs = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }

    # Inference only: no gradient tracking needed.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        predicted = torch.argmax(logits, dim=1)

    return predicted.cpu().numpy()


def test_model(model, test_loader, device):
    """Evaluate `model` on every batch of `test_loader` and print the accuracy.

    Batches must provide 'input_ids', 'attention_mask' and 'label'. The accuracy
    is printed as a fraction with two decimals; nothing is returned.
    """
    model.eval()
    true_labels, predicted_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask).logits

            true_labels.extend(targets.cpu().numpy())
            predicted_labels.extend(torch.argmax(logits, dim=1).cpu().numpy())

    acc = accuracy_score(true_labels, predicted_labels)
    print(f"Test Accuracy: {acc:.2f}")
    

test_dataset = TextClassificationDataset(
    test_texts,
    test_labels,
    tokenizer
)

test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    # No worker processes on CPU; `device` is a torch.device, so check its type.
    num_workers=0 if device.type == 'cpu' else 4
)

print(f"Testing samples: {len(test_dataset)}")

# Load the best model; map_location keeps the load working when the checkpoint
# was written from a different device than the current one.
model.load_state_dict(torch.load('best_model.pt', map_location=device))
model.to(device)

# Example prediction: classify both sample texts in one call so that each
# prediction lines up with its own text and true label.
new_texts = [test_dataset.texts[0], test_dataset.texts[1]]
predictions = predict(new_texts, model, tokenizer, device)
print(f'Sample text: {new_texts}\n True labels: {test_dataset.labels[0], test_dataset.labels[1]}\n Predictions: {predictions}')

# Test the model: collect labels, hard predictions and class-1 probabilities
model.eval()
all_labels = []
all_predictions = []
all_probs = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        probs = torch.softmax(outputs.logits, dim=1)[:, 1].cpu().numpy()

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted_labels)
        all_probs.extend(probs)

test_acc = accuracy_score(all_labels, all_predictions) * 100
test_precision = precision_score(all_labels, all_predictions)
test_recall = recall_score(all_labels, all_predictions)
test_f1 = f1_score(all_labels, all_predictions)
test_auroc = roc_auc_score(all_labels, all_probs)

print(f'Test Accuracy: {test_acc:.2f}%')
print(f'Test Precision: {test_precision:.4f}')
print(f'Test Recall: {test_recall:.4f}')
print(f'Test F1-score: {test_f1:.4f}')
print(f'Test AUROC: {test_auroc:.4f}')

In [None]:
!pip install shap lime

In [None]:
# Need to fix up the code below

In [None]:
import shap, transformers, numpy as np, pandas as pd
# `plt` must be the pyplot module: the bare `matplotlib` package has no callable
# figure()/title()/show().
import matplotlib.pyplot as plt

# Wrap the fine-tuned model in a text-classification pipeline so SHAP can call it
# on raw strings. The pipeline needs the tokenizer object itself (not a tokenized
# batch); truncation settings are forwarded to the tokenizer on every call.
pred = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=None,  # return the score of every class (replaces return_all_scores=True)
    truncation=True,
    max_length=512,
)

shap_explainer = shap.Explainer(pred)
shap_values = shap_explainer(test_texts)

shap.initjs()
# Mean token attribution per output class (index 0 and index 1)
shap.plots.bar(shap_values[:, :, 0].mean(0), order=shap.Explanation.argsort)
shap.plots.bar(shap_values[:, :, 1].mean(0), order=shap.Explanation.argsort)
figure = plt.figure()
# NOTE(review): summary_plot is designed for tabular feature matrices; confirm it
# accepts text explanations, otherwise shap.plots.text(shap_values) is the usual view.
shap.summary_plot(shap_values, test_texts)

In [None]:
from lime.lime_text import LimeTextExplainer
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vocabulary of the test texts, printed for inspection only.
# It is not an input to the LIME text explainer below.
vectorizer = CountVectorizer(lowercase=True, stop_words="english")
X_test = vectorizer.fit_transform(test_texts)
feature_names = vectorizer.get_feature_names_out()
print("Feature Names:", feature_names)


def lime_predict_proba(texts, chunk_size=32):
    """Return class probabilities for raw texts as an array of shape (len(texts), 2).

    LIME calls this with a list of perturbed strings (thousands per explanation),
    so the texts are pushed through the model in chunks. The fine-tuned torch
    model has no scikit-learn style `predict_proba`, hence this wrapper.
    """
    texts = list(texts)
    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            encoded = tokenizer(
                texts[start:start + chunk_size],
                add_special_tokens=True,
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors='pt'
            )
            logits = model(
                input_ids=encoded['input_ids'].to(device),
                attention_mask=encoded['attention_mask'].to(device)
            ).logits
            chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.concatenate(chunks, axis=0)


# The classifier consumes raw text, so LIME's text explainer is used: it perturbs
# the words of the instance and queries lime_predict_proba on the perturbed strings.
lime_explainer = LimeTextExplainer(
    class_names=['0', '1'],
    verbose=True
)

# Testing single instance
lime_explanation = lime_explainer.explain_instance(
    test_texts[0],
    lime_predict_proba,
    num_features=10
)

lime_explanation.show_in_notebook()

In [None]:
# Running LIME for multiple instances
# NOTE(review): `model` is created with XLMRobertaForSequenceClassification.from_pretrained
# earlier in the notebook; confirm it exposes `predict_proba` -- torch modules normally
# do not, so a wrapper returning class probabilities is presumably needed here.
# NOTE(review): the first argument of explain_instance must match the kind of explainer
# that `lime_explainer` is (count vector vs. raw text) -- verify against its definition.
# X_test.toarray() is re-evaluated for the range and again on every iteration.
for i in range(len(X_test.toarray())):
    explanation = lime_explainer.explain_instance(
        X_test.toarray()[i],
        model.predict_proba,
        num_features=10
    )

    explanation.show_in_notebook()

In [None]:
# Compare SHAP and LIME attributions side by side.
# NOTE(review): `shap_values` was computed over all test texts; for text inputs the
# per-sample arrays presumably differ in length and `feature_names` is presumably a
# list per sample -- confirm shapes, since the sum, the dict lookup by name and the
# DataFrame below all assume one flat vector of features.
shap_feature_importance = shap_values.values
shap_feature_names = shap_values.feature_names

# LIME weights of the single explained instance, keyed by feature name
lime_feature_importance = {name: weight for name, weight in lime_explanation.as_list()}

# Normalize each method's absolute importances so they sum to 1;
# features missing from the LIME explanation get weight 0.
shap_norm = np.abs(shap_feature_importance / np.sum(np.abs(shap_feature_importance)))
lime_norm = np.array([lime_feature_importance.get(name, 0) for name in shap_feature_names])
lime_norm = np.abs(lime_norm / np.sum(np.abs(lime_norm)))

combined_importance = pd.DataFrame({
    "Feature": shap_feature_names,
    "SHAP Importance": shap_norm,
    "LIME Importance": lime_norm
})

combined_importance = combined_importance.sort_values(by="SHAP Importance", ascending=False)

# NOTE(review): title/ylabel/show are pyplot functions, so `plt` must be bound to
# matplotlib.pyplot here -- confirm the import that defines `plt`.
combined_importance.set_index("Feature").plot(kind="bar", figsize=(10, 6))
plt.title("SHAP vs LIME Feature Importance")
plt.ylabel("Normalized Importance")
plt.show()