In [None]:
import nltk
from nltk.tree import Tree
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

from transformers import AutoTokenizer

# Module-level tokenizer for "bert-base-uncased"; SST5_Dataset calls this
# global directly when it encodes sentences for the classification stage.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def read_ptb_tree(tree_string):
    """Parse one bracketed tree string into an ``nltk.tree.Tree``."""
    parsed_tree = Tree.fromstring(tree_string)
    return parsed_tree

def extract_sentence_and_label(tree):
    """Return ``(sentence, label)`` for a parsed tree.

    The sentence is the tree's leaves joined with single spaces; the label is
    whatever the root node's ``label()`` returns, passed through unchanged.
    """
    root_label = tree.label()
    return ' '.join(tree.leaves()), root_label

def read_file(file_path):
    """Load a file of one-tree-per-line records.

    Args:
        file_path: path to a UTF-8 text file; each non-blank line is parsed
            with ``read_ptb_tree``.

    Returns:
        A list of ``{'sentence': str, 'label': <root label>}`` dicts, one per
        non-blank line, in file order.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            tree_string = line.strip()
            if not tree_string:
                # A blank line (typically a trailing newline at end of file)
                # is not a tree; parsing it would raise and abort the load.
                continue
            sentence, label = extract_sentence_and_label(read_ptb_tree(tree_string))
            data.append({'sentence': sentence, 'label': label})
    return data

In [None]:
# Locations of the three splits inside the Kaggle input dataset.
train_path, test_path, dev_path = (
    '/kaggle/input/treeset/train.txt',
    '/kaggle/input/treeset/test.txt',
    '/kaggle/input/treeset/dev.txt',
)

# Parse each split into a list of {'sentence', 'label'} records
# (train first, then test, then dev).
train_data, test_data, dev_data = map(read_file, (train_path, test_path, dev_path))

In [None]:
def group_data_by_level(data):
    """Bucket sentences by their integer label.

    Each record's ``'label'`` is converted with ``int`` and used as the key;
    the record's ``'sentence'`` is appended to that key's list. Keys appear in
    first-seen order and sentences keep their input order.
    """
    buckets = {}
    for record in data:
        buckets.setdefault(int(record['label']), []).append(record['sentence'])
    return buckets

# Index the training sentences by integer label; only the training split is
# grouped here, and the result is the input to triple sampling.
data_by_level = group_data_by_level(train_data)


In [None]:
import random

def create_large_data_pairs(data_by_level, target_size):
    """Sample (anchor, positive, hard-negative) sentence triples.

    Levels are visited round-robin in the dict's own order. On each visit two
    distinct sentences are drawn from the level (anchor and positive) and one
    sentence is drawn from an adjacent level (``level - 1`` or ``level + 1``)
    to act as the hard negative.

    Args:
        data_by_level: dict mapping an integer level to a list of sentences.
        target_size: number of triples to produce.

    Returns:
        A list of ``(anchor, positive, hard_negative)`` tuples with exactly
        ``target_size`` entries (empty when ``target_size <= 0``). The same
        triple may be sampled more than once.

    Raises:
        ValueError: if triples are requested but no level has both two
            sentences of its own and a non-empty adjacent level. Without this
            check the sampling loop would never terminate.
    """
    pairs = []
    if target_size <= 0:
        return pairs

    # Work out once which adjacent levels can actually supply a negative.
    # This replaces the hard-coded "0 -> 1, 4 -> 3" edges, so label sets that
    # are not exactly 0..4 (or that have an empty level) no longer raise
    # KeyError / IndexError.
    neighbours = {}
    for level, sentences in data_by_level.items():
        if len(sentences) < 2:
            continue
        adjacent = [lv for lv in (level - 1, level + 1) if data_by_level.get(lv)]
        if adjacent:
            neighbours[level] = adjacent

    if not neighbours:
        raise ValueError(
            "cannot sample triples: no level has two sentences and a non-empty adjacent level"
        )

    while len(pairs) < target_size:
        for level, adjacent in neighbours.items():
            sen0, sent1 = random.sample(data_by_level[level], 2)

            # With a single candidate no random draw is made, matching the
            # original behaviour at the lowest and highest level.
            if len(adjacent) == 1:
                hard_neg_level = adjacent[0]
            else:
                hard_neg_level = random.choice(adjacent)

            hard_neg = random.choice(data_by_level[hard_neg_level])
            pairs.append((sen0, sent1, hard_neg))

            if len(pairs) >= target_size:
                break

    return pairs

target_size = 300000

# Seed Python's RNG so the same triples are sampled on every
# Restart Kernel -> Run All; without it each run trains on different data.
random.seed(42)
pairs = create_large_data_pairs(data_by_level, target_size)

In [None]:
# Drop duplicate triples while keeping first-seen order. ``set`` would also
# de-duplicate, but its iteration order for tuples of strings changes between
# kernel sessions (string hashing is randomised per process), which makes any
# later slicing or sub-sampling of ``pairs`` irreproducible.
pairs = list(dict.fromkeys(pairs))

In [None]:
# pairs = pairs[:1000] ##sampling

In [None]:
import re
import unicodedata

def canonicalize_text(text):
    """Normalise a sentence to lowercase, accent-free, letters-only text.

    Steps, in order:
      1. NFD-decompose and drop combining marks (category ``Mn``), so accented
         letters collapse to their base letter.
      2. Replace every run of digits, non-word characters and underscores
         with a single space.
      3. Lowercase and strip leading/trailing whitespace.

    Args:
        text: input string.

    Returns:
        The canonicalised string (possibly empty).
    """
    # Accents are stripped BEFORE the regex runs. A combining mark is a
    # non-word character, so if the input is already decomposed
    # ("e" + U+0301) the regex would otherwise turn the mark into a space and
    # split the word ("cafe s" instead of "cafes").
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    text = re.sub(r'[\d\W_]+', ' ', text)

    return text.lower().strip()

In [None]:
class SST5_Dataset(Dataset):
    """Map-style dataset of pre-tokenised sentences with integer labels.

    Every record returned by ``read_file(file_path)`` is canonicalised,
    tokenised with the module-level ``tokenizer`` and stored eagerly, so all
    tokenisation cost is paid once at construction time.

    Args:
        file_path: path handed to ``read_file``.
        max_length: length every sequence is padded/truncated to. Defaults to
            512 (the previously hard-coded value). Every sample is padded to
            this length, so a smaller value shrinks each batch and the
            per-step compute accordingly.
    """

    def __init__(self, file_path, max_length=512):
        self.data = []
        for row in read_file(file_path):
            encoding = tokenizer(
                canonicalize_text(row['sentence']),
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors="pt",
            )
            label = row['label']
            # Labels read from file are strings; already-numeric labels pass through.
            self.data.append((encoding, int(label) if isinstance(label, str) else label))

    def __len__(self):
        """Number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``.

        The tokenizer was called with ``return_tensors="pt"`` on a single
        sentence, so each stored tensor has a leading dimension of size 1 that
        is squeezed away here. The label is returned as a ``torch.long`` scalar.
        """
        X, y = self.data[idx]
        input_ids = X['input_ids'].squeeze(0)
        attention_mask = X['attention_mask'].squeeze(0)
        label = torch.tensor(y, dtype=torch.long)
        return input_ids, attention_mask, label

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from torch.utils.data import DataLoader

# Tokenise the three splits for the classification stage
# (built in the order train, test, dev).
trainset, testset, valset = (
    SST5_Dataset(split_path) for split_path in (train_path, test_path, dev_path)
)

In [None]:
# from torch.utils.data import Subset
# import random

# train_indices = random.sample(range(len(trainset)), 500)
# test_indices = random.sample(range(len(testset)), 100)
# val_indices = random.sample(range(len(valset)), 100)

# trainset = Subset(trainset, train_indices)
# testset = Subset(testset, test_indices)
# valset = Subset(valset, val_indices)

In [None]:
from transformers import BertTokenizer

def create_triples(pairs):
    """Canonicalise and tokenise every (anchor, positive, negative) triple.

    For each input triple the three sentences are passed through
    ``canonicalize_text``; each cleaned sentence is also tokenised with a
    freshly loaded ``bert-base-uncased`` tokenizer after being wrapped in
    literal ``[CLS]`` / ``[SEP]`` markers.

    Returns:
        A list of 6-tuples
        ``(anchor, anchor_tokens, pos, pos_tokens, neg, neg_tokens)``.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def wordpieces(sentence):
        # Token list for the sentence surrounded by [CLS] ... [SEP].
        return tokenizer.tokenize('[CLS] ' + sentence + ' [SEP]')

    triples = []
    for raw_anchor, raw_pos, raw_neg in tqdm(pairs):
        anchor, pos, neg = (
            canonicalize_text(raw) for raw in (raw_anchor, raw_pos, raw_neg)
        )
        triples.append(
            (anchor, wordpieces(anchor), pos, wordpieces(pos), neg, wordpieces(neg))
        )

    return triples

# Cleaned + tokenised triples for contrastive training. Each entry is
# (anchor, anchor_tokens, pos, pos_tokens, neg, neg_tokens).
nli_data = create_triples(pairs)

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import numpy as np

class NLI_Dataset(Dataset):
    """Dataset of contrastive triples plus the batching logic that encodes them.

    Items are 6-tuples in which positions 0, 2 and 4 hold the anchor,
    positive and negative sentence text and position 1 holds the anchor's
    token list (used only to order samples by length).
    """

    def __init__(self, triples):
        self.triples = triples
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Number of triples."""
        return len(self.triples)

    def __getitem__(self, idx):
        """Return the stored triple at ``idx`` unchanged."""
        return self.triples[idx]

    def pad_data(self, data):
        """Tokenise the three text columns of ``data`` with dynamic padding.

        Returns a 6-tuple: ``input_ids`` and ``attention_mask`` for the
        anchors, then for the positives, then for the negatives.
        """
        encoded = []
        for text_slot in (0, 2, 4):  # anchor, positive, negative text
            sentences = [row[text_slot] for row in data]
            enc = self.tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)
            encoded.extend((enc['input_ids'], enc['attention_mask']))
        return tuple(encoded)

    def collate_fn(self, all_data):
        """Sort samples by anchor length and encode them in chunks of 32.

        ``all_data`` is sorted in place, longest anchor token list first.

        Returns:
            A list with one dict per chunk of up to 32 samples, each holding
            the six tensors produced by ``pad_data``.
        """
        all_data.sort(key=lambda row: len(row[1]), reverse=True)

        field_names = (
            'token_ids_anchor',
            'attention_mask_anchor',
            'token_ids_pos',
            'attention_mask_pos',
            'token_ids_neg',
            'attention_mask_neg',
        )

        batches = []
        for start_idx in range(0, len(all_data), 32):
            chunk = all_data[start_idx:start_idx + 32]
            batches.append(dict(zip(field_names, self.pad_data(chunk))))

        return batches

# Dataset instance for the contrastive stage; train_cl also reads this global
# for its collate function.
NLI_dataset = NLI_Dataset(nli_data)

In [None]:
from transformers import BertModel

import logging
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, f1_score, recall_score, accuracy_score

In [None]:
def supCL_loss(criterion, anchor, pos, neg, new_w=0, temper=0.05):
    """Contrastive loss over in-batch positives and hard negatives.

    Cosine similarities between every anchor and every positive, and between
    every anchor and every negative, are scaled by ``1 / temper`` and
    concatenated column-wise into one logit matrix. Row ``i`` is labelled with
    class ``i``, i.e. the anchor's own positive is the target.

    Args:
        criterion: callable ``criterion(logits, labels)``, e.g.
            ``nn.CrossEntropyLoss()``.
        anchor, pos, neg: embedding tensors; assumed ``(batch, dim)`` with the
            same batch size for all three -- TODO confirm against callers.
        new_w: constant added to the logit of each anchor's own hard negative
            (the diagonal of the negative block). ``0`` disables it.
        temper: temperature dividing the cosine similarities.

    Returns:
        Whatever ``criterion`` returns for the assembled logits and labels.
    """
    cos = nn.CosineSimilarity(dim=-1)
    pos_sim = cos(anchor.unsqueeze(1), pos.unsqueeze(0)) / temper
    neg_sim = cos(anchor.unsqueeze(1), neg.unsqueeze(0)) / temper

    if new_w != 0:
        # Built directly on the right device/dtype instead of assembling a
        # nested Python list and copying it over on every training step.
        n_neg = neg_sim.size(-1)
        neg_sim = neg_sim + new_w * torch.eye(n_neg, device=neg_sim.device, dtype=neg_sim.dtype)

    cos_sim = torch.cat([pos_sim, neg_sim], dim=1)
    labels = torch.arange(cos_sim.size(0), device=anchor.device)
    return criterion(cos_sim, labels)

In [None]:
# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-trained BERT encoder that the contrastive stage fine-tunes,
# moved onto the selected device.
model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Train with SimCSE
def train_cl(model, criterion, cl_loss, trainset, epochs, path='/kaggle/working/best_model_cl.pth'):
    """Fine-tune ``model`` with a contrastive objective over sentence triples.

    Args:
        model: encoder whose output exposes ``['pooler_output']``.
        criterion: loss passed through to ``cl_loss``.
        cl_loss: callable ``cl_loss(criterion, anchor, pos, neg)`` returning
            a scalar loss tensor.
        trainset: dataset of triples. Its own ``collate_fn`` is used when it
            has one; otherwise the global ``NLI_dataset.collate_fn`` is used.
        epochs: number of passes over ``trainset``.
        path: where the model's ``state_dict`` is written once, after the
            final epoch (it is the last state, not a best-so-far checkpoint).

    Returns:
        The same ``model`` object, trained in place.
    """
    # Collate with the dataset that is actually being trained on. The function
    # previously always used the global NLI_dataset, silently ignoring the
    # ``trainset`` argument for batching/tokenisation.
    collate_fn = getattr(trainset, 'collate_fn', None)
    if collate_fn is None:
        collate_fn = NLI_dataset.collate_fn

    train_loader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_fn)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(epochs):
        model.train()

        train_loss = 0.0
        for batch in tqdm(train_loader, desc=f"Training SimCSE ...:"):
            optimizer.zero_grad()

            # NLI_Dataset.collate_fn returns a list of chunk dicts (chunks of
            # 32). The loader's batch size is also 32, so there is exactly one
            # chunk per batch; only that first chunk is read.
            chunk = batch[0]
            input_ids_anchor = chunk['token_ids_anchor'].to(device)
            attention_mask_anchor = chunk['attention_mask_anchor'].to(device)
            input_ids_pos = chunk['token_ids_pos'].to(device)
            attention_mask_pos = chunk['attention_mask_pos'].to(device)
            input_ids_neg = chunk['token_ids_neg'].to(device)
            attention_mask_neg = chunk['attention_mask_neg'].to(device)

            anchor_output = model(input_ids_anchor, attention_mask_anchor)['pooler_output']
            pos_output = model(input_ids_pos, attention_mask_pos)['pooler_output']
            neg_output = model(input_ids_neg, attention_mask_neg)['pooler_output']

            loss = cl_loss(criterion, anchor_output, pos_output, neg_output)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)

        print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.4f}")

    torch.save(model.state_dict(), path)

    return model

# Run the contrastive stage for 3 epochs. train_cl returns the model it was
# given, so `cl_model` and `model` refer to the same module after this line.
cl_model = train_cl(model, nn.CrossEntropyLoss(), supCL_loss, NLI_dataset, epochs=3)


In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a small MLP classification head.

    Args:
        num_labels: number of output classes.
        bert: encoder module to wrap. Defaults to the module-level
            ``cl_model`` (the contrastively fine-tuned encoder). Note that the
            default is shared, not copied: every classifier built without an
            explicit ``bert`` trains and loads weights into the same encoder.

    The head's input width is read from ``bert.config.hidden_size`` when that
    attribute exists and falls back to 768 otherwise.
    """

    def __init__(self, num_labels, bert=None):
        super(BertClassifier, self).__init__()
        self.bert = cl_model if bert is None else bert
        # Frozen bert
        #self.bert.requires_grad_(False)

        hidden_size = getattr(getattr(self.bert, 'config', None), 'hidden_size', 768)

        self.dropout = nn.Dropout(0.1)
        # Layer order is kept as-is so existing checkpoints
        # (classifier.0 / classifier.2 / classifier.4) still load.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.5),
            nn.Linear(256, num_labels)
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's ``pooler_output``."""
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

In [None]:
def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    Each batch is expected to unpack into ``(input_ids, attention_mask,
    labels)``. Prints and returns the loss averaged over the number of
    batches.
    """
    model.train()
    running_loss = 0.0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    mean_loss = running_loss / len(dataloader)

    print(f"Train Loss: {mean_loss:.4f}")
    return mean_loss

def eval_one_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Prints and returns ``(mean_loss, accuracy)``, where the loss is averaged
    over the number of batches and accuracy is computed over all samples with
    ``accuracy_score``.
    """
    model.eval()
    total_loss = 0.0
    predictions, targets = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            logits = model(input_ids, attention_mask)
            total_loss += criterion(logits, labels).item()

            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    mean_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(targets, predictions)
    print(f"Eval Loss: {mean_loss:.4f}, Accuracy: {accuracy:.4f}")

    return mean_loss, accuracy

In [None]:
def train_cls(model, criterion, trainset, valset, epochs, save_dir='/kaggle/working/'):
    """Train the classifier and checkpoint the epoch with the lowest validation loss.

    Args:
        model: classifier to optimise in place.
        criterion: loss used for both training and validation.
        trainset, valset: datasets yielding ``(input_ids, attention_mask, label)``.
        epochs: number of training epochs.
        save_dir: directory for ``best_model_cls.pth``; a trailing slash is
            optional.

    Returns:
        ``model`` as it stands after the final epoch. This is not necessarily
        the best checkpoint -- reload ``best_model_cls.pth`` for that.
    """
    import os  # local import: only needed to build the checkpoint path

    train_loader = DataLoader(trainset, batch_size=32, shuffle=True)
    val_loader = DataLoader(valset, batch_size=32, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    # os.path.join instead of string concatenation, so a save_dir without a
    # trailing slash no longer produces a mangled file name.
    best_model_path = os.path.join(save_dir, "best_model_cls.pth")

    best_val_loss = float('inf')
    for epoch in range(epochs):
        train_one_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, _ = eval_one_epoch(model, val_loader, criterion, device)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_model_path)
            print(f"Save model at epoch {epoch + 1}")

    return model

# Five-way classifier on top of the contrastively trained encoder, moved to
# the training device (``Module.to`` returns the module itself).
model = BertClassifier(5).to(device)

# Six epochs of supervised fine-tuning with cross-entropy.
cls_model = train_cls(
    model,
    nn.CrossEntropyLoss(),
    trainset,
    valset,
    epochs=6,
)

In [None]:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np

def test_model(model_path, testset, device, batch_size=32):
    """Evaluate a saved classifier checkpoint on ``testset``.

    Builds a 5-class ``BertClassifier``, loads the ``state_dict`` stored at
    ``model_path``, runs inference, prints the metrics and shows a
    confusion-matrix heatmap.

    Args:
        model_path: path to a ``state_dict`` saved from a ``BertClassifier``.
        testset: dataset yielding ``(input_ids, attention_mask, label)``.
        device: device to run inference on.
        batch_size: evaluation batch size.

    Returns:
        dict with keys ``loss`` (mean per-batch cross-entropy), ``accuracy``,
        ``precision``, ``recall``, ``f1`` (all weighted averages) and
        ``confusion_matrix``.
    """
    # BertClassifier wraps the module-level `cl_model`, so loading the
    # checkpoint also overwrites that shared encoder's weights.
    model = BertClassifier(num_labels=5)
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    model.to(device)

    test_loader = DataLoader(testset, batch_size=batch_size)

    all_preds = []
    all_labels = []
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(test_loader, desc="Testing"):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = F.cross_entropy(outputs, labels)
            total_loss += loss.item()

            # Get predictions
            preds = torch.argmax(outputs, dim=1)
            correct_predictions += torch.sum(preds == labels).item()

            # Store predictions and labels for metric calculation
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = correct_predictions / len(testset)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels,
        all_preds,
        average='weighted'
    )
    avg_loss = total_loss / len(test_loader)

    # Print metrics
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

    # Calculate and plot confusion matrix. The class ids are taken from labels
    # AND predictions and passed explicitly, so the tick labels always line up
    # with the matrix rows/columns even when a class is predicted that never
    # occurs in the true labels.
    class_ids = np.unique(all_labels + all_preds)
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_ids,
        yticklabels=class_ids
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.tight_layout()
    plt.show()

    metrics = {
        'loss': avg_loss,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

    return metrics

# Must be the file written by train_cls (`<save_dir>best_model_cls.pth`).
# Nothing in this notebook writes 'best_model.pth', so the previous path
# failed on a fresh Restart & Run All.
model_path = '/kaggle/working/best_model_cls.pth'

metrics = test_model(
    model_path=model_path,
    testset=testset,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)