In [20]:
# based on https://www.kaggle.com/competitions/AI4Code/discussion/343614

import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, BertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from time import time, strftime, localtime
from bisect import bisect
from functools import cmp_to_key
import json
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Global experiment configuration, read by the dataset, model and training code below.
config = {
    # Directory prefixes are concatenated directly with file names
    # (e.g. f"{path}{notebook_id}.json"), so they must end with a slash.
    "data_path": "../AI4Code_data/train/",
    "test_path": "../AI4Code_data/test/",
    "train_orders_path": "../AI4Code_data/train_orders.csv",

    # Train/validation split. NOTE(review): "valid_size" is not read anywhere
    # in the visible code — confirm whether it is still needed.
    "train_size": 0.8,
    "valid_size": 0.2,
    "random_seed": 42,

    # TF-IDF settings; "max_features" is shared by the word and char vectorizers.
    "max_features": 20000,
    "ngram_range": (1, 3),
    "char_ngram_range": (2, 5),

    # Model dimensions. "hidden_size" is split across "num_attention_heads"
    # with integer division, so it should be a multiple of the head count.
    "embedding_dim": 256,
    "hidden_size": 512,
    "num_attention_heads": 8,
    "num_hidden_layers": 4,
    "dropout_rate": 0.1,
    # Penalise each cell's attention score to itself.
    "use_eye_matrix": True,
    # Add the char n-gram cosine similarity between cells to the attention scores.
    "use_char_ngram_similarity": True,

    # Optimisation. NOTE(review): "batch_size" is not read by the visible
    # training code, which hard-codes batch_size=1 in its DataLoaders.
    "batch_size": 1,
    "learning_rate": 2e-4,
    "weight_decay": 0.01,
    "num_epochs": 5,
    # Root output directory; a timestamped sub-folder is created per run.
    "save_dir": "./tfidf_transformer_model/"
}

def prepare_folders():
    """Create a timestamped output directory for this run and return its path.

    The directory is ``config['save_dir']`` followed by the current local time
    formatted as ``DD.MM.YYYY-HH.MM`` and a trailing slash, so callers can
    append file names directly.

    Returns:
        str: Path of the run directory, ending with ``/``.
    """
    current_time = strftime("%d.%m.%Y-%H.%M", localtime())
    savedir = f"{config['save_dir']}{current_time}/"

    # makedirs creates the parent save_dir as well; exist_ok avoids the
    # check-then-create race and makes re-running the cell within the same
    # minute harmless.
    os.makedirs(savedir, exist_ok=True)

    return savedir

def get_device():
    """Return the preferred torch device name: "cuda", then "mps", then "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

def count_inversions(a):
    """Count pairs (i, j) with i < j and a[i] > a[j].

    Elements are inserted one by one into a sorted list; every already-seen
    element that ends up strictly to the right of the insertion point forms
    an inversion with the new element. Equal elements are not counted.
    """
    seen_sorted = []
    total = 0
    for value in a:
        insert_at = bisect(seen_sorted, value)
        # Everything right of the insertion point is greater than `value`.
        total += len(seen_sorted) - insert_at
        seen_sorted.insert(insert_at, value)
    return total

def kendall_tau(ground_truth, predictions):
    """Compute the pooled Kendall tau between true and predicted cell orders.

    For every notebook the number of inversions of the predicted order
    relative to the ground truth is counted; the counts are pooled over all
    notebooks and normalised by the pooled number of pairs.

    Args:
        ground_truth: Iterable of sequences of cell ids in the correct order.
        predictions: Iterable of sequences of cell ids in predicted order,
            aligned with ``ground_truth``.

    Returns:
        float: ``1 - 4 * inversions / sum(n * (n - 1))``. When there are no
        comparable pairs at all (every notebook has at most one cell) no
        inversion is possible and 1.0 is returned instead of dividing by zero.

    Raises:
        ValueError: If a predicted id does not occur in its ground truth.
    """
    total_inversions = 0
    total_2max = 0
    for gt, pred in zip(ground_truth, predictions):
        # Map id -> first position once, instead of a linear gt.index() scan
        # per predicted element.
        position = {}
        for idx, cell_id in enumerate(gt):
            position.setdefault(cell_id, idx)
        try:
            ranks = [position[x] for x in pred]
        except KeyError as err:
            # Keep the exception type that list.index() would have raised.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        total_inversions += count_inversions(ranks)
        n = len(gt)
        total_2max += n * (n - 1)
    if total_2max == 0:
        return 1.0
    return 1 - 4 * total_inversions / total_2max

def preprocess_text(text):
    """Normalise a cell's source: drop punctuation, squeeze whitespace, lowercase."""
    # Anything that is neither a word character nor whitespace becomes a space.
    without_punct = re.sub(r'[^\w\s]', ' ', text)
    # Runs of whitespace (including newlines and tabs) collapse to one space.
    squeezed = re.sub(r'\s+', ' ', without_punct)
    trimmed = squeezed.strip()
    return trimmed.lower()


In [17]:
class TfidfNotebookDataset(Dataset):
    """Dataset that yields one notebook per item as TF-IDF features.

    All notebooks are loaded and preprocessed eagerly in ``__init__``; the
    TF-IDF transforms are applied lazily in ``__getitem__``.

    Args:
        path: Directory prefix of the notebook JSON files. It is concatenated
            directly with ``"<notebook_id>.json"``, so it must end with a
            path separator.
        data: DataFrame indexed by notebook id with a ``cell_order`` column
            holding the list of cell ids in the correct order.
        fit_vectorizers: If True, fit both vectorizers on every cell text of
            this dataset.
        word_vectorizer: Optional vectorizer to use instead of creating a new
            word-level ``TfidfVectorizer``. Pass the fitted vectorizer of the
            training dataset when building a validation/test dataset;
            otherwise ``__getitem__`` fails because the vectorizer was never
            fitted.
        char_vectorizer: Same as ``word_vectorizer`` for the character
            n-gram vectorizer.
    """
    def __init__(self, path, data, fit_vectorizers=False,
                 word_vectorizer=None, char_vectorizer=None):
        self.path = path
        self.data = data
        self.notebook_ids = list(data.index)

        if word_vectorizer is None:
            word_vectorizer = TfidfVectorizer(
                max_features=config['max_features'],
                ngram_range=config['ngram_range'],
                min_df=3,
                analyzer='word'
            )
        if char_vectorizer is None:
            char_vectorizer = TfidfVectorizer(
                max_features=config['max_features'],
                ngram_range=config['char_ngram_range'],
                min_df=3,
                analyzer='char'
            )
        self.word_vectorizer = word_vectorizer
        self.char_vectorizer = char_vectorizer

        self.notebooks = {}
        all_texts = []

        print(f"Loading {len(self.notebook_ids)} notebooks...")
        for notebook_id in tqdm(self.notebook_ids):
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(f"{path}{notebook_id}.json", encoding="utf-8") as f:
                notebook = json.load(f)

            cell_order = self.data.loc[notebook_id, "cell_order"]

            cell_texts = [
                preprocess_text(notebook["source"][cell_id]) for cell_id in cell_order
            ]
            # 1 marks a code cell, 0 any other cell type.
            cell_types = [
                1 if notebook["cell_type"][cell_id] == "code" else 0
                for cell_id in cell_order
            ]

            if fit_vectorizers:
                # The corpus is only needed for fitting; skip the copy otherwise.
                all_texts.extend(cell_texts)

            self.notebooks[notebook_id] = {
                "cell_ids": cell_order,
                "cell_texts": cell_texts,
                "cell_types": cell_types
            }

        if fit_vectorizers:
            print("Fitting TF-IDF vectorizers...")
            self.word_vectorizer.fit(all_texts)
            self.char_vectorizer.fit(all_texts)

    def __len__(self):
        """Number of notebooks in the dataset."""
        return len(self.notebook_ids)

    def __getitem__(self, idx):
        """Return the features of the notebook at position ``idx``.

        Returns:
            dict with keys
              - ``notebook_id``: id of the notebook,
              - ``cell_ids``: cell ids in ground-truth order,
              - ``cell_types``: long tensor, one entry per cell (1 = code),
              - ``word_indices``: long tensor [nnz, 2] of (row, col) positions
                of the non-zero word TF-IDF entries,
              - ``word_values``: float tensor [nnz] with the matching values,
              - ``word_size``: ``torch.Size`` of the word TF-IDF matrix,
              - ``char_sim_matrix``: float tensor with the pairwise cosine
                similarity of the cells' char n-gram TF-IDF vectors.
        """
        notebook_id = self.notebook_ids[idx]
        notebook = self.notebooks[notebook_id]

        cell_ids = notebook["cell_ids"]
        cell_texts = notebook["cell_texts"]
        cell_types = notebook["cell_types"]

        word_vectors = self.word_vectorizer.transform(cell_texts)
        char_vectors = self.char_vectorizer.transform(cell_texts)

        char_sim_matrix = cosine_similarity(char_vectors)

        # Ship the sparse word matrix as COO triplets so it can be collated.
        word_vectors_coo = word_vectors.tocoo()

        indices = np.column_stack((word_vectors_coo.row, word_vectors_coo.col))
        word_indices = torch.tensor(indices, dtype=torch.long)

        word_values = torch.tensor(word_vectors_coo.data, dtype=torch.float)
        word_size = torch.Size(word_vectors.shape)

        return {
            "notebook_id": notebook_id,
            "cell_ids": cell_ids,
            "cell_types": torch.tensor(cell_types, dtype=torch.long),
            "word_indices": word_indices,
            "word_values": word_values,
            "word_size": word_size,
            "char_sim_matrix": torch.tensor(char_sim_matrix, dtype=torch.float)
        }



In [21]:
class SparseEmbedding(nn.Module):
    """Embedding layer for sparse TF-IDF rows (single-notebook batches only).

    Each cell is embedded as the TF-IDF-weighted sum of the embedding vectors
    of its non-zero vocabulary entries.
    """
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(vocab_size, embedding_dim) * 0.02)

    def forward(self, indices, values, size):
        """
        Compute embeddings from sparse TF-IDF vectors

        Args:
            indices: Tensor of (row, col) indices [1, nnz, 2]
            values: Tensor of TF-IDF values [1, nnz]
            size: (rows, cols) of the TF-IDF matrix. Either the collated form
                [tensor([rows]), tensor([cols])] or plain ints; only the row
                count is used.

        Returns:
            Tensor [rows, embedding_dim]; cells without any non-zero entry
            get an all-zero embedding.

        Raises:
            ValueError: If the leading (batch) dimension of ``indices`` is not 1.
        """
        # For now, we only support batch_size=1
        batch_size = indices.size(0)
        if batch_size != 1:
            raise ValueError(f"Only batch_size=1 supported, got {batch_size}")

        # int() works for one-element tensors as well as for Python ints.
        num_cells = int(size[0])

        # Remove batch dimension
        indices = indices.squeeze(0)  # [nnz, 2]
        values = values.squeeze(0)    # [nnz]

        # Extract row (cell) and column (vocabulary) indices
        rows = indices[:, 0].long()
        cols = indices[:, 1].long()

        # Get embeddings for each word and scale by TF-IDF values
        scaled_embeddings = self.weight[cols] * values.unsqueeze(1)

        # Accumulate all contributions per cell in one vectorised scatter-add
        # instead of a Python loop over every non-zero entry.
        result = torch.zeros(
            num_cells,
            self.weight.size(1),
            device=self.weight.device,
            dtype=scaled_embeddings.dtype,
        )
        result.index_add_(0, rows, scaled_embeddings)

        return result

class CustomAttention(nn.Module):
    """Multi-head self-attention with two optional additive score biases.

    Depending on the global ``config``:
      * ``use_eye_matrix``: 10 is subtracted from every diagonal score, which
        pushes a cell away from attending to itself;
      * ``use_char_ngram_similarity``: the supplied similarity matrix, scaled
        by ``sigmoid(ngram_weight)``, is added to the scores of every head.
    """
    def __init__(self, hidden_size, num_heads, dropout_rate=0.1):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_size = hidden_size // num_heads

        # Query / key / value / output projections.
        self.q_proj = nn.Linear(hidden_size, hidden_size)
        self.k_proj = nn.Linear(hidden_size, hidden_size)
        self.v_proj = nn.Linear(hidden_size, hidden_size)
        self.o_proj = nn.Linear(hidden_size, hidden_size)

        self.dropout = nn.Dropout(dropout_rate)

        # Learnable scalar; passed through a sigmoid before it scales the
        # similarity bias.
        self.ngram_weight = nn.Parameter(torch.tensor(0.1))

        self.use_eye = config['use_eye_matrix']
        self.use_ngram = config['use_char_ngram_similarity']

    def _split_heads(self, projected, batch_size, seq_len):
        """Reshape [B, S, hidden] into [B, heads, S, head_size]."""
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.head_size)
        return per_head.transpose(1, 2)

    def forward(self, hidden_states, char_sim_matrix=None):
        """Apply attention to ``hidden_states`` of shape [B, S, hidden].

        ``char_sim_matrix``, when given, is broadcast over batch and heads
        and added to the attention scores. Returns a tensor shaped like
        ``hidden_states``.
        """
        batch_size, seq_len, _ = hidden_states.size()

        queries = self._split_heads(self.q_proj(hidden_states), batch_size, seq_len)
        keys = self._split_heads(self.k_proj(hidden_states), batch_size, seq_len)
        values = self._split_heads(self.v_proj(hidden_states), batch_size, seq_len)

        scale = self.head_size ** 0.5
        scores = torch.matmul(queries, keys.transpose(-1, -2)) / scale

        if self.use_eye:
            diagonal = torch.eye(seq_len, device=scores.device)[None, None, :, :]
            scores = scores - diagonal * 10

        if self.use_ngram and char_sim_matrix is not None:
            gate = torch.sigmoid(self.ngram_weight)
            scores = scores + gate * char_sim_matrix[None, None]

        attention_weights = self.dropout(F.softmax(scores, dim=-1))

        # Merge the heads back: [B, heads, S, head_size] -> [B, S, hidden].
        merged = torch.matmul(attention_weights, values).transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.hidden_size)

        return self.o_proj(merged)

class TransformerBlock(nn.Module):
    """Encoder block: attention and feed-forward sub-layers, each followed by
    a residual connection and LayerNorm."""

    def __init__(self, hidden_size, num_heads, dropout_rate=0.1):
        super().__init__()
        self.attention = CustomAttention(hidden_size, num_heads, dropout_rate)
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

        # Position-wise feed-forward network with a 4x wider inner layer.
        inner_size = hidden_size * 4
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, inner_size),
            nn.GELU(),
            nn.Dropout(dropout_rate),
            nn.Linear(inner_size, hidden_size),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x, char_sim_matrix=None):
        """Run the block on ``x``; ``char_sim_matrix`` is forwarded to the attention layer."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.norm1(x + self.attention(x, char_sim_matrix))
        # Feed-forward sub-layer: residual add, then normalise.
        return self.norm2(attended + self.ffn(attended))

class TfidfTransformerModel(nn.Module):
    """Scores the cells of one notebook from TF-IDF features.

    Pipeline: sparse TF-IDF embedding + cell-type embedding -> linear
    projection to the hidden size -> stack of transformer blocks -> one
    scalar score per cell.
    """
    def __init__(self, vocab_size):
        super().__init__()

        # Dimensions come from the global config.
        self.hidden_size = config['hidden_size']
        self.embedding_dim = config['embedding_dim']

        # Content embedding and cell-type embedding (index 0 = markdown, 1 = code).
        self.word_embedding = SparseEmbedding(vocab_size, self.embedding_dim)
        self.type_embedding = nn.Embedding(2, self.embedding_dim)

        # Lifts the embedding dimension to the transformer width.
        self.input_proj = nn.Linear(self.embedding_dim, self.hidden_size)

        num_layers = config['num_hidden_layers']
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(
                self.hidden_size,
                config['num_attention_heads'],
                config['dropout_rate']
            )
            for _ in range(num_layers)
        ])

        # Maps each cell's hidden state to a single ordering score.
        self.score_predictor = nn.Linear(self.hidden_size, 1)

    def forward(self, word_indices, word_values, word_size, cell_types, char_sim_matrix=None):
        """Return one score per cell (a higher score means an earlier position).

        ``cell_types`` and ``char_sim_matrix`` may carry a leading batch
        dimension of size one; it is stripped before use.
        """
        if cell_types.dim() > 1:
            cell_types = cell_types.squeeze(0)
        if char_sim_matrix is not None and char_sim_matrix.dim() > 2:
            char_sim_matrix = char_sim_matrix.squeeze(0)

        # Content embedding plus cell-type embedding, one row per cell.
        content = self.word_embedding(word_indices, word_values, word_size)
        cell_repr = content + self.type_embedding(cell_types)

        # The transformer blocks expect a batch dimension in front.
        hidden = self.input_proj(cell_repr).unsqueeze(0)
        for block in self.transformer_blocks:
            hidden = block(hidden, char_sim_matrix)
        hidden = hidden.squeeze(0)

        return self.score_predictor(hidden).squeeze(-1)

In [22]:
def _listnet_loss(scores):
    """ListNet-style cross-entropy between predicted and target rank distributions.

    The input order of ``scores`` is the ground-truth order, so the target
    distribution is ``softmax(-position)`` (most mass on the first cells).
    ``log_softmax`` is used instead of ``log(softmax + eps)`` so that very
    small probabilities do not underflow.
    """
    n = scores.size(0)
    target_ranks = torch.arange(n, device=scores.device, dtype=torch.float32)
    target_probs = F.softmax(-target_ranks, dim=0)  # Higher probability for cells that come first
    return -torch.sum(target_probs * F.log_softmax(scores, dim=0))


def _plot_training_curves(train_losses, valid_kendalls, savedir):
    """Save the loss and validation Kendall tau curves to ``training_curves.png``."""
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(valid_kendalls)
    plt.title('Validation Kendall Tau')
    plt.xlabel('Epoch')
    plt.ylabel('Kendall Tau')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(f"{savedir}training_curves.png")
    plt.close()


def train_model(model, train_dataset, valid_dataset, device, savedir, verbose=True):
    """Train the model and save checkpoints

    Args:
        model: Model called as ``model(word_indices, word_values, word_size,
            cell_types, char_sim_matrix)`` and returning one score per cell.
        train_dataset: Dataset of notebooks used for optimisation.
        valid_dataset: Dataset of notebooks used for the per-epoch Kendall tau.
        device: Torch device (name) the model and batches are moved to.
        savedir: Output directory prefix (must end with a path separator);
            receives ``best_model.pt``, ``model_epoch_<n>.pt`` and
            ``training_curves.png``.
        verbose: If True (default), print shape diagnostics for the first
            batch and the loss of the first three batches.

    Returns:
        tuple: ``(train_losses, valid_kendalls, best_kendall)`` — per-epoch
        mean training loss, per-epoch validation Kendall tau, and the best
        validation Kendall tau seen.
    """
    import traceback

    # One notebook per batch: notebooks have different numbers of cells and
    # the model raises for any other batch size.
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False)

    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config['weight_decay']
    )

    # Move model to device
    model.to(device)

    # Training stats
    best_kendall = -float('inf')
    train_losses = []
    valid_kendalls = []

    print(f"Starting training for {config['num_epochs']} epochs...")

    # Debug info for first batch
    if verbose:
        first_batch = next(iter(train_loader), None)
        if first_batch is not None:
            print("\nDebug info for first batch:")
            print(f"word_indices shape: {first_batch['word_indices'].shape}")
            print(f"word_values shape: {first_batch['word_values'].shape}")
            print(f"word_size: {first_batch['word_size']}")
            print(f"cell_types shape: {first_batch['cell_types'].shape}")
            print(f"char_sim_matrix shape: {first_batch['char_sim_matrix'].shape}")

            # Print word_size in more detail
            print(f"word_size[0] type: {type(first_batch['word_size'][0])}")
            print(f"word_size[0] value: {first_batch['word_size'][0]}")
            print(f"word_size[1] value: {first_batch['word_size'][1]}")

    for epoch in range(1, config['num_epochs'] + 1):
        epoch_start = time()

        # Training
        model.train()
        total_loss = 0.0
        batch_count = 0

        for batch_idx, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch} Training")):
            show_details = verbose and epoch == 1 and batch_idx == 0
            try:
                # Move data to device
                word_indices = batch["word_indices"].to(device)
                word_values = batch["word_values"].to(device)
                word_size = [tensor.to(device) for tensor in batch["word_size"]]
                cell_types = batch["cell_types"].to(device)
                char_sim_matrix = batch["char_sim_matrix"].to(device)

                # Print info for first batch
                if show_details:
                    print(f"\nBatch inputs details:")
                    print(f"  word_indices shape: {word_indices.shape}")
                    print(f"  word_values shape: {word_values.shape}")
                    print(f"  word_size: {word_size}")
                    print(f"  cell_types shape: {cell_types.shape}")
                    print(f"  char_sim_matrix shape: {char_sim_matrix.shape}")

                # Forward pass
                scores = model(word_indices, word_values, word_size, cell_types, char_sim_matrix)

                # Print scores for debugging
                if show_details:
                    print(f"  scores shape: {scores.shape}")
                    print(f"  scores values: {scores}")

                # ListNet loss
                loss = _listnet_loss(scores)

                # Backward pass
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                batch_count += 1

                # Print loss for first few batches
                if verbose and epoch == 1 and batch_idx < 3:
                    print(f"  Batch {batch_idx} loss: {loss.item():.4f}")

            except Exception as e:
                # Best effort: report the failing batch and keep training.
                # Shapes are read from `batch` itself because the device
                # copies above may not exist if the failure happened early.
                print(f"Error in batch {batch_idx}:")
                print(f"  word_indices shape: {batch['word_indices'].shape}")
                print(f"  word_values shape: {batch['word_values'].shape}")
                print(f"  word_size: {batch['word_size']}")
                print(f"  Exception: {str(e)}")
                traceback.print_exc()
                continue

        # Calculate average loss (over the batches that did not fail)
        avg_loss = total_loss / max(1, batch_count)
        train_losses.append(avg_loss)

        # Validation
        valid_kendall = evaluate_model(model, valid_loader, device)
        valid_kendalls.append(valid_kendall)

        # Print stats
        epoch_time = time() - epoch_start
        print(f"Epoch {epoch}: Loss = {avg_loss:.4f}, Kendall Tau = {valid_kendall:.4f}, Time = {epoch_time:.2f}s")

        # Save best model
        if valid_kendall > best_kendall:
            best_kendall = valid_kendall
            best_model_path = f"{savedir}best_model.pt"
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with Kendall Tau = {valid_kendall:.4f}")

        # Save checkpoint
        checkpoint_path = f"{savedir}model_epoch_{epoch}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': avg_loss,
            'kendall': valid_kendall
        }, checkpoint_path)

    # Plot training curves
    _plot_training_curves(train_losses, valid_kendalls, savedir)

    return train_losses, valid_kendalls, best_kendall


In [23]:
def _unbatch_cell_ids(collated_cell_ids):
    """Recover the flat list of cell ids from a batch-of-one collated field.

    The default DataLoader collation transposes a per-sample list of strings
    into one entry per cell, each holding the ids of all samples in the batch
    (a single id here). Plain strings are passed through so that a custom
    collate function that keeps the list flat works as well.
    """
    return [
        entry if isinstance(entry, str) else entry[0]
        for entry in collated_cell_ids
    ]


def evaluate_model(model, data_loader, device):
    """Evaluate model and return Kendall Tau score

    Args:
        model: Model returning one score per cell (higher = earlier).
        data_loader: Loader yielding one notebook per batch, with cells in
            ground-truth order.
        device: Torch device (name) the batches are moved to.

    Returns:
        float: Kendall tau over all notebooks that were scored successfully,
        or 0.0 if none could be scored.
    """
    import traceback

    model.eval()

    all_gt_orders = []
    all_pred_orders = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            try:
                # Taking element [0] of the collated field would yield only
                # the first cell id, so rebuild the full id list instead.
                cell_ids = _unbatch_cell_ids(batch["cell_ids"])

                # Move data to device
                word_indices = batch["word_indices"].to(device)
                word_values = batch["word_values"].to(device)
                word_size = [tensor.to(device) for tensor in batch["word_size"]]
                cell_types = batch["cell_types"].to(device)
                char_sim_matrix = batch["char_sim_matrix"].to(device)

                # Get model predictions
                scores = model(word_indices, word_values, word_size, cell_types, char_sim_matrix)

                # Sort cells by scores (higher score = earlier position)
                _, indices = torch.sort(scores, descending=True)

                # Convert to ordered cell IDs
                pred_order = [cell_ids[i] for i in indices.cpu().tolist()]

                # Save for Kendall Tau calculation
                all_gt_orders.append(cell_ids)
                all_pred_orders.append(pred_order)

            except Exception as e:
                # Best effort: skip notebooks that cannot be scored.
                print(f"Error during evaluation: {str(e)}")
                traceback.print_exc()
                continue

    # Calculate Kendall Tau
    if len(all_gt_orders) == 0:
        print("Warning: No valid predictions during evaluation")
        return 0.0

    return kendall_tau(all_gt_orders, all_pred_orders)


In [24]:
def generate_submission(model, vocab_size, device, savedir,
                        word_vectorizer=None, char_vectorizer=None):
    """Predict a cell order for every test notebook and write ``submission.csv``.

    Args:
        model: Trained model returning one score per cell (higher = earlier).
        vocab_size: Unused; kept so existing calls keep working.
        device: Torch device (name) the inputs are created on.
        savedir: Output directory prefix (must end with a path separator).
        word_vectorizer: Fitted word-level TF-IDF vectorizer. Defaults to the
            one of the global ``train_dataset``.
        char_vectorizer: Fitted char n-gram TF-IDF vectorizer. Defaults to the
            one of the global ``train_dataset``.

    Returns:
        pandas.DataFrame with columns ``id`` and ``cell_order`` (cell ids
        joined by single spaces).
    """
    model.eval()

    # Sorted so that the submission rows are in a reproducible order.
    test_files = sorted(f for f in os.listdir(config['test_path']) if f.endswith('.json'))
    print(f"Found {len(test_files)} test files")

    submission = {"id": [], "cell_order": []}

    # Fall back to the notebook-level training dataset when the caller does
    # not pass vectorizers explicitly.
    if word_vectorizer is None:
        word_vectorizer = train_dataset.word_vectorizer
    if char_vectorizer is None:
        char_vectorizer = train_dataset.char_vectorizer

    for test_file in tqdm(test_files, desc="Generating predictions"):
        notebook_id = os.path.splitext(test_file)[0]

        with open(os.path.join(config['test_path'], test_file), encoding="utf-8") as f:
            notebook = json.load(f)

        # The training notebooks are only ever read through their "source"
        # and "cell_type" mappings keyed by cell id, so fall back to the keys
        # of "source" when the file has no explicit "cell_id" entry.
        if 'cell_id' in notebook:
            cell_ids = list(notebook['cell_id'])
        else:
            cell_ids = list(notebook['source'].keys())

        if not cell_ids:
            # Nothing to order; similarity of an empty matrix is undefined.
            submission["id"].append(notebook_id)
            submission["cell_order"].append("")
            continue

        cell_texts = [preprocess_text(notebook['source'][cell_id]) for cell_id in cell_ids]
        cell_types = [
            1 if notebook['cell_type'][cell_id] == "code" else 0
            for cell_id in cell_ids
        ]

        word_vectors = word_vectorizer.transform(cell_texts)
        char_vectors = char_vectorizer.transform(cell_texts)

        char_sim_matrix = cosine_similarity(char_vectors)

        # Build the inputs in exactly the layout the DataLoader produces
        # during training: indices [1, nnz, 2], values [1, nnz] and the size
        # as a list of one-element tensors.
        word_vectors_coo = word_vectors.tocoo()
        word_indices = torch.tensor(
            np.column_stack((word_vectors_coo.row, word_vectors_coo.col)),
            dtype=torch.long,
            device=device,
        ).unsqueeze(0)
        word_values = torch.tensor(
            word_vectors_coo.data, dtype=torch.float, device=device
        ).unsqueeze(0)
        word_size = [torch.tensor([dim], device=device) for dim in word_vectors.shape]

        cell_types = torch.tensor(cell_types, dtype=torch.long, device=device)
        char_sim_matrix = torch.tensor(char_sim_matrix, dtype=torch.float, device=device)

        with torch.no_grad():
            scores = model(word_indices, word_values, word_size, cell_types, char_sim_matrix)

            # Higher score = earlier position.
            _, indices = torch.sort(scores, descending=True)

            pred_order = [cell_ids[i] for i in indices.cpu().tolist()]

        submission["id"].append(notebook_id)
        submission["cell_order"].append(" ".join(pred_order))

    submission_df = pd.DataFrame(submission)
    submission_path = f"{savedir}submission.csv"
    submission_df.to_csv(submission_path, index=False)
    print(f"Submission saved to {submission_path}")

    return submission_df

In [None]:
print("*" * 80)
print("Setting up environment")

savedir = prepare_folders()
device = get_device()

print("*" * 80)
print("Loading data")

info = pd.read_csv(config["train_orders_path"], index_col="id")
info["cell_order"] = info["cell_order"].apply(lambda x: x.split())
notebook_ids = list(info.index)

# Seed numpy (notebook shuffle) and torch (weight init, DataLoader shuffling)
# so a fresh "Restart & Run All" reproduces the same run.
np.random.seed(config["random_seed"])
torch.manual_seed(config["random_seed"])
np.random.shuffle(notebook_ids)

split_idx = int(len(notebook_ids) * config["train_size"])
# Debug-sized subset: 1000 training and 100 validation notebooks.
# For a full run use notebook_ids[:split_idx] / notebook_ids[split_idx:].
train_ids = notebook_ids[:1000]
valid_ids = notebook_ids[1000:1100]
train_data = info.loc[train_ids]
valid_data = info.loc[valid_ids]

print("Creating datasets...")
train_dataset = TfidfNotebookDataset(config["data_path"], train_data, fit_vectorizers=True)
valid_dataset = TfidfNotebookDataset(config["data_path"], valid_data, fit_vectorizers=False)
# The validation set must be transformed with the vectorizers fitted on the
# training set; its own vectorizers were never fitted and would raise on use.
valid_dataset.word_vectorizer = train_dataset.word_vectorizer
valid_dataset.char_vectorizer = train_dataset.char_vectorizer
print("Datasets created")

vocab_size = len(train_dataset.word_vectorizer.vocabulary_)

print("*" * 80)
print("Creating model")
model = TfidfTransformerModel(vocab_size)
print(f"Model created with {sum(p.numel() for p in model.parameters())} parameters")


print("*" * 80)
print("Training model")
train_losses, valid_kendalls, best_kendall = train_model(model, train_dataset, valid_dataset, device, savedir)
print(f"Training completed with best Kendall Tau: {best_kendall:.4f}")



In [None]:
print("*" * 80)
print("Generating submission")

# Reload the best checkpoint written during training. map_location keeps
# loading independent of the device the checkpoint was saved from.
best_model_path = f"{savedir}best_model.pt"
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)

submission_df = generate_submission(model, vocab_size, device, savedir)
print("*" * 80)
print("Sample of submission file:")
print(submission_df.head())