In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
import re
from tqdm import tqdm

# Read the three competition tables (paper metadata, labelled pairs, unlabelled pairs)
papers_metadata, train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/gammafestipb/papers_metadata.csv',
        '/kaggle/input/gammafestipb/train.csv',
        '/kaggle/input/gammafestipb/test.csv',
    )
)

# Function to read paper content
def read_paper_content(paper_id):
    """Return the raw text of a paper, or "" when its file cannot be read.

    Args:
        paper_id: Identifier used as the file stem inside the paper database
            directory (``<paper_id>.txt``).

    Returns:
        The file content decoded as UTF-8, or an empty string when the file
        is missing, unreadable, or not valid UTF-8.
    """
    path = f'/kaggle/input/gammafestipb/Paper Database/Paper Database/{paper_id}.txt'
    try:
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        # Only I/O and decoding problems mean "no usable text". A bare except
        # would also hide KeyboardInterrupt and genuine programming errors.
        return ""

# Gather every paper id mentioned on either side of a train/test pair,
# then load each paper's text once so later lookups are plain dict reads.
unique_papers = set().union(
    train_data['paper'].unique(),
    train_data['referenced_paper'].unique(),
    test_data['paper'].unique(),
    test_data['referenced_paper'].unique(),
)

paper_contents = {
    paper_id: read_paper_content(paper_id)
    for paper_id in tqdm(unique_papers)
}

100%|██████████| 4354/4354 [00:25<00:00, 170.72it/s]


In [2]:
# Function to generate paper embeddings
def get_paper_embeddings(paper_contents, papers_metadata, model_name='allenai/scibert_scivocab_uncased', 
                         max_length=512, batch_size=32):
    """
    Generate one document embedding per paper using the specified model.

    Each paper is represented by "<title> <content>", truncated to
    ``max_length`` tokens, and embedded as the first-token ([CLS]) vector of
    the model's last hidden state.

    Args:
        paper_contents: dict mapping paper id -> full text.
        papers_metadata: DataFrame with at least ``paper_id`` and ``title``.
        model_name: Hugging Face model identifier to load.
        max_length: token budget per document (longer texts are truncated).
        batch_size: number of documents encoded per forward pass.

    Returns:
        (paper_embeddings, tokenizer, model) where ``paper_embeddings`` maps
        paper id -> 1-D numpy embedding.
    """
    print(f"Loading model: {model_name}")
    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    
    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    print(f"Using device: {device}")
    
    def get_embeddings_batch(texts):
        """Embed a list of raw texts; returns an array with one row per text."""
        if not texts:
            return []
            
        # Collapse all whitespace runs so the token budget is not spent on layout
        cleaned_texts = [re.sub(r'\s+', ' ', text).strip() for text in texts]
        
        inputs = tokenizer(cleaned_texts, 
                          return_tensors="pt", 
                          truncation=True, 
                          max_length=max_length, 
                          padding='max_length')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token embedding as document embedding
        return outputs.last_hidden_state[:, 0, :].cpu().numpy()
    
    # Get embedding dimension from model
    embedding_dim = model.config.hidden_size
    
    # Build the id -> title lookup once instead of scanning the metadata frame
    # for every paper. keep='first' mirrors the previous `.values[0]` choice
    # when an id appears more than once.
    unique_meta = papers_metadata.drop_duplicates(subset='paper_id', keep='first')
    title_by_id = dict(zip(unique_meta['paper_id'], unique_meta['title']))
    
    # Create paper_ids and texts lists for batch processing
    paper_ids = list(paper_contents.keys())
    texts = []
    
    for paper_id in paper_ids:
        title = title_by_id.get(paper_id, "")
        # A missing title is NaN in pandas; formatting it would inject the
        # literal word "nan" at the start of the document.
        if pd.isna(title):
            title = ""
        
        content = paper_contents.get(paper_id, "")
        texts.append(f"{title} {content}")
    
    # Process in batches
    paper_embeddings = {}
    for i in tqdm(range(0, len(paper_ids), batch_size), desc=f"Generating embeddings with {model_name}"):
        batch_ids = paper_ids[i:i+batch_size]
        batch_texts = texts[i:i+batch_size]
        
        batch_embeddings = get_embeddings_batch(batch_texts)
        
        for j, paper_id in enumerate(batch_ids):
            paper_embeddings[paper_id] = batch_embeddings[j]
            
    print(f"Generated embeddings for {len(paper_embeddings)} papers with dimension {embedding_dim}")
    return paper_embeddings, tokenizer, model

In [3]:
# Chunking functions
def chunk_paper_content(paper_content, chunk_size=15, stride=5):
    """
    Cut a paper's text into overlapping windows of whitespace-separated words.

    Windows are ``chunk_size`` words long and start every ``stride`` words;
    only full windows are produced. A text shorter than ``chunk_size`` yields
    a single chunk holding all of its words, and empty/blank text yields [].
    """
    if not paper_content or not paper_content.strip():
        return []

    # Normalise whitespace, then treat each space-separated word as a token
    words = re.sub(r'\s+', ' ', paper_content).strip().split()

    last_start = len(words) - chunk_size
    windows = [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, last_start + 1, stride)
    ]
    if windows:
        return windows

    # Too short for even one full window: fall back to the whole text
    return [' '.join(words)] if words else []

In [4]:
# Cache of chunk text -> embedding, shared across calls to avoid redundant
# model passes. Keys are the chunk strings only, so the cache must be cleared
# when switching to a different model or max_length.
chunk_embedding_cache = {}

def get_chunks_embeddings(chunks, tokenizer, model, device, max_length=128):
    """
    Generate embeddings for a list of text chunks with caching.

    Args:
        chunks: list of chunk strings.
        tokenizer: callable returning a dict of tensors for a list of texts.
        model: callable whose output exposes ``last_hidden_state``.
        device: torch device the inputs are moved to.
        max_length: token budget per chunk.

    Returns:
        2-D array with one row per input chunk, in the same order as
        ``chunks``; an empty array when ``chunks`` is empty.
    """
    if not chunks:
        return np.array([])
    
    # Process in batches to avoid memory issues
    batch_size = 32
    
    # Distinct chunks that still need a model pass, in first-seen order.
    # Keying by the text itself (not hash(text)) rules out hash collisions.
    pending = list(dict.fromkeys(
        chunk for chunk in chunks if chunk not in chunk_embedding_cache
    ))
    
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        
        inputs = tokenizer(batch, 
                        return_tensors="pt", 
                        truncation=True, 
                        max_length=max_length, 
                        padding='max_length')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token embedding as chunk embedding
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        
        for chunk, embedding in zip(batch, batch_embeddings):
            chunk_embedding_cache[chunk] = embedding
    
    # Assemble rows by input position. Emitting cached rows first and fresh
    # rows afterwards would scramble the chunk/row alignment whenever a call
    # mixes cached and uncached chunks.
    return np.stack([chunk_embedding_cache[chunk] for chunk in chunks])

In [5]:
def compute_max_chunk_similarity(paper_id, ref_paper_id, paper_contents, tokenizer, model, device, 
                               chunk_size=15, stride=5, max_length=128):
    """
    Summarise chunk-to-chunk cosine similarity between two papers.

    Returns a 3-tuple ``(max_sim, top_similarities, percentiles)``:
    the largest pairwise similarity, the five largest values in ascending
    order (fewer if there are fewer pairs), and the 50/75/90/95th
    percentiles. When either paper has no text, no chunks, or no embeddings
    the result is ``(0.0, [], [])``.
    """
    source_text = paper_contents.get(paper_id, "")
    target_text = paper_contents.get(ref_paper_id, "")
    if not (source_text and target_text):
        return 0.0, [], []

    source_chunks = chunk_paper_content(source_text, chunk_size, stride)
    target_chunks = chunk_paper_content(target_text, chunk_size, stride)
    if not (source_chunks and target_chunks):
        return 0.0, [], []

    source_vecs = get_chunks_embeddings(source_chunks, tokenizer, model, device, max_length)
    target_vecs = get_chunks_embeddings(target_chunks, tokenizer, model, device, max_length)
    if 0 in (source_vecs.size, target_vecs.size):
        return 0.0, [], []

    # All pairwise similarities, flattened once and reused for every statistic
    flat_sims = cosine_similarity(source_vecs, target_vecs).ravel()

    best = np.max(flat_sims)
    top_five = np.sort(flat_sims)[-5:]
    quantiles = np.percentile(flat_sims, [50, 75, 90, 95])
    return best, top_five, quantiles

In [6]:
def _split_semicolon_field(value, lowercase=False):
    """Split a ';'-separated metadata cell into a set of trimmed, non-empty items."""
    if pd.isna(value):
        return set()
    text = str(value).lower() if lowercase else str(value)
    # Cells look like "A; B; C": without strip() every item after the first
    # keeps a leading space and never matches the same name listed first elsewhere.
    return {item.strip() for item in text.split(';') if item.strip()}


def _count_or_zero(value):
    """Return a citation count, mapping missing values (None/NaN) and 0 to 0."""
    return 0 if pd.isna(value) or not value else value


def _jaccard(left, right):
    """Jaccard overlap of two sets; 0 when either set is empty."""
    if not left or not right:
        return 0
    return len(left & right) / len(left | right)


def create_pair_features_gpu(row, embeddings_dict, metadata_df, paper_contents, device):
    """
    Build document-level features for one (paper, referenced_paper) pair.

    Args:
        row: mapping with 'paper' and 'referenced_paper' ids.
        embeddings_dict: paper id -> 1-D document embedding.
        metadata_df: DataFrame with paper_id, title, publication_year,
            cited_by_count, type, authors and concepts columns.
        paper_contents: paper id -> full text.
        device: torch device on which the cosine similarity is computed.

    Returns:
        dict of features: embedding cosine similarity, year/citation-count
        features, author/concept/title Jaccard overlaps, type equality, and
        whether the referenced title appears verbatim in the citing text.
        Metadata features stay 0 when either paper has no metadata row.

    Note: each call filters ``metadata_df`` twice, so cost grows with the
    number of metadata rows.
    """
    paper_id     = row['paper']
    ref_paper_id = row['referenced_paper']
    
    # 1) Cosine similarity of the two full-document embeddings.
    emb1 = embeddings_dict.get(paper_id)
    emb2 = embeddings_dict.get(ref_paper_id)
    if emb1 is None or emb2 is None:
        # No embedding means no evidence of similarity. A zero vector gives
        # the same 0.0 but would also need the right length and dtype.
        text_sim = 0.0
    else:
        # torch.dot needs both operands in the same dtype, so force float32.
        t1 = torch.as_tensor(emb1, dtype=torch.float32, device=device)
        t2 = torch.as_tensor(emb2, dtype=torch.float32, device=device)
        text_sim = (torch.dot(t1, t2) / (t1.norm() * t2.norm() + 1e-8)).item()
    
    # 2) Metadata rows (first match per id), or None when absent
    paper_row     = metadata_df[metadata_df['paper_id']==paper_id]
    ref_paper_row = metadata_df[metadata_df['paper_id']==ref_paper_id]
    paper_row     = paper_row.iloc[0] if not paper_row.empty else None
    ref_paper_row = ref_paper_row.iloc[0] if not ref_paper_row.empty else None
    
    features = {
        'text_similarity': text_sim,
        'year_diff': 0,
        'can_cite': 0,
        'cited_by_count_ratio': 0,
        'cited_by_count_ref': 0,
        'cited_by_count_paper': 0,
        'same_year': 0,
        'author_overlap': 0,
        'concept_overlap': 0,
        'same_type': 0,
        'title_similarity': 0,
        'contains_citation_text': 0,
    }
    
    if paper_row is not None and ref_paper_row is not None:
        # Temporal features, only when both years are known
        # (int(NaN) would raise otherwise).
        year1 = paper_row['publication_year']
        year2 = ref_paper_row['publication_year']
        if pd.notna(year1) and pd.notna(year2):
            year_diff = int(year1) - int(year2)
            features.update({
                'year_diff':     year_diff,
                'can_cite':      1 if year_diff>0 else 0,
                'same_year':     1 if year_diff==0 else 0,
            })
        # Citation counts; `NaN or 0` evaluates to NaN, so test explicitly
        count_paper = _count_or_zero(paper_row['cited_by_count'])
        count_ref   = _count_or_zero(ref_paper_row['cited_by_count'])
        features.update({
            'cited_by_count_ref':   count_ref,
            'cited_by_count_paper': count_paper,
            'cited_by_count_ratio': count_ref/(count_paper+1),
        })
        # Author and concept overlap (Jaccard)
        features['author_overlap'] = _jaccard(
            _split_semicolon_field(paper_row['authors']),
            _split_semicolon_field(ref_paper_row['authors']),
        )
        features['concept_overlap'] = _jaccard(
            _split_semicolon_field(paper_row['concepts'], lowercase=True),
            _split_semicolon_field(ref_paper_row['concepts'], lowercase=True),
        )
        # Same publication type
        features['same_type'] = int(paper_row['type']==ref_paper_row['type'])
        # Title similarity (Jaccard over lower-cased words)
        if pd.notna(paper_row['title']) and pd.notna(ref_paper_row['title']):
            tset1 = set(str(paper_row['title']).lower().split())
            tset2 = set(str(ref_paper_row['title']).lower().split())
            features['title_similarity'] = _jaccard(tset1, tset2)
    # Does the citing paper's text contain the referenced title verbatim?
    if paper_id in paper_contents and ref_paper_row is not None and pd.notna(ref_paper_row['title']):
        text  = paper_contents[paper_id].lower()
        title = str(ref_paper_row['title']).lower()
        # Very short titles would match by accident
        if len(title)>5 and title in text:
            features['contains_citation_text'] = 1
    
    return features

In [7]:
print("Generating paper embeddings...")
model_name = 'allenai/specter'  # Paper citation-specific model

# Encoder settings for the document-level pass
embedding_kwargs = dict(model_name=model_name, max_length=512, batch_size=16)
paper_embeddings, tokenizer, model = get_paper_embeddings(
    paper_contents, papers_metadata, **embedding_kwargs
)

Generating paper embeddings...
Loading model: allenai/specter


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2025-06-09 08:47:11.957553: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749458832.141355      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749458832.196739      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using device: cuda


Generating embeddings with allenai/specter:   0%|          | 0/273 [00:00<?, ?it/s]Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating embeddings with allenai/specter: 100%|██████████| 273/273 [03:31<00:00,  1.29it/s]

Generated embeddings for 4354 papers with dimension 768





In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def _doc_level_records(pairs, progress_label, passthrough_column):
    """Compute document-level features for every pair in ``pairs``.

    Each record carries the pair ids plus ``passthrough_column`` copied from
    the input row (the label for train, the submission id for test).
    """
    records = []
    for _, pair in tqdm(pairs.iterrows(), total=len(pairs), desc=progress_label):
        record = create_pair_features_gpu(pair, paper_embeddings, papers_metadata, paper_contents, device)
        record['paper'] = pair['paper']
        record['referenced_paper'] = pair['referenced_paper']
        record[passthrough_column] = pair[passthrough_column]
        records.append(record)
    return records


print("Computing document‐level features for train...")
train_doc_records = _doc_level_records(train_data, "Doc-level train", 'is_referenced')
train_doc_df = pd.DataFrame(train_doc_records)

print("Computing document‐level features for test...")
test_doc_records = _doc_level_records(test_data, "Doc-level test", 'id')
test_doc_df = pd.DataFrame(test_doc_records)


Computing document‐level features for train...


Doc-level train: 100%|██████████| 410691/410691 [15:45<00:00, 434.15it/s]


Computing document‐level features for test...


Doc-level test: 100%|██████████| 336021/336021 [12:57<00:00, 432.01it/s]


In [9]:
# Write without the positional index: otherwise it is saved as an unnamed
# first column and comes back as a spurious "Unnamed: 0" column on re-read.
train_doc_df.to_csv("train_doc_df.csv", index=False)
test_doc_df.to_csv("test_doc_df.csv", index=False)

In [10]:
def get_chunks_embeddings(chunks, tokenizer, model, device, max_length=128):
    """
    Generate embeddings for a list of text chunks with caching.

    Uses the module-level ``chunk_embedding_cache`` dict (chunk text ->
    embedding). The cache key is the chunk text only, so it must be cleared
    when switching model or ``max_length``.

    Args:
        chunks: list of chunk strings.
        tokenizer: callable returning a dict of tensors for a list of texts.
        model: callable whose output exposes ``last_hidden_state``.
        device: torch device the inputs are moved to.
        max_length: token budget per chunk.

    Returns:
        2-D array with one row per input chunk, in the same order as
        ``chunks``; an empty array when ``chunks`` is empty.
    """
    if not chunks:
        return np.array([])
    
    # Process in batches to avoid memory issues
    batch_size = 32
    
    # Distinct chunks that still need a model pass, in first-seen order.
    # Keying by the text itself (not hash(text)) rules out hash collisions.
    pending = list(dict.fromkeys(
        chunk for chunk in chunks if chunk not in chunk_embedding_cache
    ))
    
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        
        inputs = tokenizer(batch, 
                        return_tensors="pt", 
                        truncation=True, 
                        max_length=max_length, 
                        padding='max_length')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use CLS token embedding as chunk embedding
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        
        for chunk, embedding in zip(batch, batch_embeddings):
            chunk_embedding_cache[chunk] = embedding
    
    # Assemble rows by input position so row i always belongs to chunks[i],
    # even when a call mixes cached and freshly computed chunks.
    return np.stack([chunk_embedding_cache[chunk] for chunk in chunks])

In [11]:
import pickle

In [12]:
selected = (25,10)

In [13]:
from sentence_transformers import SentenceTransformer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single chunking scheme (chunk_size, stride) and the sentence encoder used for chunks
schemes = [selected]
chunk_encoder = SentenceTransformer('all-MiniLM-L6-v2')
chunk_encoder = chunk_encoder.to(device)

# Embed every paper's chunks once per scheme and pickle the result per scheme
for cs, st in schemes:
    chunk_embeddings = {}
    progress = tqdm(paper_contents.items(), desc=f"Precompute chunks {cs},{st}")
    for paper_id, content in progress:
        # Same windowing as the notebook's chunk_paper_content helper
        chunks = chunk_paper_content(content, chunk_size=cs, stride=st)
        if not chunks:
            # Paper without text: keep a (0, dim) placeholder so lookups still work
            chunk_embeddings[paper_id] = np.zeros(
                (0, chunk_encoder.get_sentence_embedding_dimension())
            )
            continue
        chunk_embeddings[paper_id] = chunk_encoder.encode(
            chunks,
            batch_size=128,
            convert_to_numpy=True,
            show_progress_bar=False
        )

    # Output file name encodes the scheme
    output_path = f"chunk_embeddings_{cs}_{st}.pkl"
    with open(output_path, "wb") as f:
        pickle.dump(chunk_embeddings, f)
    print(f"Saved chunk embeddings for scheme ({cs},{st}) to {output_path}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Precompute chunks 25,10: 100%|██████████| 4354/4354 [28:47<00:00,  2.52it/s]


Saved chunk embeddings for scheme (25,10) to chunk_embeddings_25_10.pkl


In [14]:
# Load the pickled per-paper chunk embeddings for every scheme.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
chunk_embeddings = {}
for cs, st in schemes:
    # The file name must follow the scheme: a hard-coded "25_10" would load
    # the same file for every entry in `schemes`.
    path = f"/kaggle/working/chunk_embeddings_{cs}_{st}.pkl"
    with open(path, "rb") as f:
        # dict: paper_id -> np.ndarray
        emb_map = pickle.load(f)
    chunk_embeddings[(cs, st)] = emb_map

# Build chunk_embeddings_gpu with the same keys, holding torch tensors on `device`
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
chunk_embeddings_gpu = {}
for cs, st in schemes:
    emb_map = chunk_embeddings[(cs, st)]
    gpu_map = {}
    for pid, embs in emb_map.items():
        # embs is expected to be a 2-D array (n_chunks, dim)
        gpu_map[pid] = torch.from_numpy(embs).to(device)
    chunk_embeddings_gpu[(cs, st)] = gpu_map

# Sanity check
print("Loaded GPU chunk embeddings for schemes:", list(chunk_embeddings_gpu.keys()))

Loaded GPU chunk embeddings for schemes: [(25, 10)]


In [15]:
import math
def compute_chunk_features_gpu_stream(row, block_size=512, threshold=0.8):
    """
    Chunk-level similarity statistics for one (paper, referenced_paper) pair.

    For every scheme in the module-level ``schemes`` list, the pair's chunk
    embeddings are read from ``chunk_embeddings_gpu`` and all pairwise cosine
    similarities are reduced block by block, so the full similarity matrix
    is never materialised.

    Args:
        row: mapping with 'paper' and 'referenced_paper' ids.
        block_size: rows taken from each side per similarity block.
        threshold: similarity above which a chunk pair counts as a match.

    Returns:
        dict with, per scheme, the max / mean / std of the similarities and
        the fraction above ``threshold``, plus ``avg_chunk_sim``,
        ``max_chunk_sim`` and ``chunk_sim_variance`` aggregated over the
        per-scheme maxima. All statistics are 0.0 for a scheme where either
        paper has no chunk embeddings.
    """
    feats = {}
    max_sims = []
    p, r = row['paper'], row['referenced_paper']
    # round() keeps the column name stable when threshold*100 lands just
    # below an integer in floating point.
    threshold_tag = int(round(threshold * 100))
    
    for cs, st in schemes:
        e1 = chunk_embeddings_gpu[(cs, st)].get(p, torch.empty((0,)))
        e2 = chunk_embeddings_gpu[(cs, st)].get(r, torch.empty((0,)))
        
        if e1.numel() and e2.numel():
            # L2-normalise rows so a dot product is a cosine similarity
            e1n = e1 / (e1.norm(dim=1, keepdim=True) + 1e-8)
            e2n = e2 / (e2.norm(dim=1, keepdim=True) + 1e-8)
            
            total_count = 0
            total_sum = 0.0
            total_sqsum = 0.0
            total_above = 0
            # Cosine similarity can be negative: starting at 0.0 would report
            # 0 whenever every chunk pair is negatively correlated.
            max_sim = float('-inf')
            
            # Stream block by block
            for i in range(0, e1n.size(0), block_size):
                b1 = e1n[i : i + block_size]    # (bsize1 x dim)
                for j in range(0, e2n.size(0), block_size):
                    b2 = e2n[j : j + block_size]  # (bsize2 x dim)
                    sims = torch.matmul(b1, b2.t()).flatten()  # (bsize1*bsize2)
                    
                    # Per-block reductions, accumulated in Python floats
                    total_count += sims.numel()
                    total_sum   += sims.sum().item()
                    total_sqsum += (sims * sims).sum().item()
                    total_above += int((sims > threshold).sum().item())
                    max_sim = max(max_sim, sims.max().item())
            
            # Summary: Var[x] = E[x^2] - E[x]^2, clamped at 0 against round-off
            mean_sim = total_sum / total_count
            var_sim  = total_sqsum / total_count - mean_sim**2
            std_sim  = math.sqrt(var_sim) if var_sim>0 else 0.0
            frac_above = total_above / total_count
            
        else:
            max_sim = mean_sim = std_sim = frac_above = 0.0
        
        # Store the per-scheme statistics
        feats.update({
            f'max_chunk_sim_{cs}_{st}':  max_sim,
            f'mean_chunk_sim_{cs}_{st}': mean_sim,
            f'std_chunk_sim_{cs}_{st}':  std_sim,
            f'frac_above{threshold_tag}_{cs}_{st}': frac_above,
        })
        max_sims.append(max_sim)
    
    # Aggregate the per-scheme maxima
    feats['avg_chunk_sim']      = float(np.mean(max_sims))
    feats['max_chunk_sim']      = float(np.max(max_sims))
    feats['chunk_sim_variance'] = float(np.var(max_sims))
    return feats


In [16]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Rebuild the per-scheme map of paper id -> chunk-embedding tensor on `device`
chunk_embeddings_gpu = {}
for cs, st in schemes:
    chunk_embeddings_gpu[(cs, st)] = {
        pid: torch.from_numpy(embs).to(device)
        for pid, embs in chunk_embeddings[(cs, st)].items()
    }


In [17]:
# 3) Build train & test chunk‐only DataFrames
def _chunk_level_records(pairs, progress_label, passthrough_column):
    """Compute chunk-similarity features for every pair in ``pairs``.

    ``passthrough_column`` (label for train, submission id for test) is
    copied from the input row onto each record.
    """
    records = []
    for _, pair in tqdm(pairs.iterrows(), total=len(pairs), desc=progress_label):
        record = compute_chunk_features_gpu_stream(pair)
        record[passthrough_column] = pair[passthrough_column]
        records.append(record)
    return records


train_chunk_feats = _chunk_level_records(train_data, "Train chunk features", 'is_referenced')
train_chunk_df = pd.DataFrame(train_chunk_feats)

test_chunk_feats = _chunk_level_records(test_data, "Test chunk features", 'id')
test_chunk_df = pd.DataFrame(test_chunk_feats)

Train chunk features: 100%|██████████| 410691/410691 [11:37<00:00, 588.40it/s]
Test chunk features: 100%|██████████| 336021/336021 [09:39<00:00, 580.30it/s]


In [18]:
# 7) (Optional) Save to CSV
train_chunk_df.to_csv("train_chunk_df.csv", index=False)
test_chunk_df.to_csv("test_chunk_df.csv", index=False)

In [19]:
# train_doc_df = pd.read_csv("/kaggle/input/doc-level/train_doc_df.csv")
# test_doc_df = pd.read_csv("/kaggle/input/doc-level/test_doc_df.csv")

In [20]:
# you can simply concat them side‐by‐side.
assert len(train_doc_df) == len(train_chunk_df)
assert len(test_doc_df) == len(test_chunk_df)

In [21]:
# Train: put document-level and chunk-level features side by side.
# The chunk frame repeats the 'is_referenced' target, so drop it there first.
train_chunk_only = train_chunk_df.drop(columns=['is_referenced'])
train_full_df = pd.concat(
    [frame.reset_index(drop=True) for frame in (train_doc_df, train_chunk_only)],
    axis=1,
)

# Test: same layout; the chunk frame repeats 'id', keep only the doc-level one.
test_chunk_only = test_chunk_df.drop(columns=['id'])
test_full_df = pd.concat(
    [frame.reset_index(drop=True) for frame in (test_doc_df, test_chunk_only)],
    axis=1,
)


In [22]:
train_full_df.dtypes

text_similarity           float64
year_diff                   int64
can_cite                    int64
cited_by_count_ratio      float64
cited_by_count_ref          int64
cited_by_count_paper        int64
same_year                   int64
author_overlap            float64
concept_overlap           float64
same_type                   int64
title_similarity          float64
contains_citation_text      int64
paper                      object
referenced_paper           object
is_referenced               int64
max_chunk_sim_25_10       float64
mean_chunk_sim_25_10      float64
std_chunk_sim_25_10       float64
frac_above80_25_10        float64
avg_chunk_sim             float64
max_chunk_sim             float64
chunk_sim_variance        float64
dtype: object

In [23]:
# Persist the merged feature tables without the row index
train_full_df.to_csv("train_full.csv", index=False)
test_full_df.to_csv("test_full.csv", index=False)

In [24]:
train_full_df.dtypes

text_similarity           float64
year_diff                   int64
can_cite                    int64
cited_by_count_ratio      float64
cited_by_count_ref          int64
cited_by_count_paper        int64
same_year                   int64
author_overlap            float64
concept_overlap           float64
same_type                   int64
title_similarity          float64
contains_citation_text      int64
paper                      object
referenced_paper           object
is_referenced               int64
max_chunk_sim_25_10       float64
mean_chunk_sim_25_10      float64
std_chunk_sim_25_10       float64
frac_above80_25_10        float64
avg_chunk_sim             float64
max_chunk_sim             float64
chunk_sim_variance        float64
dtype: object

In [25]:
unused_col = ['paper', 'referenced_paper']

# Modelling

In [26]:
df_train = train_full_df.copy()
df_test = test_full_df.copy()

In [27]:
from xgboost import XGBClassifier
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoTokenizer, AutoModel
import re
from tqdm import tqdm

# Reload the saved feature tables; this replaces any df_train/df_test built
# in memory earlier. A CSV written with its index re-reads with a stray
# "Unnamed: 0" column, which is just the row position and must not reach the
# model as a feature. errors='ignore' keeps this safe when the column is absent.
df_train = (
    pd.read_csv('/kaggle/input/gammafest-final-df/train_full.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
df_test = (
    pd.read_csv('/kaggle/input/gammafest-final-df/test_full.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [28]:
df_train.head()

Unnamed: 0,text_similarity,year_diff,can_cite,cited_by_count_ratio,cited_by_count_ref,cited_by_count_paper,same_year,author_overlap,concept_overlap,same_type,...,paper,referenced_paper,is_referenced,max_chunk_sim_25_10,mean_chunk_sim_25_10,std_chunk_sim_25_10,frac_above80_25_10,avg_chunk_sim,max_chunk_sim,chunk_sim_variance
0,0.760211,3,1,6.960894,2492,357,0,0.0,0.25,1,...,p2128,p3728,0,0.653185,0.10831,0.092632,0.0,0.653185,0.653185,0.0
1,0.631514,-24,0,1.09776,1078,981,0,0.0,0.0,0,...,p0389,p3811,0,0.595115,0.129211,0.090438,0.0,0.595115,0.595115,0.0
2,0.61087,2,1,0.011196,182,16255,0,0.0,0.0,0,...,p1298,p3760,0,0.586281,0.114077,0.083889,0.0,0.586281,0.586281,0.0
3,0.714809,7,1,0.079337,1479,18641,0,0.0,0.111111,1,...,p0211,p1808,0,0.603561,0.093274,0.089614,0.0,0.603561,0.603561,0.0
4,0.771843,26,1,0.80625,645,799,0,0.0,0.0,0,...,p0843,p2964,0,0.721893,0.093263,0.094657,0.0,0.721893,0.721893,0.0


In [29]:
papers_metadata = pd.read_csv('/kaggle/input/gammafestipb/papers_metadata.csv')

In [30]:
papers_metadata.head()

Unnamed: 0,paper_id,doi,title,publication_year,publication_date,cited_by_count,type,authors,concepts
0,p0000,https://doi.org/10.1161/circulationaha.115.001593,Machine Learning in Medicine,2015,11/16/2015,2662,review,Rahul C. Deo,Medicine; Medical physics; Medical education; ...
1,p0001,https://doi.org/10.1504/ijmmno.2013.055204,A literature survey of benchmark functions for...,2013,1/1/2013,1138,article,Momin Jamil; Xin‐She Yang,Benchmark (surveying); Set (abstract data type...
2,p0002,https://doi.org/10.1109/icip.2017.8296547,Abnormal event detection in videos using gener...,2017,9/1/2017,486,article,Mahdyar Ravanbakhsh; Moin Nabi; Enver Sanginet...,Abnormality; Computer science; Artificial inte...
3,p0003,https://doi.org/10.3115/v1/p15-1001,On Using Very Large Target Vocabulary for Neur...,2015,1/1/2015,916,article,Sébastien Jean; Kyunghyun Cho; Roland Memisevi...,Machine translation; Computer science; Vocabul...
4,p0004,https://doi.org/10.1109/tpami.2007.1167,Gaussian Process Dynamical Models for Human Mo...,2007,12/20/2007,1016,article,Jonathan M. Wang; David J. Fleet; Aaron Hertzmann,Gaussian process; Artificial intelligence; Lat...


In [31]:
# siapkan metadata untuk merge
author_lookup = papers_metadata[['paper_id', 'authors']]
# One lookup table per side of the pair, renamed to match the join key
meta_p = author_lookup.rename(columns={'paper_id': 'paper'})
meta_ref_p = author_lookup.rename(
    columns={'paper_id': 'referenced_paper', 'authors': 'referenced_authors'}
)


def _with_author_columns(pairs):
    """Left-join citing-paper and referenced-paper authors onto a pair table."""
    with_citing = pairs.merge(meta_p, on='paper', how='left')
    return with_citing.merge(meta_ref_p, on='referenced_paper', how='left')


# Attach author columns to both the train and test frames
df_train = _with_author_columns(df_train)
df_test = _with_author_columns(df_test)


In [32]:
categorical_cols = df_train.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols

['paper', 'referenced_paper', 'authors', 'referenced_authors']

In [33]:
import numpy as np

def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame in place by downcasting numbers and categorizing text.

    Each column is handled according to its dtype:

    * object / string columns are converted to ``category``;
    * signed and unsigned integer columns are cast to the narrowest integer
      type of the same signedness whose range contains the column's min/max;
    * float columns are cast to the narrowest allowed float type whose range
      contains the column's min/max;
    * everything else (bool, datetime, timedelta, existing category columns,
      nullable extension dtypes, ...) is left untouched.

    A column is never widened, and a column whose min/max cannot be compared
    against the candidate ranges (empty or all-NaN) keeps its dtype. Because
    already-converted columns are skipped, calling the function twice on the
    same frame is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. It is modified in place and also returned.
    use_float16 : bool, default True
        When True, float columns may be stored as float16, which keeps only
        about three significant decimal digits. Pass False to stop at float32
        when that precision loss is not acceptable.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with converted columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns (object dtype or a dedicated string dtype) become categories.
        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int / uint / float columns are downcast. Taking
        # min()/max() of bool, datetime or unordered category columns is either
        # meaningless here or raises, so those are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        else:
            candidates, limits = unsigned_ints, np.iinfo

        for candidate in candidates:
            # Stop once the candidate is no smaller than the current type.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            # Inclusive bounds: the extreme values of a type are representable.
            # NaN min/max (empty or all-NaN column) fail both comparisons.
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

# NOTE(review): in this notebook the call below runs BEFORE the group-aggregation
# and top-200 feature-selection cells that follow, so those later steps operate
# on the downcast / categorized columns produced here.

# Apply memory reduction to both frames (each call prints a before/after summary).
df_train = reduce_mem_usage(df_train)
df_test  = reduce_mem_usage(df_test)

Memory usage of dataframe is 75.20 MB
Memory usage after optimization is: 18.76 MB
Decreased by 75.1%
Memory usage of dataframe is 61.53 MB
Memory usage after optimization is: 16.38 MB
Decreased by 73.4%


In [34]:
# Count how many training rows fall under each distinct referenced-author string.
df_train['referenced_authors'].value_counts()

referenced_authors
Kaiming He; Xiangyu Zhang; Shaoqing Ren; Jian Sun                           440
Guobao Wang; Jinyi Qi                                                       439
Luciano Floridi                                                             438
David Mackay                                                                419
Charles F. Manski                                                           339
                                                                           ... 
Li Huang; Lei Wang                                                           78
Yan-Kun Chen; Jingxuan Liu; Lingyun Peng; Yiqi Wu; Yige Xu                   76
Jack Stilgoe                                                                 76
Wang Feng; Xiang Xiang; Jian Cheng; Alan Yuille                              72
Samuel Gehman; Suchin Gururangan; Maarten Sap; Yejin Choi; Noah A. Smith     71
Name: count, Length: 3642, dtype: int64

In [35]:
import pandas as pd
import itertools

# Group-statistic features: for every categorical key, aggregate each numeric
# column over train + test combined, merge the statistics back onto the rows,
# and add "value minus group statistic" difference columns.

# 1) Remember where the training rows end so the concat can be undone later.
n_train = df_train.shape[0]

# 2) Stack train and test so that group statistics are computed over both.
df_all = pd.concat([df_train, df_test], axis=0).reset_index(drop=True)

# 3) Grouping keys: the categorical columns collected in an earlier cell.
grouping_cols = categorical_cols.copy()

# 4) Numeric columns are everything except the grouping keys and the label.
excluded_cols = set(grouping_cols) | {'is_referenced'}
numeric_cols = [c for c in df_train.columns if c not in excluded_cols]

# 5) Which aggregations to compute, and which of them also get a diff column.
aggregations = ['mean','median','max','min','std','var']
diff_stats   = ['mean','median']

# 6) Loop over key combinations. Only single-column groupings (r = 1) are
#    enabled; add 2, 3, ... to the list to also use multi-key combinations.
for r in [1]:
    for group_combo in itertools.combinations(grouping_cols, r):
        group_list = list(group_combo)
        group_name = "".join(group_list)

        # a) group & aggregate. observed=True restricts categorical keys to the
        #    values that actually occur: the left merge below can only ever
        #    match those, and it avoids both the pandas FutureWarning about the
        #    `observed` default and a cartesian product of unused categories
        #    when several keys are combined.
        agg_df = (
            df_all
            .groupby(group_list, observed=True)[numeric_cols]
            .agg(aggregations)
            .reset_index()
        )
        # b) flatten the (column, statistic) MultiIndex into plain names
        new_cols = group_list.copy()
        for numcol, stat in agg_df.columns[len(group_list):]:
            new_cols.append(f"{numcol}_agg{stat}{group_name}")
        agg_df.columns = new_cols

        # c) merge the group statistics back onto every row
        df_all = df_all.merge(agg_df, on=group_list, how='left')

        # d) create diff features; they are built together and attached with a
        #    single concat instead of one column insert at a time, which would
        #    fragment the frame.
        diff_features = {
            f"{col}_diff{stat}{group_name}":
                df_all[col] - df_all[f"{col}_agg{stat}{group_name}"]
            for col in numeric_cols
            for stat in diff_stats
        }
        df_all = pd.concat(
            [df_all, pd.DataFrame(diff_features, index=df_all.index)],
            axis=1,
        )

# 7) Split back into the original train / test row ranges.
df_train = df_all.iloc[:n_train].reset_index(drop=True)
df_test  = df_all.iloc[n_train:].reset_index(drop=True)


  .groupby(group_list)[numeric_cols]
  .groupby(group_list)[numeric_cols]
  .groupby(group_list)[numeric_cols]
  .groupby(group_list)[numeric_cols]


In [36]:
# Row / column count of the training frame after the group-aggregate features were added.
df_train.shape

(410691, 633)

In [37]:
from xgboost import XGBClassifier
import numpy as np

# Split the training frame into label and candidate features.
y = df_train['is_referenced']
X = df_train.drop(columns=['is_referenced'])

# Only numeric columns are passed to the ranking model.
X_num = X.select_dtypes(include=[np.number])

# Fit a default-depth XGBoost classifier on the full training set purely to
# obtain per-feature importances.
model = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', verbosity=0)
model.fit(X_num, y)

# Importance table, most important feature first.
imp_df = (
    pd.DataFrame({'feature': X_num.columns, 'importance': model.feature_importances_})
    .sort_values('importance', ascending=False)
)

# Names of the 200 highest-ranked features.
top200 = imp_df['feature'].head(200).tolist()

# Reduced copies: label + selected features for train, selected features for test.
df_train_top200 = df_train[['is_referenced', *top200]].copy()
df_test_top200  = df_test[top200].copy()

# From here on the notebook works with the reduced frames under the original names.
df_train, df_test = df_train_top200, df_test_top200


In [38]:
# Row / column count of the training frame after keeping the label plus the selected features.
df_train.shape

(410691, 201)

In [39]:
# Label, training features and test features for the final model.
y      = df_train['is_referenced']
X      = df_train.drop(columns=['is_referenced'])
X_test = df_test

# Class-imbalance weight: number of negative rows per positive row.
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()
scale_pos_weight = n_negative / n_positive

# Fixed hyper-parameters for the final classifier (also reused by the
# cross-validation cell).
best_params = {
    # task
    'objective':         'binary:logistic',
    'eval_metric':       'error',
    # tree shape and boosting schedule
    'n_estimators':      388,
    'learning_rate':     0.2085778686861703,
    'max_depth':         10,
    'min_child_weight':  5,
    'gamma':             1.4171005881839482e-05,
    # row / column sampling
    'subsample':         0.8440834308808433,
    'colsample_bytree':  0.7874735184704358,
    # regularisation (sklearn-API names for 'lambda' / 'alpha')
    'reg_lambda':        0.04438968972267845,
    'reg_alpha':         0.17803209157241887,
    # class weighting, reproducibility, logging
    'scale_pos_weight':  scale_pos_weight,
    'random_state':      42,
    'use_label_encoder': False,
    'verbosity':         0
}

# Fit on the complete training set (fit returns the estimator itself).
model = XGBClassifier(**best_params).fit(X, y)

# Hard 0/1 class predictions for the test rows.
preds2 = model.predict(X_test)

In [40]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import matthews_corrcoef, make_scorer
from xgboost import XGBClassifier

# Scorer that evaluates each fold with the Matthews correlation coefficient.
mcc_scorer = make_scorer(matthews_corrcoef)

# Five stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Fresh, unfitted classifier using the same hyper-parameters as the final model.
clf = XGBClassifier(**best_params)

# MCC on each held-out fold; folds are evaluated in parallel on all cores.
mcc_scores = cross_val_score(estimator=clf, X=X, y=y, scoring=mcc_scorer, cv=skf, n_jobs=-1)

print("CV MCC scores:", mcc_scores)
print("Mean MCC       :", mcc_scores.mean())


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

CV MCC scores: [0.6080689  0.62554884 0.61562595 0.60015708 0.59914916]
Mean MCC       : 0.609709985759617


In [41]:
# The sample submission supplies the id column for the output file.
submisi = pd.read_csv("/kaggle/input/gammafestipb/sample_submission.csv")

# Pair each id with the prediction at the same position.
# NOTE(review): this assumes the test rows are in the same order as the sample
# submission; confirm against the competition data.
submission = pd.DataFrame({'id': submisi['id']})
submission['is_referenced'] = preds2

# Write the file without the index column and report its shape.
submission.to_csv('submission.csv', index=False)
print("Wrote submission.csv with shape", submission.shape)

Wrote submission.csv with shape (336021, 2)
