In [2]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# Paths 
abstracts_path = "/kaggle/input/nlp-cse-uoi-2025/data_new/abstracts.txt"
edgelist_path =  "/kaggle/input/nlp-cse-uoi-2025/data_new/edgelist.txt"

# 1. Load abstracts
abstracts = {}
with open(abstracts_path, "r", encoding="utf-8") as f:
    for line in f:
        pid, txt = line.strip().split("|--|", maxsplit=1)
        abstracts[int(pid)] = txt.strip()

# 2. Build positive pairs
pos = []
pos_set = set()
with open(edgelist_path, "r", encoding="utf-8") as f:
    for line in f:
        a, b = map(int, line.strip().split(","))
        pos.append({
            "sentence1": abstracts[a],
            "sentence2": abstracts[b],
            "label": 1
        })
        pos_set.add((a, b))

df_pos = pd.DataFrame(pos)

# 3. Generate negative pairs (same count as positives)
all_ids = list(abstracts.keys())
neg = set()
while len(neg) < len(df_pos):
    a, b = random.sample(all_ids, 2)
    if (a, b) not in pos_set and (b,a) not in pos_set:
        neg.add((a, b))

df_neg = pd.DataFrame([
    {"sentence1": abstracts[a], "sentence2": abstracts[b], "label": 0}
    for a, b in neg
])

# 4. Combine & shuffle
df = pd.concat([df_pos, df_neg], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 5. Train/dev split (85/15, stratified by label)
train_df, dev_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df["label"]
)

# 6. Save CSVs
train_df.to_csv("train.csv", index=False)
dev_df.to_csv( "dev.csv",   index=False)

print(f"Train size: {len(train_df)}, Dev size: {len(dev_df)}")
print("\nSample from train.csv:")
print(train_df.head())
print("\nSample from dev.csv:")
print(dev_df.head())


Train size: 1856323, Dev size: 327587

Sample from train.csv:
                                                 sentence1  \
807839   View synthesis is a process for generating nov...   
1531135  Extensional (table) constraints are an importa...   
1304282  We present a simple variant of the k-d tree wh...   
656790   This paper presents a passive depth map comput...   
445727   We treat the text summarization problem as max...   

                                                 sentence2  label  
807839   The reduction of inherent ambiguities in struc...      0  
1531135  Special-purpose constraint propagation algorit...      1  
1304282  Estimating the age of a human from the capture...      1  
656790   This paper presents a comparison of six machin...      0  
445727                                                          1  

Sample from dev.csv:
                                                 sentence1  \
785885   We present an algorithm for plane-based self-c...   
680598   Al

In [3]:
import os, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)


PRETRAINED_MODEL = "distilbert-base-uncased"
TRAIN_FILE,  VALID_FILE  = "/kaggle/working/train.csv", "/kaggle/working/dev.csv"
OUTPUT_DIR    = "./distilbert-finetuned"
NUM_EPOCHS    = 3
BATCH_SIZE    = 16
LEARNING_RATE = 2e-5
MAX_LENGTH    = 128
os.environ["HF_DATASETS_CACHE"] = "./hf_cache"     # local cache
os.environ["WANDB_DISABLED"]    = "true"           # no wandb prompt


# 1. Load CSVs
ds = load_dataset(
    "csv",
    data_files={"train": TRAIN_FILE, "validation": VALID_FILE},
    cache_dir="./hf_cache",
)

# 2. Tokeniser + preprocessing
tok = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

def preprocess(batch):
    enc = tok(
        [str(x) for x in batch["sentence1"]],
        [str(x) for x in batch["sentence2"]],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    enc["labels"] = [float(y) for y in batch["label"]]   # regression target
    return enc

ds_tok = ds.map(
    preprocess,
    batched=True,
    remove_columns=["sentence1", "sentence2", "label"],  # drop raw text/old label
)

collate = DataCollatorWithPadding(tok)

# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    problem_type="regression",
    num_labels=1,
)

# 4. Metric (NumPy Pearson)
def compute_metrics(pred):
    p, l = pred
    return {"pearson": float(np.corrcoef(p.flatten(), l.astype(float))[0, 1])}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    do_train=True, do_eval=True,
    num_train_epochs=1,                
    per_device_train_batch_size=32,    
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    save_strategy="steps",
    save_steps=10000,
    eval_strategy="steps",
    eval_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    save_total_limit=1,
    greater_is_better=True,
    dataloader_num_workers=4,
    fp16=True,                         
)



# 6. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tok,
    data_collator=collate,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7. Train & save
trainer.train()
model.save_pretrained(OUTPUT_DIR)
tok.save_pretrained(OUTPUT_DIR)
# ——— LOCAL OUTPUT_DIR ———
model.save_pretrained(OUTPUT_DIR)
tok.save_pretrained(OUTPUT_DIR)



2025-05-25 11:41:29.202428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748173289.379741      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748173289.435303      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1856323 [00:00<?, ? examples/s]

Map:   0%|          | 0/327587 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks..

Step,Training Loss,Validation Loss,Pearson
10000,0.089,0.086173,0.811838
20000,0.0759,0.076359,0.83429
30000,0.0746,0.070079,0.848589
40000,0.072,0.067341,0.855474
50000,0.0678,0.064604,0.861835


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': 'takizawa143/fine-tuned-bert/distilbert-finetuned'. Use `repo_type` argument if needed.

In [4]:
#abstract text cleaning for upper case and delimeters
cleaned_abstracts = []
with open("/kaggle/input/nlp-cse-uoi-2025/data_new/abstracts.txt", "r", encoding="utf-8") as f:
    for line in f:
       
        parts = line.split("|--|")
        if len(parts) == 2:
            abstract_text = parts[1].strip()
        else:
            abstract_text = line.strip() 
        # 2) Convert to lowercase if using an uncased model
        abstract_text = abstract_text.lower()

        cleaned_abstracts.append(abstract_text)
    print(f"Abstract {1}:\n{abstract_text}\n")
    size=len(cleaned_abstracts)
    print(size)


Abstract 1:
in this work, we face the problem of unsupervised domain adaptation with a novel deep learning approach which leverages on our finding that entropy minimization is induced by the optimal alignment of second order statistics between source and target domains. we formally demonstrate this hypothesis and, aiming at achieving an optimal alignment in practical cases, we adopt a more principled strategy which, differently from the current euclidean approaches, deploys alignment along geodesics. our pipeline can be implemented by adding to the standard classification loss (on the labeled source domain), a source-to-target regularizer that is weighted in an unsupervised and data-driven fashion. we provide extensive experiments to assess the superiority of our framework on standard domain and modality adaptation benchmarks.

138499


In [5]:
import time
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# 1)  list of ~138k cleaned abstracts
print(f"Total abstracts: {len(cleaned_abstracts)}")

# 2) point here at your fine-tuned model dir
model_dir = "/kaggle/working/distilbert-finetuned"

# 3) load tokenizer + model (base model, so we get hidden-states)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModel.from_pretrained(model_dir)

# 4) to GPU 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

batch_size = 64
all_embeds = []

start = time.time()
for i in tqdm(range(0, len(cleaned_abstracts), batch_size), desc="Embedding"):
    batch = cleaned_abstracts[i : i + batch_size]
    
    # tokenize + move to device
    inputs = tokenizer(batch,
                       return_tensors="pt",
                       padding=True,
                       truncation=True,
                       max_length=128).to(device)
    
    with torch.no_grad():
        out = model(**inputs)
    
    # CLS token is at position 0
    cls_emb = out.last_hidden_state[:, 0, :].cpu()  
    all_embeds.append(cls_emb)

# concat into one tensor
embeddings = torch.cat(all_embeds, dim=0)  # (138000, hidden_size)
end = time.time()

print(f"Done in {end-start:.1f}s — embeddings shape: {embeddings.shape}")

# save to disk (as NumPy .npy)
np.save("cls_embeddings.npy", embeddings.numpy())
print("Saved → cls_embeddings.npy")


Total abstracts: 138499


Embedding: 100%|██████████| 2165/2165 [04:48<00:00,  7.51it/s]


Done in 288.6s — embeddings shape: torch.Size([138499, 768])
Saved → cls_embeddings.npy


In [None]:
import os, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)


PRETRAINED_MODEL = "allenai/scibert_scivocab_uncased"
TRAIN_FILE,  VALID_FILE  = "/kaggle/working/train.csv", "/kaggle/working/dev.csv"
OUTPUT_DIR    = "./scibert-finetuned"
NUM_EPOCHS    = 3
BATCH_SIZE    = 16
LEARNING_RATE = 2e-5
MAX_LENGTH    = 128
os.environ["HF_DATASETS_CACHE"] = "./hf_cache"     # local cache
os.environ["WANDB_DISABLED"]    = "true"           # no wandb prompt

# 1. Load CSVs
ds = load_dataset(
    "csv",
    data_files={"train": TRAIN_FILE, "validation": VALID_FILE},
    cache_dir="./hf_cache",
)

# 2. Tokeniser + preprocessing
tok = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)

def preprocess(batch):
    enc = tok(
        [str(x) for x in batch["sentence1"]],
        [str(x) for x in batch["sentence2"]],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    enc["labels"] = [float(y) for y in batch["label"]]   # regression target
    return enc

ds_tok = ds.map(
    preprocess,
    batched=True,
    remove_columns=["sentence1", "sentence2", "label"],  # drop raw text/old label
)

collate = DataCollatorWithPadding(tok)

# 3. Model
model = AutoModelForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL,
    problem_type="regression",
    num_labels=1,
)

# 4. Metric (NumPy Pearson)
def compute_metrics(pred):
    p, l = pred
    return {"pearson": float(np.corrcoef(p.flatten(), l.astype(float))[0, 1])}

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    do_train=True, do_eval=True,
    num_train_epochs=2,                 # ← one epoch first
    per_device_train_batch_size=32,     # ← or 64 if GPU fits
    per_device_eval_batch_size=64,
    learning_rate=2e-5,
    save_strategy="steps",
    save_steps=10000,
    eval_strategy="steps",
    eval_steps=10000,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    save_total_limit=1,
    greater_is_better=True,
    dataloader_num_workers=4,
    fp16=True,                          # only if GPU is on
)



# 6. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    tokenizer=tok,
    data_collator=collate,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# 7. Train & save
trainer.train()
model.save_pretrained(OUTPUT_DIR)
tok.save_pretrained(OUTPUT_DIR)
# LOCAL OUTPUT_DIR ———
model.save_pretrained(OUTPUT_DIR)
tok.save_pretrained(OUTPUT_DIR)



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

Map:   0%|          | 0/1856323 [00:00<?, ? examples/s]

Map:   0%|          | 0/327587 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork

Step,Training Loss,Validation Loss


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [1]:
import os
import pickle
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm.auto import tqdm

# ─── Config ─────────────────────────────────────────
ABSTRACTS_TXT = '/kaggle/input/nlp-cse-uoi-2025/data_new/abstracts.txt'
OUTPUT_PKL    = 'scibert_embeddings.pkl'
PRETRAINED    = 'allenai/scibert_scivocab_uncased'  # or _cased

# ─── 1. Load abstracts ─────────────────────────────
abstracts = []
with open(ABSTRACTS_TXT, 'r', encoding='utf-8') as f:
    for ln in f:
        parts = ln.split('|--|')
        txt = parts[1].strip() if len(parts)==2 else ln.strip()
        abstracts.append(txt)

# ─── 2. Load SciBERT ───────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
model     = AutoModel.from_pretrained(PRETRAINED)
model.eval()
if torch.cuda.is_available():
    model.to('cuda')

# ─── 3. Compute embeddings ──────────────────────────
batch_size = 16
embeddings = {}  # paper_id -> vector

with torch.no_grad():
    for i in tqdm(range(0, len(abstracts), batch_size), desc="Encoding abstracts"):
        batch_texts = abstracts[i : i + batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k,v in inputs.items()}

        outputs = model(**inputs)
        # take the [CLS] token representation
        cls_emb = outputs.last_hidden_state[:, 0, :].cpu().numpy()

        for idx, vec in enumerate(cls_emb, start=i):
            embeddings[idx] = vec

# ─── 4. Save to disk ────────────────────────────────
with open(OUTPUT_PKL, 'wb') as f:
    pickle.dump(embeddings, f)

print(f"✅ Saved {len(embeddings)} SciBERT embeddings to {OUTPUT_PKL}")


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

2025-05-25 22:05:46.747183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748210746.937994      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748210746.993709      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Encoding abstracts:   0%|          | 0/8657 [00:00<?, ?it/s]

✅ Saved 138499 SciBERT embeddings to scibert_embeddings.pkl
