### Nikolaos Giannopoulos AM 5199
### Team: Trump Tariffed My Datasets

In [2]:
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sentence_transformers import losses
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoTokenizer
from sentence_transformers import SentenceTransformer, InputExample

# Load the data

In [19]:
node_pairs = pd.read_csv('Data/train_pairs.csv', header=None)
labels = pd.read_csv('Data/train_labels.csv', header=None)
#Split to use for training and validation
train_pairs, val_pairs, train_labels, val_labels = train_test_split(node_pairs, labels, train_size=0.8, test_size=0.2, random_state=42)

#Train pairs for SPECTER fine-tuning, val pairs for validation of each model
train_pairs = train_pairs.reset_index(drop=True)
val_pairs = val_pairs.reset_index(drop=True)
train_labels = train_labels.reset_index(drop=True)
val_labels = val_labels.reset_index(drop=True)

In [20]:
# Read the abstract of each paper
abstracts = dict()
with open('abstracts.txt', 'r', encoding='utf-8') as f:
    for line in f:
        node, abstract = line.split('|--|')
        abstracts[int(node)] = abstract

In [59]:
length_total = train_pairs.shape[0]

# Prepare the abstracts for SPECTER fine-tuning

In [4]:
abstracts_all = []

length_pairs = train_pairs.shape[0]

for i in tqdm(range(length_pairs)):
    abstract_1 = int(train_pairs.iloc[[i]][0].item())
    abstract_2 = int(train_pairs.iloc[[i]][1].item())
    linked = train_labels.iloc[[i]][0].item()
    if linked==0:
        abstracts_all.append(InputExample(texts=[abstracts[abstract_1], abstracts[abstract_2]], label=0.0))
    else:
        abstracts_all.append(InputExample(texts=[abstracts[abstract_1], abstracts[abstract_2]], label=1.0))

100%|██████████| 327585/327585 [01:25<00:00, 3830.37it/s]


# Training area for the fine-tuned SPECTER 

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [8]:
#Make sure GPU memory is clean
torch.cuda.empty_cache()

#Load pretrained SPECTER
model = SentenceTransformer('sentence-transformers/allenai-specter', device=device)

#Create DataLoader for all training pairs
train_dataloader = DataLoader(abstracts_all, shuffle=True, batch_size=16)  #Batch size 16-32

#Define cosine similarity loss
train_loss = losses.CosineSimilarityLoss(model)

#Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,                  #Try 3 because it takes a lot of time and huge load
    warmup_steps=100,
    show_progress_bar=True,
    output_path='models/specter_finetuned_full_test',
    use_amp=True
)
model.save('models/specter_finetuned_full_test')




Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
500,0.1081
1000,0.0972
1500,0.0933
2000,0.0913
2500,0.086
3000,0.087
3500,0.0874
4000,0.0861
4500,0.0851
5000,0.0844


# Model training area

In [17]:
model_name = "specter_finetuned_full_test"  # Choose: "bert", "scibert", "specter" and 'specter_finetuned_test_full"
batch_size = 32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----- INPUT -----
nodes = list(abstracts.keys())
texts = [abstracts[int(i)] for i in nodes]
txt2feat = dict()
article_textual_embeddings = []

# -------------- BATCH UTILS --------------
def batched(iterable, batch_size):
    for i in range(0, len(iterable), batch_size):
        yield iterable[i:i + batch_size]

# -------------- MODEL LOAD AND BATCH EMBEDDING --------------
if model_name == "bert":
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
    model = AutoModel.from_pretrained('bert-base-uncased', torch_dtype=torch.float16).to(device)
    model.eval()

    for batch_nodes in tqdm(list(batched(nodes, batch_size))):
        texts_batch = [abstracts[int(i)] for i in batch_nodes]
        tokens = tokenizer(texts_batch, return_tensors='pt', padding=True, truncation=True, max_length=250).to(device)
        with torch.no_grad():
            outputs = model(**tokens).pooler_output.cpu().numpy()
        for i, node in enumerate(batch_nodes):
            txt2feat[int(node)] = outputs[i]

elif model_name == "scibert":
    tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
    model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased').to(device)
    model.eval()

    for batch_nodes in tqdm(list(batched(nodes, batch_size))):
        texts_batch = [abstracts[int(i)] for i in batch_nodes]
        tokens = tokenizer(texts_batch, return_tensors='pt', padding=True, truncation=True, max_length=250).to(device)
        with torch.no_grad():
            outputs = model(**tokens).pooler_output.cpu().numpy()
        for i, node in enumerate(batch_nodes):
            txt2feat[int(node)] = outputs[i]

elif model_name == "specter":
    model = SentenceTransformer('sentence-transformers/allenai-specter').to(device)
    model.eval()

    for batch_nodes in tqdm(list(batched(nodes, batch_size))):
        texts_batch = [abstracts[int(i)] for i in batch_nodes]
        embeddings = model.encode(texts_batch, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=False)
        for i, node in enumerate(batch_nodes):
            txt2feat[int(node)] = embeddings[i]

elif model_name == "specter_finetuned_full_test":
    model = SentenceTransformer('models/specter_finetuned_full_test').to(device)
    model.eval()

    for batch_nodes in tqdm(list(batched(nodes, batch_size))):
        texts_batch = [abstracts[int(i)] for i in batch_nodes]
        embeddings = model.encode(texts_batch, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=False)
        for i, node in enumerate(batch_nodes):
            txt2feat[int(node)] = embeddings[i]
            
# --------------OOUTPUT AS ARRAY --------------
for i in nodes:
    article_textual_embeddings.append(txt2feat[int(i)])

  3%|▎         | 116/4329 [00:29<18:07,  3.87it/s]


KeyboardInterrupt: 

In [13]:
torch.save(article_textual_embeddings, f'Data/article_textual_embeddings_specter_test.pt')

# Test models

In [21]:
#Create the positive and negative sambles
pos_samples = []
neg_samples = []
length_total = val_pairs.shape[0]

for i in tqdm(range(length_total)):
    linked = val_labels.iloc[[i]][0].item()
    if linked==1:
        pos_samples.append([int(val_pairs.iloc[[i]][0].item()),int(val_pairs.iloc[[i]][1].item())])
    else:
        neg_samples.append([int(val_pairs.iloc[[i]][0].item()),int(val_pairs.iloc[[i]][1].item())])

100%|██████████| 109196/109196 [00:28<00:00, 3889.74it/s]


In [26]:
#Load embeddings
full_array = torch.load("Data/article_textual_embeddings_specter_test.pt", weights_only=False)
simspos = []
simsneg = []

#Compute similarities for the first 500 negative pairs
for neg_sample in tqdm(neg_samples[:500]):
    #Get the embeddings of the two articles in the pair
    node_paper_embedding_1 = np.expand_dims(full_array[neg_sample[0]], axis=0)
    node_paper_embedding_2 = np.expand_dims(full_array[neg_sample[1]], axis=0)

    #Compute cosine similarity
    paper_sim = cosine_similarity(node_paper_embedding_1, node_paper_embedding_2)[0]
    simsneg.append(paper_sim)

#Print average similarity
print("Average cosine similarity on negative:", sum(simsneg) / len(simsneg))

for pos_sample in tqdm(pos_samples[:500]):
    
    node_paper_embedding_1 = np.expand_dims(full_array[pos_sample[0]], axis=0)
    node_paper_embedding_2 = np.expand_dims(full_array[pos_sample[1]], axis=0)
    simspos.append(cosine_similarity(node_paper_embedding_1, node_paper_embedding_2)[0])

print("Average cosine similarity on positive:", sum(simspos) / len(simspos))

100%|██████████| 500/500 [00:00<00:00, 3897.35it/s]


Average cosine similarity on negative: [0.14071403]


100%|██████████| 500/500 [00:00<00:00, 3395.02it/s]

Average cosine similarity on positive: [0.82576215]



