In [1]:
import json
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.cm as cm
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn.functional as F
from sklearn.metrics import precision_recall_fscore_support
import ast

In [2]:
DATA_PATH = '../data/specter'

In [38]:
# Initialize empty lists to store ids and embeddings
ids = []
embeddings = []

embedding_files = [f'{DATA_PATH}/output_10k.json']
# embedding_files = ['custom output/embedding_result_train.jsonl', 'custom output/embedding_result_val.jsonl', 'custom output/embedding_result_test.jsonl']

# Load embeddings from JSON file
for file in embedding_files:
    with open(file, 'r') as f:
        for line in f:
            data = json.loads(line)
            id = data['paper_id']
            # removing the R paper because it has too many citations and makes everything else in graph look tiny
            # if id == '659408b243cec55de8d0a3bc51b81173007aa89b':
            #     continue
            embedding = np.array(data['embedding'])
            ids.append(id)
            embeddings.append(embedding)

In [39]:
with open(f'{DATA_PATH}/output_10k.json', 'r') as f:
    embeddings = [json.loads(line) for line in f]


In [40]:
# embedding_map = {}
# for obj in embeddings:
#     # obj is dict like {paper_id: 768 dimensional array embedding}
#     embedding_map[obj['paper_id']] = np.asarray(obj['embedding'])

In [41]:
with open(f'{DATA_PATH}/metadata_10k_full.json', 'r') as f:
    metadata = json.load(f)

In [42]:
len(embedding_map), len(metadata)

(9922, 9922)

In [43]:
metadata['659408b243cec55de8d0a3bc51b81173007aa89b']

{'title': 'R: A language and environment for statistical computing.',
 'abstract': 'Copyright (©) 1999–2012 R Foundation for Statistical Computing. Permission is granted to make and distribute verbatim copies of this manual provided the copyright notice and this permission notice are preserved on all copies. Permission is granted to copy and distribute modified versions of this manual under the conditions for verbatim copying, provided that the entire resulting derived work is distributed under the terms of a permission notice identical to this one. Permission is granted to copy and distribute translations of this manual into another language, under the above conditions for modified versions, except that this permission notice may be stated in a translation approved by the R Core Team.',
 'year': 2014,
 'referenceCount': 0,
 'citationCount': 295928,
 'influentialCitationCount': 40046,
 'references': '[]',
 'paper_id': '659408b243cec55de8d0a3bc51b81173007aa89b'}

In [66]:
for e in embedding_map:
    print(e)
    break

fa72afa9b2cbc8f0d7b05d52548906610ffbb9c5


In [46]:
weights = torch.stack([torch.tensor(e['embedding']) for e in embeddings]).double()
paper_ids = [e['paper_id'] for e in embeddings]

In [47]:
weights.shape

torch.Size([9922, 768])

In [48]:
specter_tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
specter_model = AutoModel.from_pretrained('allenai/specter')

In [49]:
papers = [{'title': 'BERT', 'abstract': 'We introduce a new language representation model called BERT'},
          {'title': 'Attention is all you need', 'abstract': ' The dominant sequence transduction models are based on complex recurrent or convolutional neural networks'}]

In [50]:
@torch.no_grad()
def get_embedding(paper):
    # tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    # model = AutoModel.from_pretrained('allenai/specter')
    title_abs = [d['title'] + specter_tokenizer.sep_token + (d.get('abstract') or '') for d in [paper]]
    inputs = specter_tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    result = specter_model(**inputs)
    cur_embedding = result.last_hidden_state[:, 0, :]
    return cur_embedding

In [51]:
@torch.no_grad()
def find_similar_knn(paper, weights, k=10, least = False):
    cur_embedding = get_embedding(paper)
    weights_norm = F.normalize(weights, p=2, dim=1).double() # (N, d)
    cur_em_norm = F.normalize(cur_embedding, p=2, dim=1).double() # (1, d)
    cos_sim = F.cosine_similarity(weights, cur_embedding, dim=1)
    topk = torch.topk(cos_sim, k, largest = False if least else True)
    top_indices = topk.indices
    top_values = topk.values
    return top_indices, top_values

In [105]:
def evaluate(recommended, actual, k=None):
    recommended = np.asarray(recommended)[:k] if k else np.asarray(recommended)
    actual = np.asarray(actual)
    
    true_positives = np.intersect1d(recommended, actual)
    false_positives = np.setdiff1d(recommended, actual)
    false_negatives = np.setdiff1d(actual, recommended)

    precision = len(true_positives) / (len(true_positives) + len(false_positives))
    recall = len(true_positives) / (len(true_positives) + len(false_negatives))
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

    # if k:
    #     precision_at_k = len(true_positives) / k
    #     recall_at_k = len(true_positives) / len(actual)
    #     return precision_at_k, recall_at_k, f1_score
    return precision, recall, f1_score


In [52]:
# 204e3073870fae3d05bcbc2f6a8e263d9b72e776
new_title = 'Attention Is All You Need'
new_abstract = 'The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.'

In [14]:
new_title = 'SPECTER: Document-level Representation Learning using Citation-informed Transformers'
# new_abstract = 'Representation learning is a critical ingredient for natural language processing systems. Recent Transformer language models like BERT learn powerful textual representations, but these models are targeted towards token- and sentence-level training objectives and do not leverage information on inter-document relatedness, which limits their document-level representation power. For applications on scientific documents, such as classification and recommendation, the embeddings power strong performance on end tasks. We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark.'

new_abstract = 'We propose SPECTER, a new method to generate document-level embedding of scientific documents based on pretraining a Transformer language model on a powerful signal of document-level relatedness: the citation graph. Unlike existing pretrained language models, SPECTER can be easily applied to downstream applications without task-specific fine-tuning. Additionally, to encourage further research on document-level models, we introduce SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation prediction, to document classification and recommendation. We show that SPECTER outperforms a variety of competitive baselines on the benchmark.'

In [149]:
# new_paper = {'title': new_title, 'abstract': new_abstract or ''}
# assert new_paper['title'] is not None
paper_id = '204e3073870fae3d05bcbc2f6a8e263d9b72e776'
# paper_id = '156d217b0a911af97fa1b5a71dc909ccef7a8028'
top_indices, top_values = find_similar_knn(metadata[paper_id], weights, k=10, least=False)

In [150]:
recommended_paper_ids = [paper_ids[i] for i in top_indices]
print(recommended_paper_ids[:10])

['204e3073870fae3d05bcbc2f6a8e263d9b72e776', 'b60abe57bc195616063be10638c6437358c81d1e', '9ae0a24f0928cab1554a6ac880f6b350f85be698', 'dbde7dfa6cae81df8ac19ef500c42db96c3d1edd', '43428880d75b3a14257c3ee9bda054e61eb869c0', '93499a7c7f699b6630a86fad964536f9423bb6d0', '4550a4c714920ef57d19878e31c9ebae37b049b2', 'bb669de2fce407df2f5cb2f8c51dedee3f467e04', '25eb839f39507fe6983ad3e692b2f8d93a5cb0cc', 'bf8fe437f779f2098f9af82b534aa51dc9edb06f']


In [108]:
cnt = 0
for paper_id, cos_sim in zip(recommended_paper_ids, top_values):
    title = metadata[paper_id]['title']
    # abstract = metadata[paper_id]['abstract']
    year = metadata[paper_id]['year']
    print(f'Paper ID: {paper_id}\nTitle: {title}\nYear: {year}\nCosine similarity: {cos_sim}\n')
    cnt += 1 
    if cnt == 10:
        break

Paper ID: 156d217b0a911af97fa1b5a71dc909ccef7a8028
Title: SciBERT: A Pretrained Language Model for Scientific Text
Year: 2019
Cosine similarity: 0.9986527573226276

Paper ID: 87078d95bee341a1767034d9432fb34937ecf65a
Title: SciBERT: Pretrained Contextualized Embeddings for Scientific Text
Year: 2019
Cosine similarity: 0.9512990508639432

Paper ID: 54523ff961a1ac57a86696ef9a53b3a630b482c0
Title: Domain-Specific Language Model Pretraining for Biomedical Natural Language Processing
Year: 2020
Cosine similarity: 0.9125203907711272

Paper ID: b73191adcc938cfcf20ce0327cf5cd1f539f7f81
Title: Scientific Information Extraction with Semi-supervised Neural Tagging
Year: 2017
Cosine similarity: 0.9105428039253067

Paper ID: a550ad9d7f47b65f7788287a23717a4f7c5b75c7
Title: Pretrained Language Models for Sequential Sentence Classification
Year: 2019
Cosine similarity: 0.9039615004641104

Paper ID: 81815d9a847e406f8d49fb5051e2ae1055e13208
Title: To Pretrain or Not to Pretrain: Examining the Benefits of

In [153]:
actual_references = ast.literal_eval(metadata['204e3073870fae3d05bcbc2f6a8e263d9b72e776'].get('references'))

In [77]:
# def evaluate(recommended, actual):
#     recommended = np.asarray(recommended)
#     actual = np.asarray(actual)
    
#     true_positives = np.intersect1d(recommended, actual)
#     false_positives = np.setdiff1d(recommended, actual)
#     false_negatives = np.setdiff1d(actual, recommended)

#     precision = len(true_positives) / (len(true_positives) + len(false_positives))
#     recall = len(true_positives) / (len(true_positives) + len(false_negatives))
#     f1_score = 2 * (precision * recall) / (precision + recall)

#     return precision, recall, f1_score


In [79]:
recommended_paper_ids[1]

'1956c239b3552e030db1b78951f64781101125ed'

In [154]:
k = 10
precision, recall, f1_score = evaluate(recommended_paper_ids[1:], actual_references, k=k)
print(f"Precision @ {k}: {precision}")
print(f"Recall @ {k}: {recall}")
print(f"F1 Score: {f1_score}")

Precision @ 10: 0.4444444444444444
Recall @ 10: 0.10810810810810811
F1 Score: 0.17391304347826086


### n = 50
Precision: 0.061224489795918366
Recall: 0.05454545454545454
F1 Score: 0.05769230769230769

### n = 500
Precision: 0.04408817635270541
Recall: 0.4
F1 Score: 0.07942238267148015

# NN-Based Approach to improve P/R/F1

In [64]:
embedding_map = {}
with open('../data/specter/output_10k.json', 'r') as f:
        for line in f:
            data = json.loads(line)
            id = data['paper_id']
            # removing the R paper because it has too many citations and makes everything else in graph look tiny
            # if id == '659408b243cec55de8d0a3bc51b81173007aa89b':
            #     continue
            embedding = np.array(data['embedding'])
            embedding_map[id] = embedding

In [65]:
all_paper_ids = list(embedding_map.keys())

In [67]:
len(embedding_map)

9922

In [68]:
reference_map = {}
# fetch references from metadata dict in the format {paper_id: [list of references]}
for paper_id in all_paper_ids:
    references = ast.literal_eval(metadata[paper_id].get('references'))
    reference_map[paper_id] = references or []

In [127]:
class RankNet(torch.nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RankNet, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


In [128]:
def pairwise_ranking_loss(scores, labels, margin=1.0):
    diff = labels * (margin - scores)
    loss = torch.nn.functional.relu(diff)
    return loss.mean()


In [123]:
num_recommendations = 50
K = 10

X = []
y = []

for cur_paper_id in all_paper_ids[:5]:
    cur_references = reference_map[cur_paper_id]
    cur_paper = metadata[cur_paper_id]
    top_indices, top_values = find_similar_knn(cur_paper, weights, k=num_recommendations, least=False)
    cur_recommendations = [paper_ids[i] for i in top_indices]

    # TODO remove this line 
    cur_recommendations.remove(cur_paper_id)
    
    p,r,f1 = evaluate(cur_recommendations, cur_references, k=K)
    print(f'{cur_paper_id=}{p=}{r=}{f1=}')
    
    
    
    for rec in cur_recommendations:
        X.append((cur_paper_id, rec))
        y.append(1 if rec in cur_references else -1)
    
    

cur_paper_id='fa72afa9b2cbc8f0d7b05d52548906610ffbb9c5'p=0.1r=0.03571428571428571f1=0.05263157894736841
cur_paper_id='a6cb366736791bcccc5c8639de5a8f9636bf87e8'p=0.1r=0.038461538461538464f1=0.05555555555555555
cur_paper_id='156d217b0a911af97fa1b5a71dc909ccef7a8028'p=0.0r=0.0f1=0
cur_paper_id='bee044c8e8903fb67523c1f8c105ab4718600cdb'p=0.1r=0.05555555555555555f1=0.07142857142857142
cur_paper_id='cb92a7f9d9dbcf9145e32fdfa0e70e2a6b828eb1'p=0.2r=0.11764705882352941f1=0.14814814814814817


In [145]:
embedding_dim = 768
model = RankNet(embedding_dim * 2, 256)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
# loss = torch.nn.MarginRankingLoss(
for epoch in range(1):
    for i, (cur_id, rec_id) in enumerate(X):
        x1 = torch.tensor(embedding_map[cur_id]).float().unsqueeze(0)
        x2 = torch.tensor(embedding_map[rec_id]).float().unsqueeze(0)
        score1 = model(x1, x2)
        score2 = model(x2, x1)
        print(score1)
        print(score2)
        loss = pairwise_ranking_loss(score1 - score2, y[i])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if i % 10 == 0:
            print(loss.item())
            
        break
        
    print(f'{epoch=}: {loss=}')


tensor([[-0.2746]], grad_fn=<AddmmBackward0>)
tensor([[-0.1548]], grad_fn=<AddmmBackward0>)
0.0
epoch=0: loss=tensor(0., grad_fn=<MeanBackward0>)


In [146]:
torch.__version__

'1.13.1'

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class PaperRankingNet(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(PaperRankingNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

def pairwise_ranking_loss(pos_score, neg_score):
    return torch.mean(torch.relu(1 - (pos_score - neg_score)))

# Initialize the model
input_dim = 768
hidden_dim = 512
model = PaperRankingNet(input_dim, hidden_dim)

# Define the loss function and optimizer
criterion = pairwise_ranking_loss
optimizer = optim.Adam(model.parameters())

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    for input_paper, relevant_papers in training_set:
        pos_score = model(input_paper)
        neg_scores = [model(paper) for paper in training_set if paper not in relevant_papers]
        loss = sum([criterion(pos_score, neg_score) for neg_score in neg_scores])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Evaluate the model
for input_paper, actual_references in testing_set:
    scores = {paper: model(input_paper) for paper in training_set}
    ranked_papers = sorted(scores.keys(), key=lambda x: scores[x], reverse=True)
    recommended_papers = ranked_papers[:10]
    precision = len(set(recommended_papers) & set(actual_references)) / len(recommended_papers)
    recall = len(set(recommended_papers) & set(actual_references)) / len(actual_references)
    f1_score = 2 * (precision * recall) / (precision + recall)


NameError: name 'training_set' is not defined