In [1]:
import spacy
import pickle
from datetime import datetime
import numpy as np

import torch
from sentence_transformers import SentenceTransformer, util

from src.connection import client
from src.config import index_name
from src.utils import process_query, construct_elasticsearch_query

In [2]:
# Name of the spaCy pipeline to load.
# NOTE: `model` is rebound to a SentenceTransformer instance further down
# in this notebook; only `nlp` is used for tokenization afterwards.
model = 'en_core_sci_sm'
nlp = spacy.load(model)

# Extra tokens (punctuation, a couple of digits, a single space) that should
# be treated as stop words in addition to the pipeline's built-in ones.
stop_words = [
    '.', ':', ',', '(', ')', '[', ']', '?',
    '\\', '/', '+', '-', '\"', '\'', '1', '2', ' ',
]

# Flag each extra token as a stop word in the pipeline vocabulary.
for word in stop_words:
    nlp.vocab[word].is_stop = True


In [3]:
# Fetch every document of the index through the Elasticsearch scroll API.
# Each page holds up to `size` hits; the scroll context is kept alive for
# two minutes between consecutive requests.
query = {"query": {"match_all": {}}}
response = client.search(index=index_name, body=query, scroll="2m", size=1000)  # Adjust size based on your needs
scroll_id = response['_scroll_id']

docs = []
try:
    # An empty page signals that the scroll has been exhausted.
    while response['hits']['hits']:
        docs.extend(hit["_source"] for hit in response['hits']['hits'])

        response = client.scroll(scroll_id=scroll_id, scroll='2m')
        # The scroll id may change between pages, so always keep the latest one.
        scroll_id = response['_scroll_id']
finally:
    # Release the server-side scroll context instead of letting it linger
    # until its keep-alive timeout expires.
    client.clear_scroll(scroll_id=scroll_id)

In [4]:
# corpus = []
# step = len(docs)/10
# start_t = datetime.now()
# for i, doc in enumerate(docs):
#     text = doc["Title"] + " " + doc["Abstract"]
#     tokens = [token.text.lower() for token in nlp(text)]
#     tokenized_text = " ".join([token for token in tokens if not nlp.vocab[token].is_stop])
#     corpus.append(tokenized_text)

#     if (i+1) % step  == 0:
#         with open("data/preprocessed_corpus.pkl", "wb") as f:
#             pickle.dump(corpus, f)
            
#         end_t = datetime.now() 
#         print(f"Preprocessing progress: {(i+1) * 100 / len(docs):.1f}%. Spend {(end_t-start_t).total_seconds()/60:.2f} minutes until now")

In [5]:
# # Save preprocessed_corpus to a pickle file
# with open("data/preprocessed_corpus.pkl", "wb") as f:
#     pickle.dump(corpus, f)

In [6]:
# Restore the preprocessed corpus that an earlier run cached on disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open("data/preprocessed_corpus.pkl", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

## SentenceTransformers

In [7]:
# Example natural-language question used to exercise the retrieval pipeline.
query = "What are the key cognitive abilities associated with human intelligence?"

# process_query yields a (tokens, entities) pair; only the tokens are needed
# to build the text that gets embedded.
tokens, entities = process_query(query, nlp)

# Re-join the tokens into one space-separated string and show the result.
tokenized_query = " ".join(tokens)
print(tokenized_query)

key cognitive abilities associated human intelligence


In [8]:
# Pre-trained sentence-embedding model.
# This rebinds `model`, which earlier held the spaCy pipeline name.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Embed the preprocessed query as a torch tensor so it can be compared
# against the document embeddings.
query_embedding = model.encode(
    tokenized_query,
    convert_to_tensor=True,
)

## V2

In [9]:
# Encode documents in batches
# batch_size = 64
# corpus_batches = [corpus[i:i + batch_size] for i in range(0, len(corpus), batch_size)]

# doc_embeddings_list = []
# start_t = datetime.now()
# for i, batch in enumerate(corpus_batches):
#     doc_embeddings_batch = model.encode(batch, convert_to_tensor=True)
#     doc_embeddings_list.append(doc_embeddings_batch)
    
#     if (i+1) % 100  == 0:
#         end_t = datetime.now()
#         print(f"progress: {(i+1) * 100 / len(corpus_batches):.1f}%. Spend {(end_t-start_t).total_seconds()/60:.2f} minutes until now")
        
# doc_embeddings = torch.cat(doc_embeddings_list)
# np.save('data/doc_embeddings_v2.npy', doc_embeddings.numpy())

## V1

In [10]:
# Load pre-trained SentenceTransformer model
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode documents and query
# doc_embeddings = model.encode(corpus, convert_to_tensor=True)

# np.save('data/doc_embeddings_v1.npy', doc_embeddings)

In [11]:
# np.save('data/doc_embeddings.npy', doc_embeddings.numpy())

In [12]:
# Load the document embeddings that were previously saved to disk as a
# NumPy array (see the commented-out encoding cells for how they were built).
# NOTE(review): rows are presumably in the same order as `docs` — confirm,
# because later cells index `docs` by embedding row position.
doc_embeddings = np.load('data/doc_embeddings_v1.npy')

In [13]:
# Calculate cosine similarity between the query and every document.
# The query embedding is moved to the CPU first: the document embeddings are
# loaded from disk as a NumPy array (CPU memory), and `.numpy()` only works on
# CPU tensors, so a query encoded on a GPU would otherwise fail here.
# `.cpu()` is a no-op when the tensor already lives on the CPU.
st_scores = util.pytorch_cos_sim(query_embedding.cpu(), doc_embeddings).numpy().flatten()

In [14]:
def _pub_date(doc, date_format="%Y/%m/%d"):
    """Return the document's ``PubDateEDAT`` parsed as a datetime.

    Returns ``None`` when the field is missing, not a string, or does not
    match ``date_format``, so one bad record cannot abort the whole search.
    """
    try:
        return datetime.strptime(doc["PubDateEDAT"], date_format)
    except (KeyError, TypeError, ValueError):
        return None


# Scores are positional: st_scores[i] is taken to describe docs[i]. Fail loudly
# if the two collections differ in size instead of returning misaligned hits.
if len(st_scores) != len(docs):
    raise ValueError(
        f"Score/document mismatch: {len(st_scores)} scores for {len(docs)} documents"
    )

# Document indices ordered from most to least similar; the stable sort keeps
# tied scores in their original order.
ranked_docs_indices = np.argsort(-st_scores, kind="stable").tolist()

# Specify the date range for PubDateEDAT facet search
start_date = datetime.strptime("2016/01/01", "%Y/%m/%d")
end_date = datetime.strptime("2016/12/31", "%Y/%m/%d")

# Filter documents based on PubDateEDAT facet, preserving the ranking order.
# Documents without a usable publication date are skipped.
filtered_docs = []
for i in ranked_docs_indices:
    pub_date = _pub_date(docs[i])
    if pub_date is not None and start_date <= pub_date <= end_date:
        filtered_docs.append((st_scores[i], docs[i]))

# Display top N similar documents
top_n = min(5, len(filtered_docs))
for rank, (score, doc) in enumerate(filtered_docs[:top_n], start=1):
    print(f"Rank {rank}: Score {score}")
    print(doc["PMID"],doc["Title"])
    print("=" * 50)

Rank 1: Score 0.5498866438865662
27150661 Executive function and intelligence in the resolution of temporary syntactic ambiguity: an individual differences investigation.
Rank 2: Score 0.546873927116394
27750571 Cognitive Deficits Post-Traumatic Brain Injury and Their Association with Injury Severity and Gray Matter Volumes.
Rank 3: Score 0.5415911674499512
27809665 Executive abilities in children with congenital visual impairment in mid-childhood.
Rank 4: Score 0.5327783226966858
27726852 Does the way we read others' mind change over the lifespan? Insights from a massive web poll of cognitive skills from childhood to late adulthood.
Rank 5: Score 0.5259026885032654
27825737 Profile of cognitive function in adults with duchenne muscular dystrophy.
