In [1]:
import spacy
import pickle
from datetime import datetime

from rank_bm25 import BM25Okapi

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity

from src.connection import client
from src.config import index_name
from src.utils import process_query, construct_elasticsearch_query

In [2]:
# Load the spaCy model
model = 'en_core_sci_sm'
nlp = spacy.load(model)

# List of stop words to be added
stop_words = ['.', ':', ',', '(',')', '[',']','?', '\\','/', '+', '-','\"','\'','1','2',' ']
# Add stop words to nlp.vocab
for word in stop_words:
    nlp.vocab[word].is_stop = True


In [3]:
query = {"query": {"match_all": {}}}
response = client.search(index=index_name, body=query, scroll="2m", size=1000)  # Adjust size based on your needs
scroll_id = response['_scroll_id']

docs=[]
while True:
    # Process the current batch of results
    for hit in response['hits']['hits']:
        docs.append(hit["_source"])
        
    response = client.scroll(scroll_id=response['_scroll_id'], scroll='2m')
    if not response['hits']['hits']:
        break

In [4]:
# corpus = []
# step = len(docs)/10
# start_t = datetime.now()
# for i, doc in enumerate(docs):
#     text = doc["Title"] + " " + doc["Abstract"]
#     tokens = [token.text.lower() for token in nlp(text)]
#     tokenized_text = " ".join([token for token in tokens if not nlp.vocab[token].is_stop])
#     corpus.append(tokenized_text)

#     if (i+1) % step  == 0:
#         with open("data/preprocessed_corpus.pkl", "wb") as f:
#             pickle.dump(corpus, f)
            
#         end_t = datetime.now() 
#         print(f"Preprocessing progress: {(i+1) * 100 / len(docs):.1f}%. Spend {(end_t-start_t).total_seconds()/60:.2f} minutes until now")

In [5]:
# # Save preprocessed_corpus to a pickle file
# with open("data/preprocessed_corpus.pkl", "wb") as f:
#     pickle.dump(corpus, f)

In [6]:
# Load preprocessed_corpus from the pickle file
with open("data/preprocessed_corpus.pkl", "rb") as f:
    corpus = pickle.load(f)

## BM25

In [7]:
query = "What are the key cognitive abilities associated with human intelligence?"
tokens, entities = process_query(query,nlp)
tokenized_query = " ".join(tokens)
print(tokenized_query)

key cognitive abilities associated human intelligence


In [8]:
# Calculate BM25 scores
k1=1.6
b= 0.5
bm25 = BM25Okapi(corpus,k1=k1, b=b)
scores = bm25.get_scores(tokenized_query)

In [9]:
bm25_ranked_docs_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

# Specify the date range for PubDateEDAT facet search
pub_date_range = ["2019/01/01", "2023/12/31"]
start_date = datetime.strptime(pub_date_range[0], "%Y/%m/%d")
end_date = datetime.strptime(pub_date_range[1], "%Y/%m/%d")

# Filter documents based on PubDateEDAT facet
bm25_filtered_docs = [(scores[i],docs[i]) for i in bm25_ranked_docs_indices if start_date <= datetime.strptime(docs[i]["PubDateEDAT"], "%Y/%m/%d") <= end_date]

# Display top N similar documents
top_n = min(5,len(bm25_filtered_docs))
for i, (score, doc) in enumerate(bm25_filtered_docs[:top_n]):
    print(f"Rank {i + 1}: Score {score}")
    print(doc["PMID"],doc["Title"])
    print("=" * 50)

Rank 1: Score 145.52693536266966
35476562 Recurrence-Aware Long-Term Cognitive Network for Explainable Pattern Classification.
Rank 2: Score 145.38190541553934
33838025 [Teleradiology-based stroke network in Western and Southern Transdanubia in Hungary].
Rank 3: Score 145.3439609080656
33270387 [From psychoanalysis to psychodynamic psychotherapy at Albert-Prevost].
Rank 4: Score 145.33504813365943
33584450 Developing an Instrument for Assessing Self-Efficacy in Data Mining and Analysis.
Rank 5: Score 145.3080217841081
33486897 Generalized neurocognitive impairment in individuals at ultra-high risk for psychosis: The possible key role of slowed processing speed.


## TF-IDF

In [10]:
query = "What are the key cognitive abilities associated with human intelligence?"
tokens, entities = process_query(query,nlp)
tokenized_query = " ".join(tokens)
print(tokenized_query)

key cognitive abilities associated human intelligence


In [11]:
import copy

tfidf_corpus = copy.deepcopy(corpus)
tfidf_corpus.append(tokenized_query)

# TF-IDF vectorizer
vectorizer = TfidfVectorizer(norm='l2')
tfidf_embeddings = vectorizer.fit_transform(tfidf_corpus)

tfidf_scores = cosine_similarity(tfidf_embeddings[:-1], tfidf_embeddings[-1].reshape(1, -1)).flatten()

In [12]:
tfidf_ranked_docs_indices = sorted(range(len(tfidf_scores)), key=lambda i: scores[i], reverse=True)

# Specify the date range for PubDateEDAT facet search
pub_date_range = ["2019/01/01", "2023/12/31"]
start_date = datetime.strptime(pub_date_range[0], "%Y/%m/%d")
end_date = datetime.strptime(pub_date_range[1], "%Y/%m/%d")

# Filter documents based on PubDateEDAT facet
tfidf_filtered_docs = [(scores[i],docs[i]) for i in tfidf_ranked_docs_indices if start_date <= datetime.strptime(docs[i]["PubDateEDAT"], "%Y/%m/%d") <= end_date]

# Display top N similar documents
top_n = min(5,len(tfidf_filtered_docs))
for i, (score, doc) in enumerate(tfidf_filtered_docs[:top_n]):
    print(f"Rank {i + 1}: Score {score}")
    print(doc["PMID"],doc["Title"])
    print("=" * 50)

Rank 1: Score 145.52693536266966
35476562 Recurrence-Aware Long-Term Cognitive Network for Explainable Pattern Classification.
Rank 2: Score 145.38190541553934
33838025 [Teleradiology-based stroke network in Western and Southern Transdanubia in Hungary].
Rank 3: Score 145.3439609080656
33270387 [From psychoanalysis to psychodynamic psychotherapy at Albert-Prevost].
Rank 4: Score 145.33504813365943
33584450 Developing an Instrument for Assessing Self-Efficacy in Data Mining and Analysis.
Rank 5: Score 145.3080217841081
33486897 Generalized neurocognitive impairment in individuals at ultra-high risk for psychosis: The possible key role of slowed processing speed.
