In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive so the findings CSVs can be read
# from it; force_remount=True asks Colab to remount even if it is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import json
import os
import pandas as pd
import re

csv_dir = '/content/drive/MyDrive/Data mining assignment/Findings'
# csv_dir = '/kaggle/input/findings/Findings'

# Files flagged by hand: their findings contained very short responses.
flagged_files = {
    "PMC4951957.csv", "PMC1413717.csv", "PMC6483904.csv", "PMC3367590.csv",
    "PMC6044682.csv", "PMC1854975.csv", "PMC5456283.csv", "PMC5778373.csv",
    "PMC3125328.csv", "PMC3291402.csv", "PMC4032042.csv",
}

dataframes = []

counter = 0

# os.listdir() returns entries in arbitrary order and may contain non-CSV
# entries. Keep only *.csv names and sort them so that the row positions of
# the final frame (which the retrieval indices rely on) are the same each run.
csv_files = sorted(f for f in os.listdir(csv_dir) if f.lower().endswith('.csv'))

for counter, file in enumerate(csv_files, start=1):
  file_path = os.path.join(csv_dir, file)
  # Use a separate name for the per-file frame so `df` only ever refers to
  # the combined result built after the loop.
  file_df = pd.read_csv(file_path)
  # CSVs without rows and manually flagged files are skipped.
  if file_df.empty or file in flagged_files:
    print("problem importing the following file", file)
    print(file_df)
    continue
  print(counter, file)
  # Missing cells are read as NaN (a float); drop them so str.join cannot
  # fail with a TypeError on a non-string element.
  findings = file_df['finding'].dropna().astype(str).to_list()
  # One output row per paper: its id plus all findings joined by newlines.
  dataframes.append(pd.DataFrame({
          "pubmed_id": [file_df['pubmed_id'].iloc[0]],
          "findings": ["\n".join(findings)]
       }))

df = pd.concat(dataframes, ignore_index=True)

1 PMC4342244.csv
2 PMC1125584.csv
3 PMC1797619.csv
4 PMC4855071.csv
5 PMC6057395.csv
6 PMC4189045.csv
7 PMC3694142.csv
8 PMC3706368.csv
9 PMC5655174.csv
10 PMC2860042.csv
11 PMC1317511.csv
12 PMC5718809.csv
13 PMC4794695.csv
14 PMC1186024.csv
15 PMC4429104.csv
16 PMC5502318.csv
17 PMC4311960.csv
18 PMC6237255.csv
19 PMC3291383.csv
20 PMC2765826.csv
21 PMC6593266.csv
22 PMC6066233.csv
23 PMC4088223.csv
24 PMC4687909.csv
25 PMC5558907.csv
26 PMC5098735.csv
27 PMC4117846.csv
28 PMC87870.csv
29 PMC3573272.csv
30 PMC4281269.csv
31 PMC1866046.csv
32 PMC4941746.csv
33 PMC5477201.csv
34 PMC2828449.csv
35 PMC3291344.csv
36 PMC3126774.csv
37 PMC2582611.csv
38 PMC6405118.csv
39 PMC4618063.csv
40 PMC5445526.csv
41 PMC6964588.csv
42 PMC112517.csv
43 PMC4125703.csv
44 PMC1895271.csv
45 PMC6928875.csv
46 PMC3065724.csv
47 PMC3985287.csv
48 PMC4789606.csv
49 PMC3879270.csv
50 PMC6631848.csv
51 PMC3970135.csv
52 PMC5285497.csv
53 PMC1900074.csv
54 PMC3338671.csv
55 PMC2942877.csv
56 PMC2765446.csv
57 P

In [None]:
# Disabled: the frame built in the loading cell only has `pubmed_id` and
# `findings`, so there is no `full_text` column to drop.
#df.drop('full_text')
# Summarize columns, non-null counts and dtypes of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348 entries, 0 to 1347
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pubmed_id  1348 non-null   object
 1   findings   1348 non-null   object
dtypes: object(2)
memory usage: 21.2+ KB


In [None]:
# Number of missing values in each column (isna is the same check as isnull).
print(df.isna().sum())

pubmed_id    0
findings     0
dtype: int64


In [None]:
# Preview the first rows: one row per paper, findings joined into one string.
df.head()

Unnamed: 0,pubmed_id,findings
0,PMC4342244,"VLPs, when injected post-exposure, significant..."
1,PMC1125584,Emergence of new human infectious diseases is ...
2,PMC1797619,In the study on the effects of alkaline phosph...
3,PMC4855071,"GroEL, a molecular adjuvant derived from Esche..."
4,PMC6057395,A review by Hemila et al. (2013) found that th...


In [None]:
# Show the complete findings text of the first paper (row label 0).
df["findings"].head(1)[0]

"VLPs, when injected post-exposure, significantly accelerate the onset of isg induction in Ebola-infected mice, allowing for timely establishment of anti-viral and anti-inflammatory states.\nVLPs trigger early induction of negative regulatory isgs, which limits excess inflammatory responses and allows for maturation of antigen presentation function, resulting in robust adaptive immune responses.\nVLPs provide post-exposure protection against Ebola infection by relieving Ebola's antagonism against type I interferons, resulting in reduced systemic inflammation and subsequent enhancement in acquired immune responses.\nThe transcription factor Irf8 is required for VLP-mediated post-exposure protection against Ebola infection, as it amplifies type I interferon gene induction and boosts interferon biological activities.\nVLPs augment type I interferon-responsive genes via Irf8-dependent manner in Ebola-infected mice, leading to reduced viral replication and inflammatory gene expression."

In [None]:
def _normalize_cell(value):
    """Replace newlines, colons and parentheses with spaces, then lowercase.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    for char in ('\n', ':', '(', ')'):
        value = value.replace(char, ' ')
    return value.lower()

# Applied element-wise to every column of the frame, so `pubmed_id` is
# normalized (lowercased) together with `findings`.
df = df.map(_normalize_cell)

In [None]:
# Treat empty strings as missing, drop every row that has a missing value,
# then renumber the rows 0..n-1 so positions and labels agree.
df = (
    df.replace('', float('nan'))
      .dropna()
      .reset_index(drop=True)
)

In [None]:
# Re-check row count and non-null counts after the cleaning steps.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1348 entries, 0 to 1347
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   pubmed_id  1348 non-null   object
 1   findings   1348 non-null   object
dtypes: object(2)
memory usage: 21.2+ KB


Sparse retrieval: TF-IDF vectors with cosine similarity (from notes)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sparse index over the cleaned findings: English stop words removed and the
# vocabulary capped at 10,000 features.
tf_idf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

# fit_transform learns the vocabulary / IDF weights and vectorizes the corpus
# in a single pass; it gives the same matrix as fit() followed by transform()
# without tokenizing every document twice.
tfidf_matrix = tf_idf_vectorizer.fit_transform(df['findings'])

# fit() returns the estimator itself, so this name has always pointed at the
# same (now fitted) object. It is kept because the pickling cell saves it.
fitted_tf_idf_vectorizer = tf_idf_vectorizer

# The matrix and the vectorizer are pickled in a later cell.

In [None]:
def sparse_retrieval(query, documentDf = df,top_k=5):
    """Return the rows of ``documentDf`` most similar to ``query`` under TF-IDF.

    The query is vectorized with the module-level ``tf_idf_vectorizer`` and
    scored against the module-level ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text query string.
        documentDf: Frame the result rows are taken from. Rows are selected
            by position, so it must be row-aligned with the frame that
            ``tfidf_matrix`` was built from. The default is bound to ``df``
            as it exists when this cell is executed.
        top_k: Maximum number of rows to return. Values <= 0 yield an empty
            frame; values above the corpus size return every row.

    Returns:
        The selected rows of ``documentDf``, most similar first.
    """
    query_vec = tf_idf_vectorizer.transform([query])

    # One similarity score per indexed document.
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]

    # Clamp top_k before slicing: a plain [-top_k:] slice turns top_k == 0
    # into [0:] and would return the whole corpus instead of nothing.
    k = max(0, min(top_k, len(scores)))
    top_indices = scores.argsort()[::-1][:k]
    print(top_indices)
    return documentDf.iloc[top_indices]

obtain_documents = sparse_retrieval('covid19 geneno testing')


[ 521  433 1313  302  542]


In [None]:
obtain_documents

Unnamed: 0,pubmed_id,findings
521,pmc6174282,the study found that diagnostic testing for co...
433,pmc5853820,the study found that rapid multiplex testing f...
1313,pmc5325537,"this retrospective study, conducted over a 3-y..."
302,pmc4713849,in the outbreak of mers-cov in the republic of...
542,pmc2992623,"in the context of the global epidemic of hiv, ..."


Dense retrieval: DPR embeddings searched with FAISS (from notes)

In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer
import numpy as np
import faiss

# Hugging Face checkpoint names of the two DPR encoders.
DPR_CONTEXT_CHECKPOINT = "facebook/dpr-ctx_encoder-single-nq-base"
DPR_QUESTION_CHECKPOINT = "facebook/dpr-question_encoder-single-nq-base"

# Context side: tokenizer + encoder used to embed the findings texts.
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(DPR_CONTEXT_CHECKPOINT)
context_encoder = DPRContextEncoder.from_pretrained(DPR_CONTEXT_CHECKPOINT)

# Question side: tokenizer + encoder used to embed the search queries.
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(DPR_QUESTION_CHECKPOINT)
question_encoder = DPRQuestionEncoder.from_pretrained(DPR_QUESTION_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/492 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/493 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch

# Run the context encoder on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
context_encoder.to(device)
# Make sure dropout is disabled while encoding.
context_encoder.eval()

context_embeddings = []

# Batch processing with GPU
batch_size = 20
# Inference only: no_grad stops PyTorch from recording an autograd graph for
# every batch, which otherwise costs memory for no benefit.
with torch.no_grad():
    for i in range(0, len(df), batch_size):
        # Positional slice of the findings column for this batch.
        batch = df['findings'].iloc[i:i+batch_size]
        # Texts longer than 512 tokens are truncated; shorter ones are padded
        # to the longest text in the batch.
        inputs = context_tokenizer(batch.tolist(), padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
        embeddings = context_encoder(**inputs).pooler_output.cpu().numpy()
        context_embeddings.append(embeddings)
        print(i)

# Stack the per-batch arrays into one array with a row per document, in the
# same order as the rows of df.
context_embeddings = np.vstack(context_embeddings)


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940
960
980
1000
1020
1040
1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340


In [None]:


# Use FAISS for efficient similarity search
# IndexFlatIP scores stored vectors against the query by inner product.
faiss_index = faiss.IndexFlatIP(context_embeddings.shape[1])
faiss_index.add(context_embeddings)

def dense_retrieval(query,documentDf = df ,top_k=5):
    """Return the rows of ``documentDf`` closest to ``query`` in DPR embedding space.

    The query is embedded with the module-level question encoder and searched
    against the module-level ``faiss_index``.

    Args:
        query: Free-text query string (truncated to 512 tokens).
        documentDf: Frame the result rows are taken from. Rows are selected
            by position, so it must be row-aligned with the embeddings that
            were added to ``faiss_index``. The default is bound to ``df`` as
            it exists when this cell is executed.
        top_k: Maximum number of rows to return. Values <= 0 yield an empty
            frame; values above the index size are clamped to it.

    Returns:
        The selected rows of ``documentDf``, best match first.
    """
    # FAISS pads the result with -1 when asked for more neighbours than it
    # stores, and iloc[-1] would silently return the LAST row. Never request
    # more than the index holds.
    k = min(top_k, faiss_index.ntotal)
    if k <= 0:
        return documentDf.iloc[[]]

    query_inputs = question_tokenizer(query, return_tensors="pt", truncation=True, max_length=512)
    query_embedding = question_encoder(**query_inputs).pooler_output.detach().numpy()
    scores, indices = faiss_index.search(query_embedding, k)

    # Only one query was searched, so take the first result row; drop any -1
    # padding defensively.
    hits = indices[0]
    hits = hits[hits >= 0]
    print(hits)

    return documentDf.iloc[hits]

# Example query
results = dense_retrieval('covid19 geneno testing')

[ 758   81  214 1136  413]


In [None]:
results

Unnamed: 0,pubmed_id,findings
758,pmc6236892,the study revealed 2363 differentially express...
81,pmc4018855,the study successfully integrated expression c...
214,pmc5348035,"two amino acid substitutions, p305l and n345d,..."
1136,pmc3353197,this study identified 933 differentially expre...
413,pmc4012804,the study developed an snp real-time rt-pcr fo...


In [None]:
import pickle

# Retrieval artifacts to persist, keyed by output file name. They are written
# in this order to the current working directory.
artifacts = {
    'tfidf_vectorizer_findings.pkl': fitted_tf_idf_vectorizer,
    'tfidf_matrix_findings.pkl': tfidf_matrix,
    'context_embeddings_findings.pkl': context_embeddings,
}

for filename, artifact in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)



In [None]:
# Save the cleaned frame without the index column. Its row order is the order
# the pickled TF-IDF matrix and embeddings were built from.
df.to_csv('dataset.csv', encoding="utf-8", index=False)

In [None]:
import zipfile

# Pack the exported CSV into a DEFLATE-compressed archive.
with zipfile.ZipFile('datasetcsv.zip', mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write('dataset.csv')