In [9]:
!pip install faiss-cpu transformers nltk sentence-transformers



In [10]:
import os
import re
import ast
import pandas as pd
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm

import faiss
import numpy as np
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

nltk.download('stopwords')
tqdm.pandas()

base_path = "./"
os.chdir(base_path)

# Load curated data
df = pd.read_csv("smoking_covid_curated.csv")

# Clean text function
stop_words = set(stopwords.words('english'))  # Define once for speed
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))  # Remove special chars
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply cleaning to abstract
df['clean_abstract'] = df['abstract'].progress_apply(clean_text)

# Apply cleaning to full_text (JSON/dict-like field)
def process_full_text(x):
    if pd.isna(x) or x == {}:
        return ""
    try:
        if isinstance(x, str):
            x = ast.literal_eval(x)  # Convert string to dict
        if not isinstance(x, dict):
            return ""
        return ' '.join(clean_text(t) for section in x.values() for t in section)
    except Exception:
        return ""

df['clean_full_text'] = df['full_text'].progress_apply(process_full_text)


[nltk_data] Downloading package punkt to /home/anton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|███████████████████████████████████████████████████████████████████████████| 11354/11354 [00:01<00:00, 6194.61it/s]
100%|███████████████████████████████████████████████████████████████████████████| 11354/11354 [00:04<00:00, 2437.02it/s]


In [11]:
df['text'] = df.progress_apply(
    lambda x: ' '.join(part for part in [str(x.get('title', '')), x.get('clean_abstract', ''), x.get('clean_full_text', '')] if part),
    axis=1
)

100%|██████████████████████████████████████████████████████████████████████████| 11354/11354 [00:00<00:00, 70319.61it/s]


In [12]:
df.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files,full_text,clean_abstract,clean_full_text,text
0,8qnrcgnk,Heme oxygenase-1 and carbon monoxide in pulmon...,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,PMC,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json,{'introduction': ['The heme oxygenase-1/carbon...,heme oxygenase ho inducible stress protein con...,heme oxygenasecarbon monoxide hoco system rece...,Heme oxygenase-1 and carbon monoxide in pulmon...
1,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json,{'methods': ['Three different groups were stud...,background methods human metapneumovirus hmpv ...,three different groups studied first group con...,Relevance of human metapneumovirus in exacerba...
2,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json,{'methods': ['The study was conducted as a par...,background present study aimed provide informa...,study conducted part omnibus survey december c...,Public awareness of risk factors for cancer am...
3,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json,{'references': []},date least different microrna mirna genes disc...,,Involvement of microRNAs in physiological and ...
4,1h6jz1h5,Plant Plastid Engineering,Genetic material in plants is distributed into...,2010-11-03,PMC,"Wani, Shabir H.; Haider, Nadia; Kumar, Hitesh;...",document_parses/pdf_json/79979652a864cef3a4134...,document_parses/pmc_json/PMC3048312.xml.json,"{'introduction': [""Genetic material in plants ...",genetic material plants distributed nucleus pl...,genetic material plants distributed nucleus ch...,Plant Plastid Engineering genetic material pla...


In [13]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
df['embedding'] = df['text'].progress_apply(lambda x: embedding_model.encode(x, show_progress_bar=False))
normalized_embeddings = normalize(np.vstack(df['embedding'].values))

# Build FAISS index
index = faiss.IndexFlatIP(384)
index.add(normalized_embeddings)

100%|█████████████████████████████████████████████████████████████████████████████| 11354/11354 [02:07<00:00, 88.83it/s]


In [17]:
tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
model = AutoModelForQuestionAnswering.from_pretrained("allenai/biomed_roberta_base")

qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/656M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [15]:
def chunk_text(text, max_tokens=400, overlap=50):
    sentences = sent_tokenize(text)
    chunks = []
    chunk = []
    tokens = 0

    for sentence in sentences:
        n_tokens = len(sentence.split())
        if tokens + n_tokens > max_tokens:
            chunks.append(' '.join(chunk))
            chunk = chunk[-overlap:]  # Retain overlap
            tokens = sum(len(c.split()) for c in chunk)
        chunk.append(sentence)
        tokens += n_tokens

    if chunk:
        chunks.append(' '.join(chunk))
    return chunks


def extract_answer(question, context_chunk):
    try:
        result = qa_pipeline(question=question, context=context_chunk, truncation=True)
        return result['answer'], result['score']
    except Exception as e:
        return "No answer found", 0.0

def answer_question(question, top_k=5, alpha=0.5):
    # Step 1: Retrieve top documents
    question_embedding = normalize(embedding_model.encode([question]))
    D, I = index.search(question_embedding, top_k)

    answers = []
    for i, idx in enumerate(I[0]):
        paper = df.iloc[idx]
        chunks = chunk_text(paper['text'])

        for chunk in chunks:
            answer, score = extract_answer(question, chunk)
            if answer and score > 0:
                sim = D[0][i]  # cosine similarity
                confidence = alpha * sim + (1 - alpha) * score
                answers.append({
                    "source": paper['title'],
                    "answer": answer,
                    "confidence": confidence,
                    "qa_score": score,
                    "semantic_similarity": sim,
                    "context": chunk
                })

    return sorted(answers, key=lambda x: x["confidence"], reverse=True)

In [18]:
questions = [
    "What is COVID-19?",
    "What is the effect of nicotine on ACE2 receptors?",
    "How does vaping affect lung inflammation in coronavirus cases?"
]

for q in questions:
    print(f"\n\033[1mQuestion:\033[0m {q}")
    results = answer_question(q)
    for i, r in enumerate(results[:2]):
        print(f"\n\033[1mAnswer {i+1}:\033[0m {r['answer']}")
        print(f"\033[1mSource:\033[0m {r['source']}")
        print(f"\033[1mConfidence:\033[0m {r['confidence']:.2f}")
        print(f"\033[1mContext:\033[0m [...]{r['context'][:300]}[...]\n")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



[1mQuestion:[0m What is COVID-19?

[1mAnswer 1:[0m 19
[1mSource:[0m COVID-19 and smoking
[1mConfidence:[0m 0.36
[1mContext:[0m [...]COVID-19 and smoking[...]


[1mAnswer 2:[0m 19
[1mSource:[0m COVID-19 and Smoking
[1mConfidence:[0m 0.36
[1mContext:[0m [...]COVID-19 and Smoking[...]


[1mQuestion:[0m What is the effect of nicotine on ACE2 receptors?

[1mAnswer 1:[0m : Possible Relevance
[1mSource:[0m Late Breaking Abstract-ACE2 Overexpression Modulates Nicotine Receptors In Cell Type Specific Manner: Possible Relevance In Covid-19
[1mConfidence:[0m 0.38
[1mContext:[0m [...]Late Breaking Abstract-ACE2 Overexpression Modulates Nicotine Receptors In Cell Type Specific Manner: Possible Relevance In Covid-19[...]


[1mAnswer 2:[0m nicotine as a mediator
[1mSource:[0m COVID-19 and nicotine as a mediator of ACE-2
[1mConfidence:[0m 0.36
[1mContext:[0m [...]COVID-19 and nicotine as a mediator of ACE-2[...]


[1mQuestion:[0m How does vaping affect lung infla

model.safetensors:   0%|          | 0.00/656M [00:00<?, ?B/s]