In [45]:
!pip install transformers torch sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [15]:
import os
import re
import ast
import pandas as pd
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm

nltk.download('stopwords')
tqdm.pandas()

base_path = "./"
os.chdir(base_path)

# Load curated data
df = pd.read_csv("smoking_covid_curated.csv")

# Clean text function
stop_words = set(stopwords.words('english'))  # Define once for speed
def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))  # Remove special chars
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Apply cleaning to abstract
df['clean_abstract'] = df['abstract'].progress_apply(clean_text)

# Apply cleaning to full_text (JSON/dict-like field)
def process_full_text(x):
    if pd.isna(x) or x == {}:
        return ""
    try:
        if isinstance(x, str):
            x = ast.literal_eval(x)  # Convert string to dict
        if not isinstance(x, dict):
            return ""
        return ' '.join(clean_text(t) for section in x.values() for t in section)
    except Exception:
        return ""

df['clean_full_text'] = df['full_text'].progress_apply(process_full_text)


[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████████████████████████████████████████████████████████████████████| 11354/11354 [00:00<00:00, 14631.58it/s]
100%|███████████████████████████████████████████████████████████████████████████| 11354/11354 [00:02<00:00, 4802.77it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11354 entries, 0 to 11353
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   cord_uid         11354 non-null  object
 1   title            11353 non-null  object
 2   abstract         10625 non-null  object
 3   publish_time     11342 non-null  object
 4   source_x         11354 non-null  object
 5   authors          11285 non-null  object
 6   pdf_json_files   4540 non-null   object
 7   pmc_json_files   3961 non-null   object
 8   full_text        11354 non-null  object
 9   clean_abstract   11354 non-null  object
 10  clean_full_text  11354 non-null  object
dtypes: object(11)
memory usage: 975.9+ KB





In [35]:
df['text'] = df.progress_apply(
    lambda x: ' '.join(part for part in [str(x.get('title', '')), x.get('clean_abstract', ''), x.get('clean_full_text', '')] if part),
    axis=1
)

100%|█████████████████████████████████████████████████████████████████████████| 11354/11354 [00:00<00:00, 140528.00it/s]


In [38]:
df.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files,full_text,clean_abstract,clean_full_text,text
0,8qnrcgnk,Heme oxygenase-1 and carbon monoxide in pulmon...,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,PMC,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json,{'introduction': ['The heme oxygenase-1/carbon...,heme oxygenase ho inducible stress protein con...,heme oxygenasecarbon monoxide hoco system rece...,Heme oxygenase-1 and carbon monoxide in pulmon...
1,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json,{'methods': ['Three different groups were stud...,background methods human metapneumovirus hmpv ...,three different groups studied first group con...,Relevance of human metapneumovirus in exacerba...
2,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json,{'methods': ['The study was conducted as a par...,background present study aimed provide informa...,study conducted part omnibus survey december c...,Public awareness of risk factors for cancer am...
3,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json,{'references': []},date least different microrna mirna genes disc...,,Involvement of microRNAs in physiological and ...
4,1h6jz1h5,Plant Plastid Engineering,Genetic material in plants is distributed into...,2010-11-03,PMC,"Wani, Shabir H.; Haider, Nadia; Kumar, Hitesh;...",document_parses/pdf_json/79979652a864cef3a4134...,document_parses/pmc_json/PMC3048312.xml.json,"{'introduction': [""Genetic material in plants ...",genetic material plants distributed nucleus pl...,genetic material plants distributed nucleus ch...,Plant Plastid Engineering genetic material pla...


In [42]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dimensional embeddings

# Generate embeddings in batches to avoid memory issues
batch_size = 100
embeddings = []
for i in range(0, len(df), batch_size):
    batch = df['text'][i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch, show_progress_bar=True)
    embeddings.extend(batch_embeddings)

df['embedding'] = list(embeddings)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [46]:
from transformers import pipeline

# Load QA pipeline (biomedical-focused for better COVID/smoking answers)
qa_model = pipeline(
    "question-answering",
    model="ktrapeznikov/biobert_v1.1_pubmed_squad_v2",
    tokenizer="ktrapeznikov/biobert_v1.1_pubmed_squad_v2"
)

def extract_answer(question, context, max_length=512):
    """
    Extracts precise answers from context using BioBERT QA.
    Handles long documents by chunking.
    """
    # Truncate context to model's max length
    truncated_context = context[:max_length*4]  # Allow some overlap
    
    try:
        result = qa_model(question=question, context=truncated_context)
        return {
            "answer": result["answer"],
            "score": result["score"],
            "start": result["start"],
            "end": result["end"]
        }
    except Exception as e:
        return {"answer": "No answer found", "error": str(e)}

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at ktrapeznikov/biobert_v1.1_pubmed_squad_v2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [47]:
def answer_question(question, n_sources=3):
    # 1. Retrieve relevant documents
    question_embedding = model.encode([clean_text(question)])
    distances, indices = nn.kneighbors(question_embedding, n_neighbors=n_sources)
    
    answers = []
    for i, idx in enumerate(indices[0]):
        paper = df.iloc[idx]
        context = paper['text']
        
        # 2. Extract answer from each relevant paper
        qa_result = extract_answer(question, context)
        
        answers.append({
            "source": paper['title'],
            "answer": qa_result["answer"],
            "confidence": qa_result["score"] * (1 - distances[0][i]),  # Combine QA score and semantic similarity
            "context": context[qa_result["start"]-50:qa_result["end"]+50] if qa_result.get("start") else context[:500]
        })
    
    # Sort by combined confidence
    return sorted(answers, key=lambda x: x["confidence"], reverse=True)

In [49]:
questions = [
    "What is COVID-19?",
    "What is the effect of nicotine on ACE2 receptors?",
    "How does vaping affect lung inflammation in coronavirus cases?"
]

for q in questions:
    print(f"\n\033[1mQuestion:\033[0m {q}")
    results = answer_question(q)
    for i, r in enumerate(results[:2]):  # Show top 2 answers
        print(f"\n\033[1mAnswer {i+1}:\033[0m {r['answer']}")
        print(f"\033[1mSource:\033[0m {r['source']}")
        print(f"\033[1mConfidence:\033[0m {r['confidence']:.2f}")
        print(f"\033[1mContext:\033[0m [...]{r['context']}[...]\n")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



[1mQuestion:[0m What is COVID-19?

[1mAnswer 1:[0m Smoking
[1mSource:[0m COVID-19 and Smoking
[1mConfidence:[0m 0.16
[1mContext:[0m [...]COVID-19 and Smoking[...]


[1mAnswer 2:[0m Smoking
[1mSource:[0m COVID-19 and Smoking
[1mConfidence:[0m 0.16
[1mContext:[0m [...]COVID-19 and Smoking[...]


[1mQuestion:[0m What is the effect of nicotine on ACE2 receptors?

[1mAnswer 1:[0m upregulate
[1mSource:[0m COVID-19 and Nicotine as a Mediator of ACE-2
[1mConfidence:[0m 0.06
[1mContext:[0m [...]-19 and Nicotine as a Mediator of ACE-2 nachr may upregulate ace[...]


[1mAnswer 2:[0m Overexpression
[1mSource:[0m Late Breaking Abstract-ACE2 Overexpression Modulates Nicotine Receptors In Cell Type Specific Manner: Possible Relevance In Covid-19
[1mConfidence:[0m 0.05
[1mContext:[0m [...][...]


[1mQuestion:[0m How does vaping affect lung inflammation in coronavirus cases?

[1mAnswer 1:[0m lung injury
[1mSource:[0m An Epidemic Supplanted by a Pandemic: Vapin

In [None]:
from transformers import pipeline
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

def extract_answer(context, question):
    return qa_pipeline(question=question, context=context)['answer']