In [None]:
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-huggingface bitsandbytes torch notebook

In [None]:
import os
import pandas as pd
import json

from tqdm import tqdm

import re
import unicodedata

from nltk.corpus import stopwords
import nltk

from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.memory import ChatMemoryBuffer

from matplotlib import pyplot as plt
import torch
print(torch.cuda.is_available())  # Should print True

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device " + device)
base_path = "./"
pdf_json_dir = 'document_parses/pdf_json'
pmc_json_dir = 'document_parses/pmc_json'
#base_path = "/content/drive/MyDrive/Projektmunka Smoking and COVID19"
os.chdir(base_path)
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
metadata_path = "metadata.csv"
metadata = pd.read_csv(metadata_path, dtype=str)

# Define smoking-related keywords (expand as needed)
smoking_keywords = [
    "smoking", "smoker", "smoke", "ecigarett", "cigarett",  "tobacco", "cigarette", "nicotine",
    "vaping", "vape", "e-cigarette", "smoker", "cigar", "weed", "marijuana"
]

# Filter papers where title/abstract contains smoking-related terms
filtered_papers = metadata[
    metadata["title"].str.lower().str.contains('|'.join(smoking_keywords), na=False) |
    metadata["abstract"].str.lower().str.contains('|'.join(smoking_keywords), na=False)
].copy()

print(f"Found {len(filtered_papers)} smoking-related papers")

In [29]:
columns_to_keep = ['cord_uid', 'title', 'abstract', 'publish_time', 'source_x', 'authors', 'pdf_json_files', 'pmc_json_files']

filtered_papers = filtered_papers[columns_to_keep]

In [30]:
filtered_papers.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files
8,8qnrcgnk,Heme oxygenase-1 and carbon monoxide in pulmon...,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,PMC,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json
41,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json
43,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json
473,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json
507,1h6jz1h5,Plant Plastid Engineering,Genetic material in plants is distributed into...,2010-11-03,PMC,"Wani, Shabir H.; Haider, Nadia; Kumar, Hitesh;...",document_parses/pdf_json/79979652a864cef3a4134...,document_parses/pmc_json/PMC3048312.xml.json


In [31]:
def extract_body_text(json_path):
    """Extract and concatenate all 'text' fields from 'body_text' in a JSON file."""
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
            return ' '.join(para['text'] for para in data.get('body_text', []))
    except Exception as e:
        # Optionally print or log the error
        return None

def get_full_text(row):
    # Try PDF JSON first
    if pd.notna(row['pdf_json_files']):
        for json_path in row['pdf_json_files'].split('; '):
            full_path = os.path.join(base_path, json_path.strip())
            if os.path.exists(full_path):
                return extract_body_text(full_path)
    return None  # Return empty dict if no files found

In [32]:
tqdm.pandas(desc="Extracting full text sections")
filtered_papers['full_text'] = filtered_papers.progress_apply(get_full_text, axis=1)

Extracting full text sections: 100%|██████████| 11354/11354 [00:01<00:00, 6586.70it/s]


In [33]:
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11354 entries, 8 to 1056628
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        11354 non-null  object
 1   title           11353 non-null  object
 2   abstract        10625 non-null  object
 3   publish_time    11342 non-null  object
 4   source_x        11354 non-null  object
 5   authors         11285 non-null  object
 6   pdf_json_files  4540 non-null   object
 7   pmc_json_files  3961 non-null   object
 8   full_text       4540 non-null   object
dtypes: object(9)
memory usage: 887.0+ KB


In [34]:
filtered_papers = filtered_papers.dropna(subset=['title', 'abstract', 'full_text'])
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4378 entries, 8 to 1056628
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        4378 non-null   object
 1   title           4378 non-null   object
 2   abstract        4378 non-null   object
 3   publish_time    4378 non-null   object
 4   source_x        4378 non-null   object
 5   authors         4370 non-null   object
 6   pdf_json_files  4378 non-null   object
 7   pmc_json_files  3684 non-null   object
 8   full_text       4378 non-null   object
dtypes: object(9)
memory usage: 342.0+ KB


In [35]:
print(filtered_papers.iloc[0].to_dict())

{'cord_uid': '8qnrcgnk', 'title': 'Heme oxygenase-1 and carbon monoxide in pulmonary medicine', 'abstract': 'Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products

In [36]:
filtered_papers['combined_text'] = (
    filtered_papers['title'].fillna('') + '. ' +
    filtered_papers['abstract'].fillna('') + '. ' +
    filtered_papers['full_text'].fillna('')
)

# Basic statistics
filtered_papers['text_length'] = filtered_papers['combined_text'].str.len()
print(filtered_papers['text_length'].describe())

# Example anomaly filter: drop if text is too short or too long
min_length = 200   # adjust as needed
max_length = 30000 # adjust as needed
filtered_papers = filtered_papers[
    (filtered_papers['text_length'] >= min_length) &
    (filtered_papers['text_length'] <= max_length)
].copy()

count    4.378000e+03
mean     2.693090e+04
std      2.441819e+04
min      1.227000e+03
25%      1.748800e+04
50%      2.425250e+04
75%      3.197325e+04
max      1.276458e+06
Name: text_length, dtype: float64


In [37]:
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3077 entries, 41 to 1056343
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        3077 non-null   object
 1   title           3077 non-null   object
 2   abstract        3077 non-null   object
 3   publish_time    3077 non-null   object
 4   source_x        3077 non-null   object
 5   authors         3076 non-null   object
 6   pdf_json_files  3077 non-null   object
 7   pmc_json_files  2538 non-null   object
 8   full_text       3077 non-null   object
 9   combined_text   3077 non-null   object
 10  text_length     3077 non-null   int64 
dtypes: int64(1), object(10)
memory usage: 288.5+ KB


In [38]:
with open('extracted_texts.json', 'r') as f:
    data = json.load(f)

In [39]:
data

{'s11739-020-02355-7.pdf': 'Vol.:(0123456789)\n1 3\nInternal and Emergency Medicine (2020) 15:845–852 \nhttps://doi.org/10.1007/s11739-020-02355-7\nCE-SYSTEMATIC REVIEWS AND\xa0META-ANALYSIS\nSystematic review of\xa0the\xa0prevalence of\xa0current smoking \namong\xa0hospitalized COVID‑19 patients in\xa0China: could nicotine be \na\xa0therapeutic option?\nKonstantinos\xa0Farsalinos1\u200a \xa0· Anastasia\xa0Barbouni1\xa0· Raymond\xa0Niaura2\nReceived: 5 April 2020 / Accepted: 23 April 2020 / Published online: 9 May 2020 \n© Società Italiana di Medicina Interna (SIMI) 2020\nAbstract\nThe effects of smoking on Corona Virus Disease 2019 (COVID-19) are currently unknown. The purpose of this study was to \nsystematically examine the prevalence of current smoking among hospitalized patients with COVID-19 in China, considering \nthe high-population smoking prevalence in China (26.6%). A systematic review of the literature (PubMed) was performed \non April 1. Thirteen studies examining the clin

In [40]:
# Print the structure (keys and a sample value) of the loaded JSON data
print("Top-level keys:", list(data.keys()))
first_key = next(iter(data))
print(f"Sample value for key '{first_key}':\n", data[first_key][:500])  # Print first 500 chars for brevity

Top-level keys: ['s11739-020-02355-7.pdf', 'association_between_smoking_and_covid_19_severity_.39.pdf', 'bogdański-et-al-2020-smoking-vaping-and-tobacco-industry-during-covid-19-pandemic-twitter-data-analysis.pdf', 'smoking.pdf', 'fphys-12-603850.pdf', 'TID-19-09.pdf', 'PJMS-36-S104.pdf', '1-s2.0-S1054139X20303992-main.pdf', 's10900-020-00880-2.pdf', 'main (2).pdf', '1-s2.0-S0376871621006396-main.pdf', 'jcvtr-12-136.pdf', '30-Articolo-97-2-10-20210907.pdf', 'TID-18-20.pdf', 's12889-021-11579-x.pdf', 'TID-18-63.pdf', 'file.pdf', 'hpr-8-1-9124.pdf', 'Journal of Medical Virology - 2020 - Reddy - The effect of smoking on COVID‐19 severity  A systematic review and.pdf', 'beliefs_about_the_effects_of_smoking_on_corona.2.pdf', 'ntab112.pdf', 'ERJ-01290-2020.pdf', 'Addiction - 2020 - Tattan‐Birch - COVID‐19  smoking  vaping and quitting  a representative population survey in England.pdf', 'IJPH-50-431.pdf', 'journal.pone.0238552.pdf', 'TID-21-77.pdf', 'main (3).pdf', 'haddad-et-al-2021-smoking

In [47]:
filtered_papers

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files,full_text,combined_text,text_length
41,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json,Respiratory viruses play an important role in ...,Relevance of human metapneumovirus in exacerba...,12379
43,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json,"In Japan, cancer has been recognized as a majo...",Public awareness of risk factors for cancer am...,13759
473,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json,"A. miRNA definition, biology and function Disc...",Involvement of microRNAs in physiological and ...,22583
1019,5is9kc52,First Discovery and Stucture-Activity Relation...,A series of phenanthroquinolizidine alkaloids ...,2012-12-28,PMC,"Wang, Ziwen; Feng, Anzheng; Cui, Mingbo; Liu, ...",document_parses/pdf_json/a4ffcadecc4b60c30df8f...,document_parses/pmc_json/PMC3532156.xml.json,Plant viruses cause numerous diseases in a wid...,First Discovery and Stucture-Activity Relation...,29796
1416,ww9k0cf0,Respiratory Syncytial Virus in Hematopoietic C...,Background. Respiratory syncytial virus (RSV) ...,2013-12-23,PMC,"Kim, Yae-Jean; Guthrie, Katherine A.; Waghmare...",document_parses/pdf_json/b7aed141810c0294eae23...,document_parses/pmc_json/PMC3969549.xml.json,Infection caused by respiratory viruses is a t...,Respiratory Syncytial Virus in Hematopoietic C...,24511
...,...,...,...,...,...,...,...,...,...,...,...
1055319,xqw9nir5,Seroprevalence of Anti-SARS-CoV-2 Antibodies a...,Background: Some studies have assessed the ser...,2021-11-28,Medline; PMC; WHO,"Gashi, Bujar; Osmani, Vesa; Halili, Rrezart; H...",document_parses/pdf_json/122977193948ea3da197d...,document_parses/pmc_json/PMC8656675.xml.json,Kosovo was among the last-hit countries in the...,Seroprevalence of Anti-SARS-CoV-2 Antibodies a...,26966
1056016,o5zyw0ug,Effects of host genetic variations on response...,The recent outbreak of the severe acute respir...,2020-05-29,Elsevier; Medline; PMC,"Ghafouri-Fard, Soudeh; Noroozi, Rezvan; Vafaee...",document_parses/pdf_json/44ad5b23b316b3c2f85c0...,document_parses/pmc_json/PMC7258806.xml.json,The recent pandemic of the severe acute respir...,Effects of host genetic variations on response...,15353
1056063,gxui4jy6,Negative Impact of Comorbidity on Health-Relat...,"BACKGROUND: Comorbidity, along with aging, aff...",2022-05-06,Medline; PMC,"Pham, Thu T. M.; Vu, Manh-Tan; Luong, Thuc C.;...",document_parses/pdf_json/bef391a9d9e9bfad71ab2...,document_parses/pmc_json/PMC9121115.xml.json,"Stroke, a common non-communicable disease (NCD...",Negative Impact of Comorbidity on Health-Relat...,19599
1056201,w0mu7y98,"A content analysis of the aims, strategies, an...",BACKGROUND: Placing limitations on advertising...,2021-09-15,Medline; PMC; WHO,"Nanchahal, Kiran; Vasiljevic, Milica; Petticre...",document_parses/pdf_json/7c023057d8979cff0dfce...,document_parses/pmc_json/PMC8976544.xml.json,The obesogenic food environment is implicated ...,"A content analysis of the aims, strategies, an...",19410


In [45]:
li=[]
for key in list(data.keys()):
    li.append({"combined_text":data[key]})
li

[{'combined_text': 'Vol.:(0123456789)\n1 3\nInternal and Emergency Medicine (2020) 15:845–852 \nhttps://doi.org/10.1007/s11739-020-02355-7\nCE-SYSTEMATIC REVIEWS AND\xa0META-ANALYSIS\nSystematic review of\xa0the\xa0prevalence of\xa0current smoking \namong\xa0hospitalized COVID‑19 patients in\xa0China: could nicotine be \na\xa0therapeutic option?\nKonstantinos\xa0Farsalinos1\u200a \xa0· Anastasia\xa0Barbouni1\xa0· Raymond\xa0Niaura2\nReceived: 5 April 2020 / Accepted: 23 April 2020 / Published online: 9 May 2020 \n© Società Italiana di Medicina Interna (SIMI) 2020\nAbstract\nThe effects of smoking on Corona Virus Disease 2019 (COVID-19) are currently unknown. The purpose of this study was to \nsystematically examine the prevalence of current smoking among hospitalized patients with COVID-19 in China, considering \nthe high-population smoking prevalence in China (26.6%). A systematic review of the literature (PubMed) was performed \non April 1. Thirteen studies examining the clinical cha

In [51]:
import pandas as pd

# Convert li to a DataFrame
new_rows = pd.DataFrame(li)

# Concatenate with filtered_papers
filtered_papers = pd.concat([filtered_papers, new_rows], ignore_index=True)

In [53]:
filtered_papers.tail(2)

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files,full_text,combined_text,text_length
3249,,,,,,,,,,ARTICLE\nSmoking is signiﬁcantly associated wi...,
3250,,,,,,,,,,Open Peer Review on Qeios\nSmoking and COVID-1...,


In [54]:
def clean_text(text):
    # Remove non-UTF8 and normalize unicode
    text = unicodedata.normalize("NFKC", text)
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
    # Remove HTML/XML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Remove LaTeX (very basic)
    text = re.sub(r"\$.*?\$", " ", text)
    # Remove references like [1], (1), etc.
    text = re.sub(r"\[\d+\]|\(\d+\)", " ", text)
    # Remove non-printable characters
    text = re.sub(r"[^\x20-\x7E]", " ", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    # Lowercase for stopword removal
    text = text.lower()
    # # Remove stopwords
    # words = text.split()
    # words = [word for word in words if word not in stop_words]
    # text = " ".join(words)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

In [55]:
model_name_embed="sentence-transformers/all-MiniLM-L6-v2"
model_name_llm="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
chunk_size=200
persist_dir="storage"

In [57]:
def chunk_text(text, chunk_size):
    words = text.split(" ")
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

def prepare_documents(df, chunk_size, text_column="combined_text"):
    print("Chunking documents...")
    chunks = []
    for text in tqdm(df[text_column].dropna().values):
        for chunk in chunk_text(text, chunk_size):
            chunks.append(Document(text=chunk))
    print(f"Total chunks: {len(chunks)}")
    return chunks

def build_index(documents, model_name_embed, device, persist_dir):
    print("Building vector index with CUDA embeddings...")
    Settings.llm = None
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=model_name_embed, device=device
    )
    index = VectorStoreIndex.from_documents(
        documents, show_progress=True, insert_batch_size=len(documents)
    )
    print("Persisting index to disk...")
    index.storage_context.persist(persist_dir=persist_dir)
    print(f"VectorStoreIndex saved to {persist_dir}.")
    return index

def load_index(persist_dir):
    print(f"Loading index from {persist_dir}...")
    loaded_storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    index = load_index_from_storage(loaded_storage_context)
    print("Index loaded.")
    return index

def setup_llm(model_name_llm):
    print("Setting up local LLM...")
    llm = HuggingFaceLLM(
        model_name=model_name_llm,
        tokenizer_name=model_name_llm,
        context_window=2048,
        max_new_tokens=256,
        device_map="cuda:0",
        generate_kwargs={"temperature": 0.95, "do_sample": True},
    )
    Settings.llm = llm

def setup_chat_engine(index, system_prompt=None):
    print("Setting up chat engine...")
    if system_prompt is None:
        system_prompt = (
            "You are a medical chatbot, able to have normal interactions. "
            "You only answer based on the Cord19 dataset."
        )
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        memory=ChatMemoryBuffer.from_defaults(token_limit=32000),
        system_prompt=system_prompt,
    )
    return chat_engine

def chat(chat_engine):
    print("Chatbot is ready! Type your question or 'quit' to exit.")
    while True:
        query = input("> ")
        if query.lower() == "quit":
            break
        print("Agent: ", end="", flush=True)
        response = chat_engine.stream_chat(query)
        for token in response.response_gen:
            print(token, end="", flush=True)
        print()
    chat_engine.reset()

In [58]:
# Load your DataFrame (replace with your actual loading code)
# df = pd.read_csv("your_data.csv")
# For demonstration, let's assume df is already loaded and cleaned


# Step 1: Prepare documents (chunking)
documents = prepare_documents(filtered_papers, chunk_size)

Chunking documents...


100%|██████████| 3251/3251 [00:00<00:00, 3767.19it/s]

Total chunks: 51615





In [59]:
# Step 2: Build and persist the vector index
build_index(documents, model_name_embed, device, persist_dir)

Building vector index with CUDA embeddings...
LLM is explicitly disabled. Using MockLLM.


Parsing nodes: 100%|██████████| 51615/51615 [00:06<00:00, 7549.11it/s]


Some nodes are missing content, skipping them...


Generating embeddings: 100%|██████████| 51615/51615 [01:23<00:00, 614.87it/s]
Generating embeddings: 100%|██████████| 45/45 [00:00<00:00, 560.17it/s]


Persisting index to disk...
VectorStoreIndex saved to storage.


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7c81fe156b90>

In [60]:
# Step 3: Load the index (optional, for a new session)
index = load_index(persist_dir)

Loading index from storage...
Index loaded.


In [61]:
# Step 4: Setup the LLM
llm = HuggingFaceLLM(
    model_name=model_name_llm,       # Nyelvi modell beállítása
    tokenizer_name=model_name_llm,   # Nyelvi modell tokenizátorának beállítása
    context_window=2048,                                          # Maximum token limit
    max_new_tokens=256,                                           # Válasz maximális hossza
    device_map="cuda:0",                                          # GPU használata,
    generate_kwargs={"temperature": 0.95, "do_sample": True},     # Ezek a paraméterek befolyásolják a modell válaszainak véletlenszerűségét és kreativitását.
)
Settings.llm = llm

In [None]:
# Step 5: Setup the chat engine
chat_engine = setup_chat_engine(index, system_prompt=None)

In [None]:
# Step 6: Start chatting
chat(chat_engine)