In [1]:
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-huggingface bitsandbytes torch



In [41]:
import os
import pandas as pd
import json

from tqdm import tqdm

import re
import unicodedata

from nltk.corpus import stopwords
import nltk

from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core import StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.core import load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.memory import ChatMemoryBuffer

from matplotlib import pyplot as plt
import torch
print(torch.cuda.is_available())  # Should print True

True


In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Device " + device)
base_path = "./"
pdf_json_dir = 'document_parses/pdf_json'
pmc_json_dir = 'document_parses/pmc_json'
#base_path = "/content/drive/MyDrive/Projektmunka Smoking and COVID19"
os.chdir(base_path)
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

Device cuda


[nltk_data] Downloading package punkt to /home/anton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
metadata_path = "metadata.csv"
metadata = pd.read_csv(metadata_path, dtype=str)

# Define smoking-related keywords (expand as needed)
smoking_keywords = [
    "smoking", "smoker", "smoke", "ecigarett", "cigarett",  "tobacco", "cigarette", "nicotine",
    "vaping", "vape", "e-cigarette", "smoker", "cigar", "weed", "marijuana"
]

# Filter papers where title/abstract contains smoking-related terms
filtered_papers = metadata[
    metadata["title"].str.lower().str.contains('|'.join(smoking_keywords), na=False) |
    metadata["abstract"].str.lower().str.contains('|'.join(smoking_keywords), na=False)
].copy()

print(f"Found {len(filtered_papers)} smoking-related papers")

Found 11354 smoking-related papers


In [8]:
columns_to_keep = ['cord_uid', 'title', 'abstract', 'publish_time', 'source_x', 'authors', 'pdf_json_files', 'pmc_json_files']

filtered_papers = filtered_papers[columns_to_keep]

In [9]:
filtered_papers.head()

Unnamed: 0,cord_uid,title,abstract,publish_time,source_x,authors,pdf_json_files,pmc_json_files
8,8qnrcgnk,Heme oxygenase-1 and carbon monoxide in pulmon...,"Heme oxygenase-1 (HO-1), an inducible stress p...",2003-08-07,PMC,"Slebos, Dirk-Jan; Ryter, Stefan W; Choi, Augus...",document_parses/pdf_json/faaf1022ccfe93b032c56...,document_parses/pmc_json/PMC193681.xml.json
41,qva0jt86,Relevance of human metapneumovirus in exacerba...,BACKGROUND AND METHODS: Human metapneumovirus ...,2005-12-21,PMC,"Rohde, G; Borg, I; Arinir, U; Kronsbein, J; Ra...",document_parses/pdf_json/4ba79e54ecf81b30b5646...,document_parses/pmc_json/PMC1334186.xml.json
43,bnnl700a,Public awareness of risk factors for cancer am...,BACKGROUND: The present study aimed to provide...,2006-01-10,PMC,"Inoue, Manami; Iwasaki, Motoki; Otani, Tetsuya...",document_parses/pdf_json/a78fd1b34372e1e54bf2a...,document_parses/pmc_json/PMC1351169.xml.json
473,ft5wl70x,Involvement of microRNAs in physiological and ...,"To date, at least 900 different microRNA (miRN...",2010-11-23,PMC,"Tomankova, Tereza; Petrek, Martin; Kriegova, Eva",document_parses/pdf_json/b97de55ba907c3b1f3048...,document_parses/pmc_json/PMC3001429.xml.json
507,1h6jz1h5,Plant Plastid Engineering,Genetic material in plants is distributed into...,2010-11-03,PMC,"Wani, Shabir H.; Haider, Nadia; Kumar, Hitesh;...",document_parses/pdf_json/79979652a864cef3a4134...,document_parses/pmc_json/PMC3048312.xml.json


In [10]:
def extract_body_text(json_path):
    """Extract and concatenate all 'text' fields from 'body_text' in a JSON file."""
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
            return ' '.join(para['text'] for para in data.get('body_text', []))
    except Exception as e:
        # Optionally print or log the error
        return None

def get_full_text(row):
    # Try PDF JSON first
    if pd.notna(row['pdf_json_files']):
        for json_path in row['pdf_json_files'].split('; '):
            full_path = os.path.join(base_path, json_path.strip())
            if os.path.exists(full_path):
                return extract_body_text(full_path)
    return None  # Return empty dict if no files found

In [11]:
tqdm.pandas(desc="Extracting full text sections")
filtered_papers['full_text'] = filtered_papers.progress_apply(get_full_text, axis=1)

Extracting full text sections: 100%|████████████████████████████████████████████| 11354/11354 [00:04<00:00, 2561.58it/s]


In [12]:
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11354 entries, 8 to 1056628
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        11354 non-null  object
 1   title           11353 non-null  object
 2   abstract        10625 non-null  object
 3   publish_time    11342 non-null  object
 4   source_x        11354 non-null  object
 5   authors         11285 non-null  object
 6   pdf_json_files  4540 non-null   object
 7   pmc_json_files  3961 non-null   object
 8   full_text       4540 non-null   object
dtypes: object(9)
memory usage: 887.0+ KB


In [13]:
filtered_papers = filtered_papers.dropna(subset=['title', 'abstract', 'full_text'])
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4378 entries, 8 to 1056628
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        4378 non-null   object
 1   title           4378 non-null   object
 2   abstract        4378 non-null   object
 3   publish_time    4378 non-null   object
 4   source_x        4378 non-null   object
 5   authors         4370 non-null   object
 6   pdf_json_files  4378 non-null   object
 7   pmc_json_files  3684 non-null   object
 8   full_text       4378 non-null   object
dtypes: object(9)
memory usage: 342.0+ KB


In [14]:
print(filtered_papers.iloc[0].to_dict())

{'cord_uid': '8qnrcgnk', 'title': 'Heme oxygenase-1 and carbon monoxide in pulmonary medicine', 'abstract': 'Heme oxygenase-1 (HO-1), an inducible stress protein, confers cytoprotection against oxidative stress in vitro and in vivo. In addition to its physiological role in heme degradation, HO-1 may influence a number of cellular processes, including growth, inflammation, and apoptosis. By virtue of anti-inflammatory effects, HO-1 limits tissue damage in response to proinflammatory stimuli and prevents allograft rejection after transplantation. The transcriptional upregulation of HO-1 responds to many agents, such as hypoxia, bacterial lipopolysaccharide, and reactive oxygen/nitrogen species. HO-1 and its constitutively expressed isozyme, heme oxygenase-2, catalyze the rate-limiting step in the conversion of heme to its metabolites, bilirubin IXα, ferrous iron, and carbon monoxide (CO). The mechanisms by which HO-1 provides protection most likely involve its enzymatic reaction products

In [15]:
filtered_papers['combined_text'] = (
    filtered_papers['title'].fillna('') + '. ' +
    filtered_papers['abstract'].fillna('') + '. ' +
    filtered_papers['full_text'].fillna('')
)

# Basic statistics
filtered_papers['text_length'] = filtered_papers['combined_text'].str.len()
print(filtered_papers['text_length'].describe())

# Example anomaly filter: drop if text is too short or too long
min_length = 200   # adjust as needed
max_length = 30000 # adjust as needed
filtered_papers = filtered_papers[
    (filtered_papers['text_length'] >= min_length) &
    (filtered_papers['text_length'] <= max_length)
].copy()

count    4.378000e+03
mean     2.693090e+04
std      2.441819e+04
min      1.227000e+03
25%      1.748800e+04
50%      2.425250e+04
75%      3.197325e+04
max      1.276458e+06
Name: text_length, dtype: float64


In [16]:
filtered_papers.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3077 entries, 41 to 1056343
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   cord_uid        3077 non-null   object
 1   title           3077 non-null   object
 2   abstract        3077 non-null   object
 3   publish_time    3077 non-null   object
 4   source_x        3077 non-null   object
 5   authors         3076 non-null   object
 6   pdf_json_files  3077 non-null   object
 7   pmc_json_files  2538 non-null   object
 8   full_text       3077 non-null   object
 9   combined_text   3077 non-null   object
 10  text_length     3077 non-null   int64 
dtypes: int64(1), object(10)
memory usage: 288.5+ KB


In [17]:
def clean_text(text):
    # Remove non-UTF8 and normalize unicode
    text = unicodedata.normalize("NFKC", text)
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
    # Remove HTML/XML tags
    text = re.sub(r"<[^>]+>", " ", text)
    # Remove LaTeX (very basic)
    text = re.sub(r"\$.*?\$", " ", text)
    # Remove references like [1], (1), etc.
    text = re.sub(r"\[\d+\]|\(\d+\)", " ", text)
    # Remove non-printable characters
    text = re.sub(r"[^\x20-\x7E]", " ", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    # Lowercase for stopword removal
    text = text.lower()
    # # Remove stopwords
    # words = text.split()
    # words = [word for word in words if word not in stop_words]
    # text = " ".join(words)
    # Strip leading/trailing whitespace
    text = text.strip()
    return text

In [32]:
model_name_embed="sentence-transformers/all-MiniLM-L6-v2"
model_name_llm="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
chunk_size=200
persist_dir="storage"

In [33]:
def chunk_text(text, chunk_size):
    words = text.split(" ")
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]

def prepare_documents(df, chunk_size, text_column="combined_text"):
    print("Chunking documents...")
    chunks = []
    for text in tqdm(df[text_column].dropna().values):
        for chunk in chunk_text(text, chunk_size):
            chunks.append(Document(text=chunk))
    print(f"Total chunks: {len(chunks)}")
    return chunks

def build_index(documents, model_name_embed, device, persist_dir):
    print("Building vector index with CUDA embeddings...")
    Settings.llm = None
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=model_name_embed, device=device
    )
    index = VectorStoreIndex.from_documents(
        documents, show_progress=True, insert_batch_size=len(documents)
    )
    print("Persisting index to disk...")
    index.storage_context.persist(persist_dir=persist_dir)
    print(f"VectorStoreIndex saved to {persist_dir}.")
    return index

def load_index(persist_dir):
    print(f"Loading index from {persist_dir}...")
    loaded_storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    index = load_index_from_storage(loaded_storage_context)
    print("Index loaded.")
    return index

def setup_llm(model_name_llm):
    print("Setting up local LLM...")
    llm = HuggingFaceLLM(
        model_name=model_name_llm,
        tokenizer_name=model_name_llm,
        context_window=2048,
        max_new_tokens=256,
        device_map="cuda:0",
        generate_kwargs={"temperature": 0.95, "do_sample": True},
    )
    Settings.llm = llm

def setup_chat_engine(index, system_prompt=None):
    print("Setting up chat engine...")
    if system_prompt is None:
        system_prompt = (
            "You are a medical chatbot, able to have normal interactions. "
            "You only answer based on the Cord19 dataset."
        )
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        memory=ChatMemoryBuffer.from_defaults(token_limit=32000),
        system_prompt=system_prompt,
    )
    return chat_engine

def chat(chat_engine):
    print("Chatbot is ready! Type your question or 'quit' to exit.")
    while True:
        query = input("> ")
        if query.lower() == "quit":
            break
        print("Agent: ", end="", flush=True)
        response = chat_engine.stream_chat(query)
        for token in response.response_gen:
            print(token, end="", flush=True)
        print()
    chat_engine.reset()

In [24]:
# Load your DataFrame (replace with your actual loading code)
# df = pd.read_csv("your_data.csv")
# For demonstration, let's assume df is already loaded and cleaned


# Step 1: Prepare documents (chunking)
documents = prepare_documents(filtered_papers, chunk_size)

Chunking documents...


100%|█████████████████████████████████████████████████████████████████████████████| 3077/3077 [00:01<00:00, 2507.37it/s]

Total chunks: 47447





In [26]:
# Step 2: Build and persist the vector index
build_index(documents, model_name_embed, device, persist_dir)

Building vector index with CUDA embeddings...
LLM is explicitly disabled. Using MockLLM.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Parsing nodes:   0%|          | 0/47447 [00:00<?, ?it/s]

Some nodes are missing content, skipping them...


Generating embeddings:   0%|          | 0/47442 [00:00<?, ?it/s]

Persisting index to disk...
VectorStoreIndex saved to storage.


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7ff02cd0d820>

In [27]:
# Step 3: Load the index (optional, for a new session)
index = load_index(persist_dir)

Loading index from storage...
Index loaded.


In [39]:
# Step 4: Setup the LLM
llm = HuggingFaceLLM(
    model_name=model_name_llm,       # Nyelvi modell beállítása
    tokenizer_name=model_name_llm,   # Nyelvi modell tokenizátorának beállítása
    context_window=2048,                                          # Maximum token limit
    max_new_tokens=256,                                           # Válasz maximális hossza
    device_map="cuda:0",                                          # GPU használata,
    generate_kwargs={"temperature": 0.95, "do_sample": True},     # Ezek a paraméterek befolyásolják a modell válaszainak véletlenszerűségét és kreativitását.
)
Settings.llm = llm

In [42]:
# Step 5: Setup the chat engine
chat_engine = setup_chat_engine(index, system_prompt=None)

Setting up chat engine...


In [None]:
# Step 6: Start chatting
chat(chat_engine)

Chatbot is ready! Type your question or 'quit' to exit.


>  What is covid19?


Agent: Covid-19 is a respiratory illness caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) that emerged in Wuhan, China during the 2019-2020 outbreak. COVID-19 is a respiratory disease, with symptoms including fever, cough, shortness of breath, and possible pneumonia. The disease can lead to severe complications such as respiratory failure and death in severe cases if appropriate treatment is not received.


>  What is the role of carbon monoxide in smoking?


Agent: Carbon monoxide (CO) is an odourless, tasteless, and colorless gas produced from the incomplete combustion of organic compounds (such as carbohydrates, amines, and alcohols). CO is a potent toxic gas that rapidly inhibits the ability of oxygen to enter cells, leading to the breakdown of the mitochondria and causing cellular degradation. The concentration of CO in smokers can be significantly higher than in non-smokers, and their breathing is often characterized by poor flow of air to their 