In [None]:
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-huggingface bitsandbytes torch spacy
!python -m spacy download en_core_web_lg

In [1]:
# =========================
# 1. Imports and Setup
# =========================

import os
import pandas as pd
import numpy as np
import json
import re
import unicodedata
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from matplotlib import pyplot as plt
import torch

from llama_index.core import Settings, Document, StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.memory import ChatMemoryBuffer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import spacy

In [16]:
# =========================
# 2. Configuration
# =========================

# --- Compute device: prefer the GPU when torch can see one ---
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Device:", device)

# --- Working directory: JSON paths from the metadata are joined onto base_path ---
base_path = "./"
os.chdir(base_path)

# --- NLTK resources ---
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# --- Models, chunking and index storage ---
model_name_embed = "sentence-transformers/all-MiniLM-L6-v2"
model_name_llm = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
chunk_size = 200          # words per chunk (see chunk_text)
persist_dir = "storage"   # directory the vector index is persisted to

# --- Agent mode switch ---
# True  -> llama-index context chat engine (setup_chat_engine + chat)
# False -> manual retrieve-then-complete loop (agent_with_vector_db)
USE_VECTOR_DB = True

Device: cuda


[nltk_data] Downloading package punkt to /home/anton/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anton/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# =========================
# 3. Data Loading & Filtering
# =========================

metadata_path = "metadata.csv"
# Read every column as text so IDs and dates are not coerced to numbers.
metadata = pd.read_csv(metadata_path, dtype=str)

smoking_keywords = [
    "smoking", "smoker", "smoke", "ecigarett", "cigarett", "tobacco", "cigarette", "nicotine",
    "vaping", "vape", "e-cigarette", "cigar", "weed", "marijuana"
]
# Not used for this first filter; the relevance check later in the notebook reads it.
covid_terms = ["covid", "sars-cov-2", "coronavirus"]

# Build the alternation once and escape every keyword so each one is matched
# literally, even if a future keyword contains a regex metacharacter.
smoking_pattern = '|'.join(re.escape(keyword) for keyword in smoking_keywords)

# Case-insensitive substring match on title OR abstract; missing values count as no match.
title_hit = metadata["title"].str.lower().str.contains(smoking_pattern, na=False)
abstract_hit = metadata["abstract"].str.lower().str.contains(smoking_pattern, na=False)

columns_to_keep = ['cord_uid', 'title', 'abstract', 'publish_time', 'source_x', 'authors', 'pdf_json_files', 'pmc_json_files']
filtered_papers = metadata.loc[title_hit | abstract_hit, columns_to_keep].copy()

In [None]:
# =========================
# 4. Text Extraction & Preprocessing
# =========================

def extract_body_text(json_path):
    """Return the body text of one parsed-paper JSON file as a single string.

    The file is expected to hold a JSON object whose optional ``body_text``
    entry is a list of paragraphs, each with a ``text`` field. Paragraphs are
    joined with single spaces; a file without ``body_text`` yields ``""``.

    Args:
        json_path: Path to the JSON file.

    Returns:
        The joined paragraph text, or ``None`` when the file cannot be read,
        is not valid JSON, or does not have the expected structure.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return ' '.join(para['text'] for para in data.get('body_text', []))
    except (OSError, ValueError, KeyError, TypeError, AttributeError):
        # Best effort: unreadable file (OSError), bad JSON / bad encoding
        # (ValueError) or unexpected structure (the rest) all mean "no text".
        return None

def get_full_text(row):
    """Return the body text for one metadata row, or ``None`` if unavailable.

    ``row['pdf_json_files']`` may list several ';'-separated JSON paths
    relative to ``base_path``. Candidates are tried in order and the first one
    that exists on disk AND parses successfully wins.

    Args:
        row: A metadata row exposing a ``pdf_json_files`` entry.

    Returns:
        The extracted body text, or ``None`` when the entry is missing, no
        listed file exists, or none of the existing files can be parsed.
    """
    json_files = row['pdf_json_files']
    if pd.isna(json_files):
        return None

    # Split on ';' and strip, so both "a; b" and "a;b" are handled.
    for json_path in json_files.split(';'):
        json_path = json_path.strip()
        if not json_path:
            continue
        full_path = os.path.join(base_path, json_path)
        if os.path.exists(full_path):
            body_text = extract_body_text(full_path)
            # Keep looking when this file could not be parsed instead of
            # giving up on the paper at the first existing file.
            if body_text is not None:
                return body_text
    return None

# Register `progress_apply` on pandas objects, with a labelled progress bar.
tqdm.pandas(desc="Extracting full text sections")

# One chained pipeline: attach the body text, drop rows missing any of the
# three text fields, then build "<title>. <abstract>. <full_text>".
filtered_papers = (
    filtered_papers
    .assign(full_text=lambda frame: frame.progress_apply(get_full_text, axis=1))
    .dropna(subset=['title', 'abstract', 'full_text'])
    .assign(
        combined_text=lambda frame: (
            frame['title'].fillna('') + '. '
            + frame['abstract'].fillna('') + '. '
            + frame['full_text'].fillna('')
        )
    )
)

def clean_text(text):
    """Normalize raw paper text to lowercase, single-spaced printable ASCII.

    Steps, in order: fold Unicode to its compatibility decomposition and drop
    combining marks (so "café" becomes "cafe" instead of losing the letter),
    strip tag-like ``<...>`` spans, ``$...$`` spans and numeric citation
    markers such as ``[12]`` / ``(3)``, replace any remaining non-printable or
    non-ASCII character with a space, collapse whitespace and lowercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # NFKD splits accented letters into base letter + combining mark; removing
    # the marks keeps the base letter, where the ASCII filter below would
    # otherwise turn the whole accented letter into a space and split the word.
    text = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in text if not unicodedata.combining(ch))
    # Drop code points that cannot be encoded (e.g. lone surrogates).
    text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
    text = re.sub(r"<[^>]+>", " ", text)           # tag-like spans
    text = re.sub(r"\$.*?\$", " ", text)           # $...$ spans (presumably inline math)
    text = re.sub(r"\[\d+\]|\(\d+\)", " ", text)   # numeric citation markers
    text = re.sub(r"[^\x20-\x7E]", " ", text)      # anything outside printable ASCII
    text = re.sub(r"\s+", " ", text)
    text = text.lower().strip()
    return text

filtered_papers['combined_text'] = filtered_papers['combined_text'].apply(clean_text)

# --- Outlier Detection (Text Length) ---
# Plot from `filtered_papers`: the validation frame `df` is only created in a
# later cell (where its 'full_text' column is this same cleaned
# 'combined_text'), so referencing `df` here fails on a fresh kernel.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(filtered_papers['combined_text'].str.len(), bins=50, color='skyblue')
ax.set_title("Distribution of Document Lengths")
ax.set_xlabel("Text Length")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Inclusive bounds, in characters, on the cleaned combined text.
min_length = 200
max_length = 30000

# Store the character count as a column, then keep only in-range documents.
filtered_papers['text_length'] = filtered_papers['combined_text'].str.len()
filtered_papers = filtered_papers.loc[
    filtered_papers['text_length'].between(min_length, max_length)
].copy()

In [None]:
# =========================
# 5. Professional Data Validation
# =========================

# Work on a renamed copy: the extracted article body becomes 'article_text'
# and the cleaned title+abstract+body string takes over the name 'full_text'.
df = filtered_papers.rename(
    columns={'full_text': 'article_text', 'combined_text': 'full_text'}
)

# --- Completeness & Consistency ---
print("Checking for missing values:")
print(df.isna().sum())

print("\nChecking for duplicate titles:")
print(df.duplicated(subset=['title']).sum())

# Keep only the first row for each title
df = df.drop_duplicates(subset='title')

# --- Relevance Validation ---
def is_relevant(text):
    """Tell whether a document mentions both COVID and smoking.

    Matching is a case-insensitive substring test against the module-level
    ``covid_terms`` and ``smoking_keywords`` lists.

    Args:
        text: The document text; anything that is not a string is irrelevant.

    Returns:
        ``True`` when at least one COVID term and at least one smoking keyword
        occur in ``text``, otherwise ``False``.
    """
    if not isinstance(text, str):
        return False
    # Lowercase once instead of once per keyword; documents can be long.
    lowered = text.lower()
    has_covid = any(term in lowered for term in covid_terms)
    has_smoking = any(term in lowered for term in smoking_keywords)
    return has_covid and has_smoking

df['is_relevant'] = df['full_text'].apply(is_relevant)
print(f"Relevant documents: {df['is_relevant'].sum()}/{len(df)}")

# --- Topic Modeling (LDA) ---
from sklearn.decomposition import LatentDirichletAllocation

# LDA is a generative model over word counts, so it is fitted on raw term
# counts (CountVectorizer) rather than on TF-IDF weights.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df.loc[df['is_relevant'], 'full_text'])
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X)
terms = vectorizer.get_feature_names_out()
# Print the ten highest-weighted terms of each topic, strongest first.
for idx, topic in enumerate(lda.components_):
    print(f"Topic #{idx+1}:")
    print([terms[i] for i in topic.argsort()[-10:][::-1]])

# --- Semantic Similarity Validation ---
print("Loading spaCy model...")
nlp = spacy.load("en_core_web_lg")

# Embed the reference query once, as a single-row 2-D array, so it can be
# passed straight to cosine_similarity for every document.
query = "Impact of smoking on COVID-19 severity"
query_doc = nlp(query)
query_vec = query_doc.vector.reshape(1, -1)

def validate_semantic_similarity(text):
    """Cosine similarity between a document and the reference query vector.

    Uses the module-level spaCy pipeline ``nlp`` and the precomputed
    ``query_vec``.

    Args:
        text: The document text; anything that is not a string scores 0.0.

    Returns:
        The cosine similarity as a Python float.
    """
    if not isinstance(text, str):
        return 0.0
    # Tokenize only. Per the spaCy docs, Doc.vector defaults to the average of
    # the token vectors, so running the tagger/parser/NER on every long
    # document adds cost without changing the vector.
    doc = nlp.make_doc(text)
    doc_vec = doc.vector.reshape(1, -1)
    # Convert to a plain float so the column has one consistent type.
    return float(cosine_similarity(query_vec, doc_vec)[0][0])

print("Calculating semantic similarities...")
df['semantic_score'] = df['full_text'].progress_apply(validate_semantic_similarity)

# Show the five titles closest to the reference query.
ranked_by_similarity = df.sort_values('semantic_score', ascending=False)
print(ranked_by_similarity[['title', 'semantic_score']].head())


In [22]:
# =========================
# 6. Agent Preparation
# =========================

def chunk_text(text, chunk_size):
    """Split text into consecutive chunks of at most ``chunk_size`` words.

    Words are separated by any run of whitespace, so repeated spaces and
    newlines never produce empty "words", and empty or whitespace-only text
    yields no chunks at all (rather than a single empty chunk).

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Returns:
        A list of chunks, each a single-space-joined string of words. The last
        chunk may be shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    words = text.split()
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]

def prepare_documents(df, chunk_size, text_column="full_text"):
    """Turn every non-null text in ``df[text_column]`` into chunked Documents.

    Each text is split with ``chunk_text`` and every resulting piece is
    wrapped in its own ``Document``. Progress and the final chunk count are
    printed.

    Args:
        df: Frame holding the texts.
        chunk_size: Words per chunk, forwarded to ``chunk_text``.
        text_column: Name of the column to read.

    Returns:
        A flat list of ``Document`` objects, in row order.
    """
    print("Chunking documents...")
    texts = df[text_column].dropna().values
    chunks = [
        Document(text=piece)
        for body in tqdm(texts)
        for piece in chunk_text(body, chunk_size)
    ]
    print(f"Total chunks: {len(chunks)}")
    return chunks

def build_index(documents, model_name_embed, device, persist_dir):
    """Embed ``documents`` into a VectorStoreIndex and persist it to disk.

    Side effects: sets ``Settings.llm`` to ``None`` and ``Settings.embed_model``
    to a HuggingFace embedding model placed on ``device``. Note that the
    progress message mentions CUDA whatever ``device`` is.

    Args:
        documents: Documents to index.
        model_name_embed: HuggingFace embedding model name.
        device: Device string handed to the embedding model.
        persist_dir: Directory the index is written to.

    Returns:
        The freshly built index.
    """
    print("Building vector index with CUDA embeddings...")
    Settings.llm = None
    embedder = HuggingFaceEmbedding(model_name=model_name_embed, device=device)
    Settings.embed_model = embedder

    # Insert everything as one batch.
    batch_size = len(documents)
    index = VectorStoreIndex.from_documents(
        documents,
        show_progress=True,
        insert_batch_size=batch_size,
    )

    print("Persisting index to disk...")
    storage = index.storage_context
    storage.persist(persist_dir=persist_dir)
    print(f"VectorStoreIndex saved to {persist_dir}.")
    return index

def load_index(persist_dir):
    """Load a previously persisted index from ``persist_dir``.

    Side effect: sets ``Settings.embed_model`` from the module-level
    ``model_name_embed`` and ``device`` before loading.

    Args:
        persist_dir: Directory the index was persisted to.

    Returns:
        The loaded index.
    """
    print(f"Loading index from {persist_dir}...")
    embedder = HuggingFaceEmbedding(model_name=model_name_embed, device=device)
    Settings.embed_model = embedder

    storage = StorageContext.from_defaults(persist_dir=persist_dir)
    restored_index = load_index_from_storage(storage)
    print("Index loaded.")
    return restored_index

def setup_llm(model_name_llm, device_map=None):
    """Create the local HuggingFace LLM and register it as ``Settings.llm``.

    Args:
        model_name_llm: HuggingFace model name, also used for the tokenizer.
        device_map: Device placement handed to the model loader. When omitted,
            the first CUDA device is used if one is available and the CPU
            otherwise, instead of always assuming a GPU.

    Returns:
        The configured LLM (also stored in ``Settings.llm``).
    """
    print("Setting up local LLM...")
    if device_map is None:
        device_map = "cuda:0" if torch.cuda.is_available() else "cpu"
    llm = HuggingFaceLLM(
        model_name=model_name_llm,
        tokenizer_name=model_name_llm,
        context_window=2048,
        max_new_tokens=256,
        device_map=device_map,
        generate_kwargs={"temperature": 0.95, "do_sample": True},
    )
    Settings.llm = llm
    return llm

def setup_chat_engine(index, system_prompt=None):
    """Build a context-mode chat engine on top of ``index``.

    Args:
        index: The index to answer from.
        system_prompt: Optional system prompt; a default medical-chatbot
            prompt is used when omitted.

    Returns:
        The chat engine.
    """
    print("Setting up chat engine...")
    default_prompt = (
        "You are a medical chatbot, able to have normal interactions. "
        "You only answer based on the provided context."
    )
    prompt = default_prompt if system_prompt is None else system_prompt
    # NOTE(review): 32000 is far above the 2048-token context_window that
    # setup_llm configures -- confirm this limit is intended.
    memory = ChatMemoryBuffer.from_defaults(token_limit=32000)
    return index.as_chat_engine(
        chat_mode="context",
        memory=memory,
        system_prompt=prompt,
    )

def chat(chat_engine):
    """Run an interactive question/answer loop, streaming each answer.

    Reads questions from standard input until the user types ``quit`` (case
    and surrounding whitespace are ignored). Blank input is skipped. The
    engine is reset after every answer, so each question is answered without
    the previous turns.

    Args:
        chat_engine: Engine exposing ``stream_chat(query)`` and ``reset()``.
    """
    print("Chatbot is ready! Type your question or 'quit' to exit.")
    llm = Settings.llm
    if hasattr(llm, "model_name"):
        print("Current LLM model:", llm.model_name)
    else:
        print("Current LLM:", type(llm))

    while True:
        # Strip so that " quit" still exits and stray whitespace is not sent on.
        query = input("> ").strip()
        if not query:
            continue
        if query.lower() == "quit":
            break
        print("Agent: ", end="", flush=True)
        response = chat_engine.stream_chat(query)
        for token in response.response_gen:
            print(token, end="", flush=True)
        print()
        # Clear the conversation state before the next question.
        chat_engine.reset()

def agent_with_vector_db(index, llm, system_prompt=None, top_k=5):
    """Run a manual retrieve-then-complete loop on standard input.

    For every question the ``top_k`` most similar nodes are retrieved from
    ``index``, printed, and placed in a prompt that is completed by ``llm``.
    The loop ends when the user types ``quit`` (case and surrounding
    whitespace are ignored); blank input is skipped.

    Args:
        index: Index exposing ``as_retriever(similarity_top_k=...)``.
        llm: Model exposing ``complete(prompt)``.
        system_prompt: Optional instruction placed at the top of the prompt;
            a default medical-chatbot prompt is used when omitted.
        top_k: Number of nodes to retrieve per question.
    """
    if system_prompt is None:
        system_prompt = (
            "You are a medical chatbot. You only answer based on the provided context."
        )
    retriever = index.as_retriever(similarity_top_k=top_k)
    print("Agent running in vector DB mode. Type your question or 'quit' to exit.")
    while True:
        # Strip so that " quit" still exits and blank lines are not sent on.
        query = input("> ").strip()
        if not query:
            continue
        if query.lower() == "quit":
            break
        # 1. Retrieve relevant context
        retrieved_nodes = retriever.retrieve(query)
        retrieved_texts = [node.get_content() for node in retrieved_nodes]
        print("Context:\n", retrieved_texts)
        # 2. Build prompt
        context = "\n\n".join(retrieved_texts)
        prompt = f"{system_prompt}\n\nContext:\n{context}\n\nQuestion: {query}"
        # 3. Get answer from LLM
        response = llm.complete(prompt)
        print("Agent:", response)

def index_exists(persist_dir):
    """Tell whether ``persist_dir`` is a directory containing at least one entry.

    Args:
        persist_dir: Path of the index storage directory.

    Returns:
        ``True`` for a non-empty directory; ``False`` when the path is
        missing, is empty, or is not a directory (e.g. a regular file).
    """
    # isdir (not exists): listing a regular file would raise.
    if not os.path.isdir(persist_dir):
        return False
    with os.scandir(persist_dir) as entries:
        return any(entries)


In [None]:
# =========================
# 7. Agent Execution
# =========================

# Both agent modes answer from the vector index, so obtain it first: load the
# persisted index when there is one, otherwise build (and persist) it.
if index_exists(persist_dir):
    print(f"Index found in '{persist_dir}'. Loading index...")
    index = load_index(persist_dir)
else:
    print(f"No index found in '{persist_dir}'. Building new index...")
    documents = prepare_documents(df[df['is_relevant']], chunk_size)
    # build_index returns the index it just persisted; no need to reload it.
    index = build_index(documents, model_name_embed, device, persist_dir)

# Set up the LLM only after the index step: build_index sets Settings.llm to None.
setup_llm(model_name_llm)

if USE_VECTOR_DB:
    # --- llama-index context chat engine over the index ---
    chat_engine = setup_chat_engine(index)
    chat(chat_engine)
else:
    # --- Manual loop: retrieve top-k chunks from the index, then complete ---
    llm = Settings.llm
    agent_with_vector_db(index, llm)

Index found in 'storage'. Loading index...
Loading index from storage...
Index loaded.
Setting up local LLM...
Setting up chat engine...
Chatbot is ready! Type your question or 'quit' to exit.
Current LLM model: TinyLlama/TinyLlama-1.1B-Chat-v1.0


>  What is the capital of France?


Agent: Yes, I understand. The capital of France is Paris.


>  Does smoking increase the risk of hospitalization for COVID-19 patients?


Agent: Yes, smoking can increase the risk of hospitalization for COVID-19 patients. A systematic review and meta-analysis of hospitalized COVID-19 patients found an average odds ratio of 1.9 for hospitalization for COVID-19 among smokers compared to non-smokers (95% confidence interval [CI]: 1.1-3.5). This means for every 100 people with COVID-19 who smokes, 19 people will experience hospitalization compared to 14 people who do not smoke. The overall hospitalization rate for COVID-19 was 9.2% in the US hospital system, and smoking was associated with an even higher risk of hospitalization, with 13% of hospitalized COVID-19 patients smoking compared to 1% of non-smokers. This suggests that smoking increases the risk of severe COVID-19 illness in hospitalized patients.
