Preprocessing Data

In [None]:
import json
import pandas as pd
from transformers import AutoTokenizer
from nltk.tokenize import sent_tokenize
from typing import List, Dict
import nltk
nltk.download('punkt')

class SmartChunker:
    """Split text into chunks that respect a token budget.

    Sentences are packed greedily into chunks. A sentence whose own token
    count exceeds the budget is broken up on whitespace by
    ``split_long_sentence``.
    """

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", max_tokens=384):
        """Load the tokenizer and remember the per-chunk budget.

        Args:
            model_name: Identifier handed to ``AutoTokenizer.from_pretrained``.
            max_tokens: Upper bound used when deciding where a chunk ends.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_tokens = max_tokens

    def count_tokens(self, text: str) -> int:
        """Return the number of ``input_ids`` the tokenizer yields for *text*.

        The tokenizer is called with ``truncation=True, max_length=512``, so
        the result is capped at 512. Special tokens are not disabled in the
        call, so any the tokenizer adds by default are part of the count.
        """
        return len(self.tokenizer(text, truncation=True, max_length=512)['input_ids'])

    def split_long_sentence(self, sentence: str) -> List[str]:
        """Break an over-long sentence into whitespace-delimited pieces.

        Words are packed greedily so that each piece's content tokens plus
        one set of special tokens stay within ``max_tokens``. A single word
        that is larger than the budget is emitted as its own piece.

        Args:
            sentence: The sentence to split.

        Returns:
            The pieces in order; an empty list for an empty/blank sentence.
        """
        # Tokenising the empty string yields only the special tokens, which
        # gives the fixed per-call overhead. Counting each word on its own
        # would otherwise charge that overhead once per word and close the
        # pieces far below the budget.
        overhead = self.count_tokens('')
        budget = self.max_tokens - overhead

        pieces = []
        pending_words = []
        pending_tokens = 0

        for word in sentence.split():
            # Assumes per-word counts add up to the count of the joined text
            # (true for whitespace-pre-tokenising models) - TODO confirm for
            # other tokenizer families.
            word_tokens = max(self.count_tokens(word) - overhead, 0)
            if pending_words and pending_tokens + word_tokens > budget:
                pieces.append(' '.join(pending_words))
                pending_words = []
                pending_tokens = 0
            pending_words.append(word)
            pending_tokens += word_tokens

        if pending_words:
            pieces.append(' '.join(pending_words))
        return pieces

    def create_smart_chunks(self, text: str) -> List[str]:
        """Split *text* into sentence-aligned chunks within ``max_tokens``.

        Args:
            text: Raw text; a falsy value produces no chunks.

        Returns:
            Chunk strings in document order. Sentences inside one chunk are
            joined with a single space.
        """
        if not text:
            return []

        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        current_token_count = 0

        for sentence in sentences:
            # Per-sentence counts include special tokens, so the running
            # total slightly over-estimates the joined chunk; this errs on
            # the side of smaller chunks.
            sentence_tokens = self.count_tokens(sentence)

            if sentence_tokens > self.max_tokens:
                # Flush what has been collected so far, then emit the long
                # sentence as one or more stand-alone pieces.
                if current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_token_count = 0

                chunks.extend(self.split_long_sentence(sentence))
                continue

            if current_token_count + sentence_tokens > self.max_tokens:
                chunks.append(' '.join(current_chunk))
                current_chunk = [sentence]
                current_token_count = sentence_tokens
            else:
                current_chunk.append(sentence)
                current_token_count += sentence_tokens

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks


class PMCProcessor:
    """Turn a raw PMC article dict into chunked sections plus metadata."""

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", max_tokens=384):
        """Create the chunker that every section is passed through."""
        self.chunker = SmartChunker(model_name, max_tokens)

    def process_article(self, article: Dict) -> Dict:
        """Chunk the title, abstract and full text of one article.

        Args:
            article: Mapping that must contain ``'pmcid'``; ``'title'``,
                ``'abstract'``, ``'full_text'``, ``'authors'``, ``'journal'``
                and ``'doi'`` are optional.

        Returns:
            A dict with the keys ``'pmcid'``, ``'chunks'`` and ``'metadata'``.

        Raises:
            KeyError: If ``article`` has no ``'pmcid'`` entry.
        """
        # Look up the mandatory id first so a malformed record fails before
        # any chunking work is done.
        pmcid = article['pmcid']

        chunked_sections = {}
        for section in ('title', 'abstract', 'full_text'):
            chunked_sections[section] = self.chunker.create_smart_chunks(
                article.get(section, '')
            )

        metadata = {
            'authors': article.get('authors', []),
            'journal': article.get('journal', ''),
            'doi': article.get('doi', ''),
        }
        return {'pmcid': pmcid, 'chunks': chunked_sections, 'metadata': metadata}

def validate_chunks(
    processed_articles: List[Dict],
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
    max_tokens: int = 384,
    tokenizer=None,
) -> pd.DataFrame:
    """Measure every chunk and report whether it fits the token limit.

    Args:
        processed_articles: Dicts with a ``'pmcid'`` and a ``'chunks'``
            mapping of section name to list of chunk strings.
        model_name: Tokenizer to load when *tokenizer* is not supplied.
        max_tokens: Limit a chunk must not exceed to count as
            ``within_limit``.
        tokenizer: Optional object exposing ``encode(text)``; lets callers
            reuse an already loaded tokenizer instead of loading one per call.

    Returns:
        One row per chunk with the columns ``pmcid``, ``section``,
        ``chunk_id``, ``token_count``, ``within_limit`` and
        ``chunk_preview``. The columns are present even when there are no
        chunks, so downstream column access keeps working.
    """
    columns = ['pmcid', 'section', 'chunk_id', 'token_count', 'within_limit', 'chunk_preview']
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    validation_data = []
    for article in processed_articles:
        pmcid = article['pmcid']
        for section, chunks in article['chunks'].items():
            for i, chunk in enumerate(chunks):
                token_count = len(tokenizer.encode(chunk))
                validation_data.append({
                    'pmcid': pmcid,
                    'section': section,
                    'chunk_id': i,
                    'token_count': token_count,
                    'within_limit': token_count <= max_tokens,
                    # Only the first 100 characters are kept; the full chunk
                    # text is not part of this report.
                    'chunk_preview': chunk[:100] + '...' if len(chunk) > 100 else chunk,
                })

    return pd.DataFrame(validation_data, columns=columns)

def analyze_chunks(df: pd.DataFrame):
    """Print summary statistics for a chunk report and export it to CSV.

    Args:
        df: Frame with at least the columns ``token_count``, ``pmcid`` and
            ``section``.

    Returns:
        The same frame that was passed in.
    """
    token_counts = df['token_count']

    print("\n=== Chunking Analysis Summary ===")
    overview = pd.Series({
        'Total Chunks': len(df),
        'Average Tokens': token_counts.mean(),
        'Max Tokens': token_counts.max(),
        'Chunks Near Limit (>350)': len(df[token_counts > 350]),
        'Articles Processed': df['pmcid'].nunique(),
    })
    print(overview)

    print("\n=== Token Distribution by Section ===")
    per_section = df.groupby('section')['token_count'].describe()
    print(per_section)

    # Persist the report in the current working directory.
    df.to_csv('chunk_analysis.csv', index=False)

    return df
def debug_chunks(processed_articles):
    """Print, per article, how many full-text chunks exist and a short preview."""
    for entry in processed_articles:
        print(f"\nPMCID: {entry['pmcid']}")
        body_chunks = entry['chunks']['full_text']
        print(f"Number of full text chunks: {len(body_chunks)}")
        if len(body_chunks) == 0:
            continue
        # Show at most the first 100 characters of the leading chunk.
        print(f"First chunk preview: {body_chunks[0][:100]}...")
def main(input_file: str):
    """Run the preprocessing pipeline on a JSON file of articles.

    Loads the articles, chunks each one, prints debug and summary output,
    and writes the chunk report via ``analyze_chunks``.

    Args:
        input_file: Path to a JSON file whose content supports ``len`` and
            iteration over article dicts.

    Returns:
        A ``(report_dataframe, processed_articles)`` tuple.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)
    print(f"Number of articles loaded: {len(articles)}")

    # Process articles
    processor = PMCProcessor()
    processed_articles = [processor.process_article(article) for article in articles]
    debug_chunks(processed_articles)

    # Analyze results
    analysis_df = validate_chunks(processed_articles)
    analyzed_df = analyze_chunks(analysis_df)
    print("\nResults saved to chunk_analysis.csv")
    return analyzed_df, processed_articles

if __name__ == "__main__":
    # Example usage: run the preprocessing pipeline on the article dump that
    # is expected in the current working directory.
    input_file = "pmc_articles.json"
    # `df` is the chunk report, `processed_data` the chunked articles.
    df, processed_data = main(input_file)


# **Creating Embeddings**



In [None]:
# Cell 1 - Load chunks from CSV using basic Python
import csv
import numpy as np
from sentence_transformers import SentenceTransformer
import os
import pickle

def load_chunks_from_csv(csv_file: str = 'remaining_rows.csv'):
    """Read every row of a CSV file into a list of dicts.

    Args:
        csv_file: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # The csv module expects files opened with newline=''; without it,
    # line breaks embedded in quoted fields are rewritten while reading.
    with open(csv_file, 'r', encoding='utf-8', newline='') as f:
        chunks_data = list(csv.DictReader(f))
    print(f"Loaded {len(chunks_data)} chunks from CSV")
    return chunks_data

class EmbeddingsManager:
    """Encode chunk rows with a sentence-transformer and persist the result."""

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        """Load the embedding model named *model_name*."""
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def generate_and_save(self, chunks_data: list, save_dir: str):
        """Embed every row's ``chunk_preview`` and pickle the package.

        Args:
            chunks_data: Row dicts, each with a ``'chunk_preview'`` entry.
            save_dir: Directory for the output; created when missing.

        Returns:
            Whatever ``self.model.encode`` returned for the previews.
        """
        os.makedirs(save_dir, exist_ok=True)

        # NOTE(review): the text that gets embedded is `chunk_preview`. If the
        # CSV comes from validate_chunks, that field holds only the first 100
        # characters of a chunk - confirm this is intended.
        previews = []
        for row in chunks_data:
            previews.append(row['chunk_preview'])
        print(f"Generating embeddings for {len(previews)} chunks...")

        encode_options = dict(
            batch_size=32,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        vectors = self.model.encode(previews, **encode_options)

        # Embeddings and their source rows are stored together in one file.
        package_path = os.path.join(save_dir, 'embeddings_package.pkl')
        with open(package_path, 'wb') as handle:
            pickle.dump({'embeddings': vectors, 'chunks_data': chunks_data}, handle)

        print(f"Saved embeddings package to: {package_path}")
        return vectors

# Run the pipeline: load the chunk rows from the default CSV, load the
# default embedding model, then encode the rows and write the pickle package
# into ./embeddings_data.
chunks_data = load_chunks_from_csv()
embeddings_manager = EmbeddingsManager()
embeddings = embeddings_manager.generate_and_save(chunks_data, "embeddings_data")


Loaded 11363 chunks from CSV
Generating embeddings for 11363 chunks...


Batches:   0%|          | 0/356 [00:00<?, ?it/s]

Saved embeddings package to: embeddings_data/embeddings_package.pkl


In [None]:
# Sanity check: reload the package written above and inspect its contents.
# NOTE: pickle.load can execute arbitrary code - only open files you produced.
with open('embeddings_data/embeddings_package.pkl', 'rb') as f:
    data = pickle.load(f)

# The embeddings and the chunk rows are expected to have matching lengths.
print("Embeddings shape:", data['embeddings'].shape)
print("Number of chunks:", len(data['chunks_data']))
print("\nSample chunk metadata:", list(data['chunks_data'][0].keys()))


Embeddings shape: (11363, 768)
Number of chunks: 11363

Sample chunk metadata: ['pmcid', 'section', 'chunk_id', 'token_count', 'within_limit', 'chunk_preview']


# Performing semantic search and preparing LLM-ready **text**

In [None]:
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import json
import os

class SemanticSearcher:
    """Rank stored chunk embeddings against a query and format the hits."""

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2", max_tokens=2000):
        """Load the model, the embeddings package and the original articles.

        Args:
            model_name: Sentence-transformer used to embed queries.
            max_tokens: Budget used by ``truncate_text``. It is measured in
                whitespace-separated words, not model tokens.
        """
        self.model = SentenceTransformer(model_name)
        self.embeddings, self.chunks_data = self.load_search_data()
        self.original_texts = self.load_original_texts()
        self.max_tokens = max_tokens

    def load_search_data(self, pkl_path='embeddings_data/embeddings_package.pkl'):
        """Return ``(embeddings, chunks_data)`` from the pickled package."""
        # NOTE: pickle.load can execute arbitrary code; only open packages
        # that were produced locally.
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        return data['embeddings'], data['chunks_data']

    def load_original_texts(self):
        """Return the articles from ``pmc_articles.json`` keyed by pmcid."""
        with open('pmc_articles.json', 'r', encoding='utf-8') as f:
            articles = json.load(f)
        return {article['pmcid']: article for article in articles}

    def get_full_text(self, chunk):
        """Return the article text for the section the chunk belongs to.

        Falls back to the chunk's own preview when the article is unknown.
        """
        # NOTE(review): this returns the whole section, not the matched
        # chunk, so a 'full_text' hit yields the start of the article after
        # truncation - confirm that is the intended context.
        # NOTE(review): the lookup requires chunk['pmcid'] and the JSON pmcid
        # to have the same type - verify against the data.
        article = self.original_texts.get(chunk['pmcid'])
        if article:
            if chunk['section'] == 'title':
                return article.get('title', '')
            elif chunk['section'] == 'abstract':
                return article.get('abstract', '')
            else:
                return article.get('full_text', '')
        return chunk['chunk_preview']

    def truncate_text(self, text: str) -> str:
        """Keep the leading sentences of *text* that fit in ``max_tokens`` words.

        Sentences are cut only where ``.``, ``!`` or ``?`` is followed by
        whitespace, so DOIs, e-mail addresses and decimals stay intact and
        no punctuation is invented. If not even the first sentence fits, the
        first ``max_tokens`` words are returned instead of an empty string.

        Args:
            text: Text to shorten; ``None`` or blank input yields ``''``.

        Returns:
            The shortened text with sentences joined by single spaces.
        """
        import re  # local import keeps this method self-contained

        stripped = text.strip() if text else ''
        if not stripped:
            return ''

        limit = max(self.max_tokens, 0)
        kept = []
        used = 0
        for sentence in re.split(r'(?<=[.!?])\s+', stripped):
            used += len(sentence.split())
            if used > limit:
                break
            kept.append(sentence)

        if not kept:
            return ' '.join(stripped.split()[:limit])
        return ' '.join(kept)

    def search(self, query: str, top_k: int = 5):
        """Return the *top_k* best-scoring chunks for *query*.

        Args:
            query: Free-text query.
            top_k: Number of hits; values below 1 yield an empty list.

        Returns:
            Dicts with ``score``, ``pmcid``, ``section``, ``text`` and
            ``is_truncated``, ordered from highest to lowest score.
        """
        # A slice of [-0:] would select every row, so handle this explicitly.
        if top_k <= 0 or len(self.chunks_data) == 0:
            return []

        query_embedding = self.model.encode(query, normalize_embeddings=True)
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            chunk = self.chunks_data[idx]
            full_text = self.get_full_text(chunk) or ''
            truncated_text = self.truncate_text(full_text)

            results.append({
                'score': float(similarities[idx]),
                'pmcid': chunk['pmcid'],
                'section': chunk['section'],
                'text': truncated_text,
                # Compare word counts: character lengths also change when
                # whitespace is normalised.
                'is_truncated': len(truncated_text.split()) < len(full_text.split())
            })
        return results

    def prepare_for_llm(self, results: List[Dict]) -> str:
        """Render search results as one source block per hit."""
        context = []
        for result in results:
            context.append(f"""
Source: {result['pmcid']} ({result['section']})
Relevance: {result['score']:.2f}
Content: {result['text']}
---""")
        return "\n".join(context)

def display_results(searcher: SemanticSearcher, query: str, top_k: int = 5):
    """Print the query banner followed by the LLM-ready context for its hits."""
    banner = "=" * 100
    print(f"\nSearch Query: {query}")
    print(banner)

    # Search first, then let the searcher render its own hits.
    hits = searcher.search(query, top_k=top_k)
    print(searcher.prepare_for_llm(hits))

# Example usage
def main():
    """Run a few sample queries through a searcher and print the results."""
    searcher = SemanticSearcher(max_tokens=1000)

    # Sample questions used to eyeball retrieval quality.
    for query in (
        "What are the main symptoms of COVID-19?",
        "How does the virus spread?",
        "What are the treatment options available?",
    ):
        display_results(searcher, query)

# Run the sample searches when this cell/script is executed directly.
if __name__ == "__main__":
    main()



Search Query: What are the main symptoms of COVID-19?

Source: 11657336 (title)
Relevance: 0.52
Content: Impact-of-COVID-19 on mortality and implications for adolescent and young-adult healthcare.
---

Source: 11669564 (full_text)
Relevance: 0.51
Content: pmc Risk Anal Risk Anal 10. 1111/(ISSN)1539-6924 RISA Risk Analysis 0272-4332 1539-6924 John Wiley and Sons Inc. Hoboken 37660243 11669564 10. 1111/risa. 14213 RISA14213 Original Article Original Article The foundations of influencing policy and practice: How risk science discourse shaped government action during COVID‐19 SHAW and SCULLY INFLUENCING POLICY AND PRACTICE Shaw Duncan 1 2 duncan. shaw-2@manchester. ac. uk Scully Judy 1 1 Alliance Manchester Business School The University of Manchester Manchester UK 2 Humanitarian and Conflict Response Institute The University of Manchester Manchester UK * Correspondence Duncan Shaw, Alliance Manchester Business School, Booth Street West, The University of Manchester, Manchester M15 6 PB,

In [None]:
from typing import List, Dict
from sentence_transformers import SentenceTransformer
import numpy as np
import pickle
import json
import os
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

class SemanticSearcher:
    """Rank stored chunk embeddings against a query and build an LLM prompt."""

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2", max_tokens=2000):
        """Load the model, the embeddings package and the original articles.

        Args:
            model_name: Sentence-transformer used to embed queries.
            max_tokens: Budget used by ``truncate_text``. It is measured in
                whitespace-separated words, not model tokens.
        """
        self.model = SentenceTransformer(model_name)
        self.embeddings, self.chunks_data = self.load_search_data()
        self.original_texts = self.load_original_texts()
        self.max_tokens = max_tokens

    def load_search_data(self, pkl_path='embeddings_package.pkl'):
        """Return ``(embeddings, chunks_data)`` from the pickled package."""
        # NOTE(review): the default path is the working directory, whereas
        # EmbeddingsManager.generate_and_save is called with the
        # 'embeddings_data' directory - confirm which location is intended.
        # NOTE: pickle.load can execute arbitrary code; only open packages
        # that were produced locally.
        with open(pkl_path, 'rb') as f:
            data = pickle.load(f)
        return data['embeddings'], data['chunks_data']

    def load_original_texts(self):
        """Return the articles from ``pmc_articles.json`` keyed by pmcid."""
        with open('pmc_articles.json', 'r', encoding='utf-8') as f:
            articles = json.load(f)
        return {article['pmcid']: article for article in articles}

    def get_full_text(self, chunk):
        """Return the article text for the section the chunk belongs to.

        Falls back to the chunk's own preview when the article is unknown.
        """
        # NOTE(review): this returns the whole section, not the matched
        # chunk, so a 'full_text' hit yields the start of the article after
        # truncation - confirm that is the intended context.
        article = self.original_texts.get(chunk['pmcid'])
        if article:
            if chunk['section'] == 'title':
                return article.get('title', '')
            elif chunk['section'] == 'abstract':
                return article.get('abstract', '')
            else:
                return article.get('full_text', '')
        return chunk['chunk_preview']

    def truncate_text(self, text: str) -> str:
        """Keep the leading sentences of *text* that fit in ``max_tokens`` words.

        Sentences are cut only where ``.``, ``!`` or ``?`` is followed by
        whitespace, so DOIs, e-mail addresses and decimals stay intact and
        no punctuation is invented. If not even the first sentence fits, the
        first ``max_tokens`` words are returned instead of an empty string.

        Args:
            text: Text to shorten; ``None`` or blank input yields ``''``.

        Returns:
            The shortened text with sentences joined by single spaces.
        """
        import re  # local import keeps this method self-contained

        stripped = text.strip() if text else ''
        if not stripped:
            return ''

        limit = max(self.max_tokens, 0)
        kept = []
        used = 0
        for sentence in re.split(r'(?<=[.!?])\s+', stripped):
            used += len(sentence.split())
            if used > limit:
                break
            kept.append(sentence)

        if not kept:
            return ' '.join(stripped.split()[:limit])
        return ' '.join(kept)

    def search(self, query: str, top_k: int = 5):
        """Return the *top_k* best-scoring chunks for *query*.

        Args:
            query: Free-text query.
            top_k: Number of hits; values below 1 yield an empty list.

        Returns:
            Dicts with ``score``, ``pmcid``, ``section``, ``text`` and
            ``is_truncated``, ordered from highest to lowest score.
        """
        # A slice of [-0:] would select every row, so handle this explicitly.
        if top_k <= 0 or len(self.chunks_data) == 0:
            return []

        query_embedding = self.model.encode(query, normalize_embeddings=True)
        similarities = np.dot(self.embeddings, query_embedding)
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            chunk = self.chunks_data[idx]
            full_text = self.get_full_text(chunk) or ''
            truncated_text = self.truncate_text(full_text)

            results.append({
                'score': float(similarities[idx]),
                'pmcid': chunk['pmcid'],
                'section': chunk['section'],
                'text': truncated_text,
                # Compare word counts: character lengths also change when
                # whitespace is normalised.
                'is_truncated': len(truncated_text.split()) < len(full_text.split())
            })
        return results

    def create_pmc_link(self, pmcid: str) -> str:
        """Return the PMC article URL for a numeric *pmcid*."""
        return f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}/"

    def prepare_smart_context_for_llm(self, results: List[Dict], token_budget=8000, query=None) -> str:
        """Build the LLM prompt from search results.

        Results are added in order until the next one would push the running
        word count past *token_budget*; that result and all later ones are
        left out.

        Args:
            results: Hits as returned by ``search``.
            token_budget: Maximum number of whitespace-separated words of
                source content to include.
            query: Optional user question. When given it is placed in the
                prompt ahead of the sources; when omitted no question line
                is emitted.

        Returns:
            The complete prompt string.
        """
        context = []
        tokens_used = 0

        for result in results:
            result_tokens = len(result['text'].split())
            if tokens_used + result_tokens > token_budget:
                break

            pmc_link = self.create_pmc_link(result['pmcid'])
            context.append(f"""
  Source: PMC{result['pmcid']} ({result['section']})
  Link: {pmc_link}
  Relevance: {result['score']:.2f}
  Content: {result['text']}
  ---""")
            tokens_used += result_tokens

        question = f"Question: {query}\n\n" if query else ""
        sources = "\n".join(context)

        # Each instruction sits on its own line and is numbered consecutively.
        prompt = (
            "Based on the following research articles, provide a detailed and comprehensive answer. "
            "Include relevant PMC article links as citations in your response.\n\n"
            f"{question}"
            f"{sources}\n\n"
            "Instructions:\n"
            "1. Provide a clear and detailed answer\n"
            "2. Include citations using PMC links when referencing specific information\n"
            "3. Format citations as [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{pmcid}]\n"
            "4. Ensure all key information is properly cited\n"
            "5. Create a section of reference at the end and provide all the links of articles you take info\n"
            "6. Maintain scientific accuracy while being accessible\n\n"
            "Please provide your response now."
        )

        return prompt


class GeminiLLM:
    """Thin wrapper around a Google Generative AI text model."""

    def __init__(self, api_key, model_name='gemini-pro'):
        """Configure the client library and create the model handle.

        Args:
            api_key: Key passed to ``genai.configure``.
            model_name: Model identifier passed to ``genai.GenerativeModel``;
                defaults to the model this code was written against.
        """
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model_name)

    def generate_response(self, prompt: str) -> str:
        """Send *prompt* to the model and return the response's ``text``.

        NOTE(review): the ``text`` accessor may raise when the response has
        no usable content (for example when it was blocked) - confirm the
        desired handling with the client library documentation.
        """
        response = self.model.generate_content(prompt)
        return response.text

def process_query(searcher: SemanticSearcher, llm: GeminiLLM, query: str):
    """Retrieve context for *query*, ask the LLM, and print its answer.

    Args:
        searcher: Provides ``search`` and ``prepare_smart_context_for_llm``.
        llm: Provides ``generate_response``.
        query: The user's question.
    """
    print(f"\nProcessing Query: {query}")
    print("=" * 80)

    # Get search results and prepare context
    results = searcher.search(query, top_k=5)
    context_prompt = searcher.prepare_smart_context_for_llm(results)

    # The context prompt is built from the search results alone. Put the
    # question in front so the model answers it rather than summarising
    # whatever sources were retrieved.
    prompt = f"Question: {query}\n\n{context_prompt}"

    # Generate and display response
    response = llm.generate_response(prompt)
    print("\nResponse:")
    print(response)
    print("=" * 80)

def main():
    """Answer the sample queries with retrieval-augmented generation.

    Raises:
        RuntimeError: If the ``GOOGLE_API_KEY`` environment variable is not
            set or empty.
    """
    # Read the key from the environment so it never has to be pasted into
    # the source, and fail before the models are loaded when it is missing.
    GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")
    if not GOOGLE_API_KEY:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running.")

    searcher = SemanticSearcher(max_tokens=1000)
    llm = GeminiLLM(GOOGLE_API_KEY)

    test_queries = [
        "molecular biotechnolgy"
    ]

    for query in test_queries:
        process_query(searcher, llm, query)

# Run the retrieval + generation demo when executed directly.
if __name__ == "__main__":
    main()



Processing Query: molecular biotechnolgy

Response:
**Shared Molecular Mechanisms in Pediatric Acute Lymphoblastic Leukemia (ALL) and Pediatric Sepsis**

Pediatric ALL and sepsis share similar molecular mechanisms, particularly involving three key genes: HCK, NOG, and RNF125 [https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11698841/]. These genes play crucial roles in regulating immune responses and cellular signaling pathways.

* **HCK:** HCK is a tyrosine kinase involved in immune cell activation and differentiation. It regulates the activity of other signaling proteins, affecting cell growth, proliferation, and apoptosis.
* **NOG:** NOG encodes a protein that inhibits bone morphogenetic proteins (BMPs). BMPs are involved in bone development and immune regulation. NOG's dysregulation can disrupt immune cell function and contribute to disease progression.
* **RNF125:** RNF125 is an E3 ubiquitin ligase that participates in protein degradation pathways. It regulates the stability and func