**Bio-RAG Research Assistant**

**1. Setup and Installation**

In [None]:
pip install nltk

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# --- Step 0: Fetch the NLTK data packages (only needs to run once) ---
nltk_resources = (
    'punkt',      # models for splitting text into sentences/words
    'punkt_tab',  # sometimes needed for newer nltk versions
    'stopwords',  # the list of common, low-information words
    'wordnet',    # the dictionary used for lemmatization
    'omw-1.4',
)
# Download in the listed order, one package per call.
for resource_name in nltk_resources:
    nltk.download(resource_name)

def preprocess_text(text):
    """
    Takes raw biological text and cleans it for the AI model.

    Steps: Lowercase -> Tokenize -> Remove Punctuation/Stopwords -> Lemmatize

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase, lemmatized tokens. English stop words and
        tokens made up entirely of punctuation characters are dropped.
        Tokens that merely contain punctuation (e.g. "crispr-cas9") are kept.
    """

    # 1. Lowercasing + 2. Tokenization
    # Why: "Gene" and "gene" should be treated as the same thing by the model,
    # and models read individual units (tokens) rather than whole sentences.
    tokens = word_tokenize(text.lower())

    # 3. Setting up removals
    # Customizing stopwords: In bio, we might want to keep words like 'not'
    # (negation), but for now, we use the standard list.
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    def is_punctuation(token):
        # Check character by character. A substring test against
        # string.punctuation misses multi-character tokens the tokenizer
        # produces for quotes, ellipses and dashes ("``", "''", "...", "--").
        return all(char in punctuation_chars for char in token)

    # 4. Lemmatization
    # Why: Lemmatization maps "studies" -> "study" (a valid word), whereas
    # stemming might produce "studi" (not a word).
    # No part-of-speech tag is supplied, so the lemmatizer's default is used.
    lemmatizer = WordNetLemmatizer()

    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words and not is_punctuation(token)
    ]

# --- Testing with a Biology Example (Relevant to Genomiki) ---
raw_data = """
CRISPR-Cas9 is a unique technology that enables geneticists and medical researchers
to edit parts of the genome by removing, adding or altering sections of the DNA sequence.
"""

# Run the sample paragraph through the cleaning pipeline.
processed_data = preprocess_text(raw_data)

# Show the untouched input first, then the token list produced from it.
print("ORIGINAL TEXT:", raw_data, "\n--- Processing ---\n", sep="\n")
print("FINAL TOKENS (Input for the Model):", processed_data, sep="\n")

**2. Data Ingestion & Vector Embedding Retrieval (Semantic Search)**

In [None]:
# 1. Install the library
!pip install sentence-transformers

# 2. Import libraries
from sentence_transformers import SentenceTransformer, util

# 3. Load the Model
embedding_model_name = 'all-MiniLM-L6-v2'
print("Loading Model... (Downloading from Hugging Face)")
model = SentenceTransformer(embedding_model_name)

# 4. Prepare Data
# A tiny in-memory corpus; the last sentence is deliberately off-topic.
sentences = [
    "CRISPR-Cas9 is a gene-editing tool.",
    "DNA consists of two strands.",
    "The mitochondria is the powerhouse of the cell.",
    "Python is a programming language.",
]

# 5. Create Embeddings
# Encode the whole corpus in one call.
embeddings = model.encode(sentences)

# 6. Search
# The query is encoded with the same model as the corpus.
query = "How do we edit genomes?"
query_embedding = model.encode(query)

# Ask for only the single closest corpus entry.
hits = util.semantic_search(query_embedding, embeddings, top_k=1)
# Only one query was passed, so take its hit list and then its first entry.
hits_for_query = hits[0]
best_hit = hits_for_query[0]

# 'corpus_id' is the position of the matched sentence in `sentences`.
matched_sentence = sentences[best_hit['corpus_id']]
print(
    "\n--- SUCCESSFUL RESULT ---",
    f"Query: {query}",
    f"Match: {matched_sentence}",
    sep="\n",
)

**3. Generation (LLM Answer)**

In [None]:
# 1. Install transformers (if not already installed)
!pip install transformers accelerate

# 2. Import libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# 3. Load the LLM
# The tokenizer and the model are loaded from the same checkpoint name.
generator_checkpoint = "google/flan-t5-small"
print("Loading Generator Model (Flan-T5)...")
tokenizer = AutoTokenizer.from_pretrained(generator_checkpoint)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(generator_checkpoint)

def generate_answer(question, context, max_length=50):
    """
    Generate an answer to `question` using `context` as supporting text.

    Uses the module-level `tokenizer` and `llm_model`.

    Args:
        question: The user's question.
        context: The retrieved passage to answer from.
        max_length: Passed to `llm_model.generate` as `max_length`.
            Defaults to 50, the value that was previously hard-coded.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # This is "Prompt Engineering": question and context are packed into a
    # single input string for the model.
    input_text = f"question: {question} context: {context}"

    encoded = tokenizer(input_text, return_tensors="pt")
    # Pass the attention mask along with the ids instead of discarding it,
    # so generate() does not have to infer which positions are real tokens.
    outputs = llm_model.generate(
        input_ids=encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_length,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# 4. Connect it to your previous search result
# (`sentences`, `best_hit` and `query` come from the retrieval code block)
retrieved_doc = sentences[best_hit['corpus_id']]

# Echo the generator's inputs before running it.
print(
    f"\nUser Question: {query}",
    f"Retrieved Context: {retrieved_doc}",
    sep="\n",
)

# 5. Generate the final answer
final_answer = generate_answer(query, retrieved_doc)

# Frame the answer between two 30-character rules.
divider = "-" * 30
print(divider, f"AI Generated Answer: {final_answer}", divider, sep="\n")