# **Notebook for uploading embeddings to Pinecone and building a chatbot**

In [None]:
!pip install pandas trafilatura langchain sentence-transformers pinecone groq

Collecting trafilatura
  Downloading trafilatura-2.0.0-py3-none-any.whl.metadata (12 kB)
Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting groq
  Downloading groq-0.32.0-py3-none-any.whl.metadata (16 kB)
Collecting courlan>=1.3.2 (from trafilatura)
  Downloading courlan-1.3.2-py3-none-any.whl.metadata (17 kB)
Collecting htmldate>=1.9.2 (from trafilatura)
  Downloading htmldate-1.9.3-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.1 (from trafilatura)
  Downloading justext-3.0.2-py2.py3-none-any.whl.metadata (7.3 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.8.0-py3-none-any.whl.metadata (30 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Collecting tld>=0.13 (from courlan>=1.3.2->trafilatura)
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Collectin

In [None]:
import pandas as pd
import trafilatura
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from groq import Groq

In [None]:
# Load the list of source articles. Upload space_docs.csv to the Colab
# runtime (or mount Google Drive and adjust the path) before running this cell.
# Later cells read the "Title" and "Link" columns of this frame.
df = pd.read_csv("/content/space_docs.csv")
df.tail()  # show the last rows as a quick check that the file loaded

Unnamed: 0,Title,Link
602,Spaceflight alters host-gut microbiota interac...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
603,Discoveries from human stem cell research in s...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
604,Simulated microgravity impairs human NK [natur...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
605,Simulated microgravity alters gene regulation ...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
606,Spaceflight increases sarcoplasmic reticulum C...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...


In [None]:
import time

# Pause between requests and number of tries per URL. Fetching the pages
# back-to-back made the server answer HTTP 429 (too many requests), which
# silently left those articles without any text.
REQUEST_DELAY_S = 1.0
MAX_ATTEMPTS = 3


def fetch_article_text(url, max_attempts=MAX_ATTEMPTS, delay_s=REQUEST_DELAY_S):
    """Download ``url`` and return its extracted main text.

    A failed download is retried up to ``max_attempts`` times, waiting longer
    before each new attempt. Returns "" when the page cannot be downloaded or
    no text can be extracted from it.
    """
    for attempt in range(1, max_attempts + 1):
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            text = trafilatura.extract(downloaded, include_comments=False, include_tables=True)
            return text if text else ""
        if attempt < max_attempts:
            # Back off before retrying: 2x, 4x, ... the base delay
            time.sleep(delay_s * 2 ** attempt)
    return ""


texts = []
for url in df["Link"]:
    texts.append(fetch_article_text(url))
    time.sleep(REQUEST_DELAY_S)  # pace the requests even after a success

df["content"] = texts
df.to_csv("articles_with_text.csv", index=False)  # backup

# Make gaps visible instead of leaving them as silent empty strings
missing = sum(1 for text in texts if not text)
print(f"{missing} of {len(texts)} articles have no extracted text")

ERROR:trafilatura.downloads:download error: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5470433/ HTTPSConnectionPool(host='pmc.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /articles/PMC5470433/ (Caused by ResponseError('too many 429 error responses'))
ERROR:trafilatura.downloads:download error: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5491145/ HTTPSConnectionPool(host='pmc.ncbi.nlm.nih.gov', port=443): Max retries exceeded with url: /articles/PMC5491145/ (Caused by ResponseError('too many 429 error responses'))


In [None]:
import os

# Read the Pinecone key from the runtime environment so the secret is never
# saved inside the notebook. When the variable is not set this falls back to
# an empty string, which is the placeholder value this cell used before.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENV = "us-east-1"

In [None]:
# Load the BAAI/bge-m3 sentence-embedding model
model = SentenceTransformer("BAAI/bge-m3")

# Initialize the Pinecone client with the key/environment configured earlier
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Connect to the "spacebio" index. Nothing here creates the index, so it
# must already exist in the Pinecone project.
index_name = "spacebio"
index = pc.Index(index_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
import torch
from tqdm import tqdm

# Use the GPU when torch can see one, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Embedding model, placed on the device selected above
model = SentenceTransformer("BAAI/bge-m3", device=device)

# Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Handle to the target index (it is only looked up here, not created)
index_name = "spacebio"
index = pc.Index(index_name)

# 1. Read the scraped articles back in and normalise the column names
df1 = pd.read_csv("/content/articles_with_text.csv").rename(  # update filename if needed
    columns={"Title": "title", "Link": "url", "content": "text"}
)

# 2. Splitter: 1000-character chunks, 200 characters shared between neighbours
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# 3. Build one record per chunk; articles without usable text are reported
#    and left out
rows = []
for paper_idx, paper in df1.iterrows():
    body = paper["text"]
    if not isinstance(body, str):
        print(f"Skipping row {paper_idx} due to non-string text content.")
        continue
    rows.extend(
        {
            "paper_id": paper_idx,
            "title": paper["title"],
            "url": paper["url"],
            "year": paper.get("year", None),
            "section": paper.get("section", None),
            "chunk_id": chunk_idx,
            "text": piece,
            "token_count": len(piece.split()),  # whitespace-separated word count
        }
        for chunk_idx, piece in enumerate(splitter.split_text(body))
    )

chunks_df = pd.DataFrame(rows)

print(f"Created chunks_df with {len(chunks_df)} total chunks")
print(chunks_df.head(5))

# 4. Upsert to Pinecone
batch_size = 200  # you can increase this with GPU

for start in tqdm(range(0, len(chunks_df), batch_size)):
    end = start + batch_size
    batch = chunks_df.iloc[start:end]

    # Embed every chunk of this batch on the device selected earlier in this
    # cell. Hard-coding "cuda" here made the loop fail on CPU-only runtimes
    # even though `device` had already fallen back to "cpu".
    embeddings = model.encode(batch["text"].tolist(),
                              batch_size=128,
                              show_progress_bar=False,
                              convert_to_numpy=True,
                              device=device).tolist()

    to_upsert = []
    for (row, embedding) in zip(batch.itertuples(index=False), embeddings):
        # Vector ids are "<paper_id>_chunk<chunk_id>"
        vector_id = f"{row.paper_id}_chunk{row.chunk_id}"

        # Missing year/section are stored as "" and the counters are cast to
        # plain ints before being sent as metadata.
        metadata = {
            "paper_id": int(row.paper_id),
            "title": row.title,
            "url": row.url,
            "year": str(row.year) if pd.notna(row.year) else "",
            "section": str(row.section) if pd.notna(row.section) else "",
            "chunk_id": int(row.chunk_id),
            "text": row.text,
            "token_count": int(row.token_count)
        }

        to_upsert.append((vector_id, embedding, metadata))

    # Upload this batch
    index.upsert(vectors=to_upsert)


Using device: cuda
Skipping row 145 due to non-string text content.
Skipping row 146 due to non-string text content.
Created chunks_df with 49892 total chunks
   paper_id                                              title  \
0         0  Mice in Bion-M 1 space mission: training and s...   
1         0  Mice in Bion-M 1 space mission: training and s...   
2         0  Mice in Bion-M 1 space mission: training and s...   
3         0  Mice in Bion-M 1 space mission: training and s...   
4         0  Mice in Bion-M 1 space mission: training and s...   

                                                 url  year section  chunk_id  \
0  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...  None    None         0   
1  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...  None    None         1   
2  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...  None    None         2   
3  https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...  None    None         3   
4  https://www.ncbi.nlm.nih.gov/pmc/articles

100%|██████████| 250/250 [1:08:20<00:00, 16.40s/it]


In [None]:
import torch
from tqdm import tqdm

# Use the GPU when torch can see one, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Example query
query = "What experiments have been done on plant growth in space?"

# Embed the query on the chosen device; encode() receives a one-item list,
# so the first entry of the result is the query vector
query_embedding = model.encode([query], convert_to_numpy=True, device=device).tolist()[0]

# Search Pinecone for the five closest chunks, returning their metadata too
results = index.query(vector=query_embedding, top_k=5, include_metadata=True)

# Display results
for match in results["matches"]:
    score = match["score"]
    meta = match["metadata"]

    # Read every field with .get so a record that lacks one does not abort the
    # listing, and leave the year out when it is blank instead of printing "()".
    year = meta.get("year", "")
    heading = f"\nTitle: {meta.get('title', '')}"
    if year:
        heading += f" ({year})"

    print(heading)
    print(f"URL: {meta.get('url', '')}")
    print(f"Section: {meta.get('section','')}")
    print(f"Score: {score:.4f}")
    print(f"---\n{meta.get('text', '')[:500]}...\n")

Using device: cpu

Title: Red light enhances plant adaptation to spaceflight and Mars g-levels. ()
URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9605285/
Section: 
Score: 0.7215
---
These results complement experiments specifically addressed to complete the full life cycle of plants (seed-to-seed) in space, carried out with A. thaliana. Using advanced growth chambers which, in general, provide a well-regulated environment for growing plants in microgravity on the ISS, fertile adult plants have been produced from seeds germinated in space; seeds obtained from these plants have, in turn, been germinated. The ADVANCED ASTROCULTURE (ADVASC) experiment, consisting of two success...


Title: Red light enhances plant adaptation to spaceflight and Mars g-levels. ()
URL: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9605285/
Section: 
Score: 0.7201
---
Plant seeds were, strictly speaking, the first organisms in space, launched on a U.S. V-2 rocket in 1946, representing some early suborbital 

# Chatbot


In [None]:
# NOTE(review): `requests` is not imported anywhere in the visible notebook — confirm this install is still needed.
!pip install requests



In [None]:
import os

# Initialize Groq client.
# The key is read from the runtime environment so the secret is never saved
# inside the notebook; when the variable is not set this falls back to an
# empty string, which is the placeholder value this cell used before.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
client = Groq(api_key=GROQ_API_KEY)

In [None]:
# ----------------------
# 1. Set up the embedding model, Pinecone index, and Groq client
# ----------------------
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone
from groq import Groq # chat-completions client class

# Same model name as the one used to embed the chunks at upload time, so
# query vectors and stored vectors come from the same model
embed_model = SentenceTransformer("BAAI/bge-m3")

# Connect to the existing "spacebio" index (nothing here creates it)
pc = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
index = pc.Index("spacebio")

# Groq client; generate_answer_groq below reads this module-level name
client = Groq(api_key=GROQ_API_KEY)  # expects GROQ_API_KEY to be set by an earlier cell

# ----------------------
# 2. Generate Answer Function (Groq)
# ----------------------
def generate_answer_groq(matches, query):
    """Answer ``query`` with the Groq chat model, grounded in retrieved chunks.

    Args:
        matches: Retrieved matches. Each one is read as ``match['metadata']``,
            a mapping holding the chunk ``text`` and, when available, the
            paper ``title`` and ``url``.
        query: The user's question, forwarded to the model unchanged.

    Returns:
        The message content of the first completion choice.
    """
    # Label every chunk with the paper it came from. The system prompt asks
    # the model to cite its sources, which it can only do faithfully when the
    # context states where each passage originated.
    context_parts = []
    for number, match in enumerate(matches, start=1):
        metadata = match['metadata']
        text = metadata.get('text')
        if not text:
            continue  # nothing to show the model for this match
        title = metadata.get('title') or "Untitled"
        url = metadata.get('url') or "URL unavailable"
        context_parts.append(f"[{number}] {title} ({url})\n{text}")
    context = "\n\n".join(context_parts)

    # Call Groq LLM
    response = client.chat.completions.create(
        model="llama-3.3-70b-versatile",  # Groq-supported model
        messages=[
            {
                "role": "system",
                "content": (
                    "You are an expert researcher in space biology, human physiology, plant science, and other life "
                    "and physical sciences conducted in space. Based on the context provided from multiple NASA "
                    "publications or scientific papers, provide a clear, coherent, and thorough answer to the user’s "
                    "question. Your response should:\n\n"
                    "1. Integrate findings across all sources, identifying consensus, key experiments, and "
                    "notable discrepancies. \n"
                    "2. Emphasize scientific reasoning and mechanisms rather than just listing facts. \n"
                    "3. Include references or citations where possible, using the provided context as the source, ensuring no duplicates. \n"
                    "4. Avoid redundancy and ensure smooth narrative flow, like a researcher summarizing evidence for "
                    "colleagues or writing a literature review. \n"
                    "5. Highlight knowledge gaps or open questions where relevant. \n"
                    "6. Make the text readable and professional, but natural—avoid overly mechanical or generic phrasing. \n"
                    "7. Only use bullet points when summarizing multiple distinct studies or mechanisms clearly.\n\n"
                    "Always assume the reader is scientifically literate but may not be an expert in this exact "
                    "subfield."
                )
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion: {query}"
            }
        ]
    )

    return response.choices[0].message.content


# ----------------------
# 3. Chatbot Function
# ----------------------
def chatbot_groq(query, top_k=5):
    """Retrieve the most relevant chunks for ``query`` and answer it.

    Args:
        query: The user's question.
        top_k: Number of chunks to retrieve from the index.

    Returns:
        A ``(answer, sources)`` tuple: the generated answer and the URLs of
        the retrieved chunks, each URL listed once in retrieval order.
    """
    # Step 1: Embed query
    q_emb = embed_model.encode(query).tolist()

    # Step 2: Retrieve top chunks from Pinecone
    results = index.query(vector=q_emb, top_k=top_k, include_metadata=True)
    matches = results['matches']

    # Step 3: Generate answer using Groq LLM
    answer = generate_answer_groq(matches, query)

    # Step 4: Return answer + sources. Several chunks usually come from the
    # same paper, so repeated URLs are collapsed while keeping their order.
    urls = (m['metadata'].get('url') for m in matches)
    sources = list(dict.fromkeys(url for url in urls if url))
    return answer, sources

# ----------------------
# 4. Test Chatbot
# ----------------------
# Smoke test: ask one question end-to-end and show the answer with its sources
test_question = "What have we learned about plant growth in microgravity?"
answer, sources = chatbot_groq(test_question)
print("Answer:\n", answer)
print("\nSources:\n", sources)


Answer:
 Our understanding of plant growth in microgravity has evolved significantly over the years, with early research focusing on whole-plant level changes and more recent studies utilizing molecular techniques to uncover the underlying mechanisms. One of the key findings is that microgravity affects plant growth and architecture, with changes observed in root growth, stem orientation, and leaf development (Kiss 2014). For instance, studies have shown that plants grown in microgravity often exhibit altered root morphology, with roots growing in random directions rather than being oriented towards the gravity vector (Kordyum 1994).

Molecular studies have also shed light on the changes that occur in plants grown in microgravity. For example, microarray and RNA-seq analyses have revealed changes in gene expression patterns, including alterations in genes involved in cell wall modification, hormone signaling, and stress response (Mazars et al. 2014). Proteomic studies have also identif