In [3]:
import os
import pandas as pd
from langchain.docstore.document import Document
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms.ollama import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_and_format_csv(csv_path):
    """Render the CSV file at ``csv_path`` as a Markdown table.

    Args:
        csv_path: Path of the CSV file to read with pandas.

    Returns:
        The table as a Markdown string without the index column. If reading
        or rendering fails for any reason, the error is printed and the
        placeholder text ``"CSV data could not be loaded"`` is returned
        instead of raising.
    """
    try:
        table = pd.read_csv(csv_path).to_markdown(index=False)
    except Exception as err:
        # Best-effort: report the problem and hand back a placeholder string.
        print(f"Error loading {csv_path}: {str(err)}")
        return "CSV data could not be loaded"
    return table

def load_experiment_documents(csv_folder, text_folder):
    """Pair each CSV with its same-named text file and build one Document each.

    Args:
        csv_folder: Directory scanned for ``*.csv`` files.
        text_folder: Directory expected to hold ``<name>.txt`` for every
            ``<name>.csv`` found in ``csv_folder``.

    Returns:
        A list of ``Document`` objects in alphabetical file-name order. Each
        ``page_content`` holds the experiment name, the text description and
        the CSV rendered by ``load_and_format_csv``; ``metadata`` carries the
        experiment name, a fixed ``source_type`` and the CSV column names
        serialized as a string. CSVs without a matching text file are
        reported and skipped, as are experiments whose processing raises.
    """
    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document (and therefore index) order reproducible between runs.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith('.csv'):
            continue

        base_name = filename[:-4]
        csv_path = os.path.join(csv_folder, filename)
        txt_path = os.path.join(text_folder, base_name + '.txt')

        if not os.path.exists(txt_path):
            print(f"Warning: No text file for {filename}")
            continue

        try:
            # Format CSV as table
            csv_table = load_and_format_csv(csv_path)

            # Load and structure text description
            with open(txt_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()

            # Create structured document content
            formatted_content = (
                f"EXPERIMENT: {base_name}\n\n"
                f"DESCRIPTION:\n{text_content}\n\n"
                f"DATA RESULTS:\n{csv_table}\n\n"
                f"END OF EXPERIMENT {base_name}"
            )

            # Only the header is needed for the column list, so read zero
            # data rows instead of parsing the whole file a second time.
            column_names = pd.read_csv(csv_path, nrows=0).columns.tolist()

            metadata = {
                "experiment": base_name,
                "source_type": "experiment_data",
                "data_columns": str(column_names)
            }

            documents.append(Document(
                page_content=formatted_content,
                metadata=metadata
            ))

        except Exception as e:
            # Keep going: one broken experiment should not abort the load.
            print(f"Error processing {base_name}: {str(e)}")

    return documents

# Configuration
text_folder = "/kaggle/input/llama-stuff/Intros"
csv_folder = "/kaggle/input/llama-stuff/Downloaded Samples"
model_name = "llama3.2:3b"  # Verify correct model name

# Load and process documents
documents = load_experiment_documents(csv_folder, text_folder)
if not documents:
    # Fail here with a clear message instead of deeper inside the index build.
    raise ValueError(
        f"No experiment documents were loaded from {csv_folder!r} / {text_folder!r}"
    )

# Split documents for better retrieval.
# The section headers are tried first so chunks follow the document layout.
# The generic separators after them are the fallback: without them a section
# longer than chunk_size (typically the Markdown table) cannot be split any
# further and is emitted as a single oversized chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=[
        "\n\nEXPERIMENT:",
        "\n\nDESCRIPTION:",
        "\n\nDATA RESULTS:",
        "\n\n",
        "\n",
        " ",
        "",
    ]
)

split_docs = text_splitter.split_documents(documents)

# Initialize embeddings and vector store
embeddings = OllamaEmbeddings(model=model_name)
vectorstore = FAISS.from_documents(split_docs, embeddings)

# Custom prompt template
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are an analytical research assistant. Use ONLY the following context to answer.
If you don't know the answer, say you don't know. Never make up answers.

Context:
{context}

Question: {question}

Answer in this structured format:
- Start with a main summary of findings
- Reference specific experiments by name
- Cite relevant numbers from data tables
- End with potential limitations in the data"""
)

# Initialize QA chain with enhanced configuration
llm = Ollama(model=model_name, temperature=0.3)  # Lower temperature for less randomness

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # all retrieved chunks are placed into one prompt
    retriever=vectorstore.as_retriever(
        search_type="mmr",  # Use Maximal Marginal Relevance for better diversity
        search_kwargs={"k": 5}
    ),
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True  # ask_question prints these as the sources
)

# Enhanced query processing
def ask_question(query):
    """Send ``query`` to the module-level ``qa_chain`` and print the outcome.

    Prints the generated answer, then one entry per returned source document
    with its experiment name, its column listing and the first 200 characters
    of its content. Nothing is returned.
    """
    response = qa_chain({"query": query})

    print("Answer:")
    print(response["result"])

    print("\nSources:")
    for source in response["source_documents"]:
        meta = source.metadata
        print(f"- Experiment: {meta['experiment']}")
        print(f"  Data Columns: {meta['data_columns']}")
        print(f"  Content Excerpt: {source.page_content[:200]}...\n")

# Example usage
#ask_question("Oryza sativa agriculture based on the experimental data")

  embeddings = OllamaEmbeddings(model=model_name)
  llm = Ollama(model=model_name, temperature=0.3)  # Lower temperature for less randomness


In [7]:
# ... (keep previous imports and document loading functions)

def _distinct_labels(frame, column):
    """Return the distinct non-null values of ``column`` as strings ([] if absent)."""
    if column not in frame.columns:
        return []
    return [str(value) for value in frame[column].dropna().unique()]


def load_experiment_documents(csv_folder, text_folder):
    """Enhanced document loader with biological entity highlighting.

    Args:
        csv_folder: Directory scanned for ``*.csv`` files.
        text_folder: Directory expected to hold ``<name>.txt`` for every
            ``<name>.csv`` found in ``csv_folder``.

    Returns:
        A list of ``Document`` objects in alphabetical file-name order. The
        text lists the organisms and genotypes found in the
        ``Characteristics: Organism`` / ``Characteristics: Genotype`` columns,
        the description, the column names and the full table as Markdown.
        ``metadata`` holds ``experiment``, ``organisms``, ``genotypes`` and
        ``data_columns`` (all lists except ``experiment``). CSVs without a
        matching text file are reported and skipped, as are experiments whose
        processing raises.
    """
    documents = []
    # Sorted so the resulting document order does not depend on the
    # arbitrary order os.listdir happens to return.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith('.csv'):
            continue

        base_name = filename[:-4]
        csv_path = os.path.join(csv_folder, filename)
        txt_path = os.path.join(text_folder, base_name + '.txt')

        if not os.path.exists(txt_path):
            # Report the skip so missing descriptions do not go unnoticed.
            print(f"Warning: No text file for {filename}")
            continue

        try:
            # Load and analyze CSV data
            df = pd.read_csv(csv_path)
            csv_table = df.to_markdown(index=False)

            # Extract biological entities. The helper always returns a plain
            # list of strings, so a missing column or NaN cell no longer makes
            # the join / metadata step raise and drop the whole experiment.
            org_data = _distinct_labels(df, 'Characteristics: Organism')
            genotype_data = _distinct_labels(df, 'Characteristics: Genotype')
            column_names = [str(name) for name in df.columns]

            with open(txt_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()

            # Create enhanced document structure
            formatted_content = (
                f"EXPERIMENT: {base_name}\n\n"
                f"BIOLOGICAL SYSTEM:\n"
                f"- Organism(s): {', '.join(org_data)}\n"
                f"- Genotype(s): {', '.join(genotype_data)}\n\n"
                f"DESCRIPTION:\n{text_content}\n\n"
                f"KEY DATA COLUMNS:\n{', '.join(column_names)}\n\n"
                f"FULL RESULTS:\n{csv_table}"
            )

            metadata = {
                "experiment": base_name,
                "organisms": org_data,
                "genotypes": genotype_data,
                "data_columns": df.columns.tolist()
            }

            documents.append(Document(
                page_content=formatted_content,
                metadata=metadata
            ))

        except Exception as e:
            # Keep going: one broken experiment should not abort the load.
            print(f"Error processing {base_name}: {str(e)}")

    return documents

# New prompt template focusing on biological analysis
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a plant biology research analyst. Analyze this experimental data to answer agricultural questions:

Context:
{context}

Question: {question}

Follow these steps:
1. Identify relevant experiments mentioning Oryza sativa
2. Examine "Characteristics: Organism" and "Factor Value:" columns
3. Compare different gravity/spaceflight conditions
4. Look for growth patterns or stress responses
5. Connect findings to Earth agriculture potential

Present your answer with:
- 3 key observations from the data
- Specific experimental conditions used
- Quantitative results from tables
- Relevance to crop cultivation

If no rice data exists, state that clearly."""
)

# Rebuild the index from the enhanced loader defined in this cell.
# The earlier `vectorstore` was built from documents whose metadata has no
# "organisms" entry, so an organism filter on it cannot match any chunk and
# the model would be asked to answer from an empty context.
bio_documents = load_experiment_documents(csv_folder, text_folder)
if not bio_documents:
    raise ValueError(
        f"No experiment documents were loaded from {csv_folder!r} / {text_folder!r}"
    )

# Default separators (paragraph, line, word, character) so that even the long
# Markdown table is cut down to chunk_size.
bio_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
bio_split_docs = bio_splitter.split_documents(bio_documents)
bio_vectorstore = FAISS.from_documents(bio_split_docs, embeddings)


def _is_rice_chunk(metadata):
    """Return True when the chunk's ``organisms`` metadata lists Oryza sativa."""
    return "Oryza sativa" in metadata.get("organisms", [])


# Modified QA chain with metadata filtering
qa_chain = RetrievalQA.from_chain_type(
    llm=Ollama(model="llama3.1:8b", temperature=0.2),
    chain_type="stuff",
    retriever=bio_vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 5,
            # "organisms" is a list, so use a membership test; a dict filter
            # comparing it to the string "Oryza sativa" would never be equal.
            "filter": _is_rice_chunk  # Metadata filter
        }
    ),
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True
)

def analyze_agricultural_trends(query):
    """Run ``query`` through ``qa_chain`` and print a rice-focused report.

    Prints the chain's answer, then, for every returned source document whose
    ``organisms`` metadata contains "Oryza sativa", the experiment name, all
    data columns, the columns containing "Factor Value" and the first 300
    characters of the content. Nothing is returned.
    """
    outcome = qa_chain({"query": query})

    print("Agricultural Analysis:")
    print(outcome["result"])

    print("\nSupporting Evidence:")
    for source in outcome["source_documents"]:
        # Skip sources that are not tagged with rice.
        if "Oryza sativa" not in source.metadata.get("organisms", []):
            continue

        print(f"Experiment {source.metadata['experiment']}:")
        print(f"- Conditions Tested: {source.metadata.get('data_columns', [])}")
        print(f"- Key Parameters: {[c for c in source.metadata['data_columns'] if 'Factor Value' in c]}")
        print(f"- Sample Description: {source.page_content[:300]}...\n")

# Example usage: run the rice-focused chain once; the answer and the
# rice-tagged source chunks are printed by analyze_agricultural_trends.
analyze_agricultural_trends("What cultivation insights can we gain from spaceflight experiments on Oryza sativa?")

Agricultural Analysis:
After analyzing the experimental data on spaceflight experiments involving Oryza sativa (rice), I present the following findings:

**Key Observations:**

1. **Reduced root growth in microgravity**: In Experiment 3, where Oryza sativa was grown in a microgravity environment for 14 days, the average root length was significantly reduced by 23% compared to the control group on Earth (Table 1).
2. **Increased shoot growth under hypergravity**: In Experiment 5, rice plants were exposed to 2g of acceleration, resulting in a 15% increase in shoot height and a 12% increase in leaf area compared to the control group (Table 2).
3. **Enhanced stress tolerance in spaceflight**: In Experiment 1, Oryza sativa was grown in a spaceflight environment for 30 days, where it showed improved resistance to drought stress, with a 25% increase in water use efficiency and a 18% increase in biomass production compared to the control group (Table 3).

**Experimental Conditions:**

* Experi

In [9]:
# ... (keep previous imports and document loading functions)

def _distinct_labels(frame, column):
    """Return the distinct non-null values of ``column`` as strings ([] if absent)."""
    if column not in frame.columns:
        return []
    return [str(value) for value in frame[column].dropna().unique()]


def load_experiment_documents(csv_folder, text_folder):
    """Enhanced document loader with biological entity highlighting.

    Args:
        csv_folder: Directory scanned for ``*.csv`` files.
        text_folder: Directory expected to hold ``<name>.txt`` for every
            ``<name>.csv`` found in ``csv_folder``.

    Returns:
        A list of ``Document`` objects in alphabetical file-name order. The
        text lists the organisms and genotypes found in the
        ``Characteristics: Organism`` / ``Characteristics: Genotype`` columns,
        the description, the column names and the full table as Markdown.
        ``metadata`` holds ``experiment``, ``organisms``, ``genotypes`` and
        ``data_columns`` (all lists except ``experiment``). CSVs without a
        matching text file are reported and skipped, as are experiments whose
        processing raises.
    """
    documents = []
    # Sorted so the resulting document order does not depend on the
    # arbitrary order os.listdir happens to return.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith('.csv'):
            continue

        base_name = filename[:-4]
        csv_path = os.path.join(csv_folder, filename)
        txt_path = os.path.join(text_folder, base_name + '.txt')

        if not os.path.exists(txt_path):
            # Report the skip so missing descriptions do not go unnoticed.
            print(f"Warning: No text file for {filename}")
            continue

        try:
            # Load and analyze CSV data
            df = pd.read_csv(csv_path)
            csv_table = df.to_markdown(index=False)

            # Extract biological entities. The helper always returns a plain
            # list of strings, so a missing column or NaN cell no longer makes
            # the join / metadata step raise and drop the whole experiment.
            org_data = _distinct_labels(df, 'Characteristics: Organism')
            genotype_data = _distinct_labels(df, 'Characteristics: Genotype')
            column_names = [str(name) for name in df.columns]

            with open(txt_path, 'r', encoding='utf-8') as f:
                text_content = f.read().strip()

            # Create enhanced document structure
            formatted_content = (
                f"EXPERIMENT: {base_name}\n\n"
                f"BIOLOGICAL SYSTEM:\n"
                f"- Organism(s): {', '.join(org_data)}\n"
                f"- Genotype(s): {', '.join(genotype_data)}\n\n"
                f"DESCRIPTION:\n{text_content}\n\n"
                f"KEY DATA COLUMNS:\n{', '.join(column_names)}\n\n"
                f"FULL RESULTS:\n{csv_table}"
            )

            metadata = {
                "experiment": base_name,
                "organisms": org_data,
                "genotypes": genotype_data,
                "data_columns": df.columns.tolist()
            }

            documents.append(Document(
                page_content=formatted_content,
                metadata=metadata
            ))

        except Exception as e:
            # Keep going: one broken experiment should not abort the load.
            print(f"Error processing {base_name}: {str(e)}")

    return documents

# New prompt template focusing on biological analysis
qa_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a plant biology research analyst. Analyze this experimental data to answer agricultural questions:

Context:
{context}

Question: {question}

Follow these steps:
1. Identify relevant experiments mentioning Oryza sativa
2. Examine "Characteristics: Organism" and "Factor Value:" columns
3. Compare different gravity/spaceflight conditions
4. Look for growth patterns or stress responses
5. Connect findings to Earth agriculture potential

Present your answer with:
- 3 key observations from the data
- Specific experimental conditions used
- Quantitative results from tables
- Relevance to crop cultivation

If no rice data exists, state that clearly."""
)

# Rebuild the index from the enhanced loader defined in this cell.
# The earlier `vectorstore` was built from documents whose metadata has no
# "organisms" entry, so an organism filter on it cannot match any chunk and
# the model would be asked to answer from an empty context.
bio_documents = load_experiment_documents(csv_folder, text_folder)
if not bio_documents:
    raise ValueError(
        f"No experiment documents were loaded from {csv_folder!r} / {text_folder!r}"
    )

# Default separators (paragraph, line, word, character) so that even the long
# Markdown table is cut down to chunk_size.
bio_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
bio_split_docs = bio_splitter.split_documents(bio_documents)
bio_vectorstore = FAISS.from_documents(bio_split_docs, embeddings)


def _is_rice_chunk(metadata):
    """Return True when the chunk's ``organisms`` metadata lists Oryza sativa."""
    return "Oryza sativa" in metadata.get("organisms", [])


# Modified QA chain with metadata filtering
qa_chain = RetrievalQA.from_chain_type(
    llm=Ollama(model="llama3.2:3b", temperature=0.2),
    chain_type="stuff",
    retriever=bio_vectorstore.as_retriever(
        search_type="mmr",
        search_kwargs={
            "k": 5,
            # "organisms" is a list, so use a membership test; a dict filter
            # comparing it to the string "Oryza sativa" would never be equal.
            "filter": _is_rice_chunk  # Metadata filter
        }
    ),
    chain_type_kwargs={"prompt": qa_prompt},
    return_source_documents=True
)

def analyze_agricultural_trends(query):
    """Run ``query`` through ``qa_chain`` and print a rice-focused report.

    Prints the chain's answer, then, for every returned source document whose
    ``organisms`` metadata contains "Oryza sativa", the experiment name, all
    data columns, the columns containing "Factor Value" and the first 300
    characters of the content. Nothing is returned.
    """
    outcome = qa_chain({"query": query})

    print("Agricultural Analysis:")
    print(outcome["result"])

    print("\nSupporting Evidence:")
    for source in outcome["source_documents"]:
        # Skip sources that are not tagged with rice.
        if "Oryza sativa" not in source.metadata.get("organisms", []):
            continue

        print(f"Experiment {source.metadata['experiment']}:")
        print(f"- Conditions Tested: {source.metadata.get('data_columns', [])}")
        print(f"- Key Parameters: {[c for c in source.metadata['data_columns'] if 'Factor Value' in c]}")
        print(f"- Sample Description: {source.page_content[:300]}...\n")

# Example usage: run the rice-focused chain once; the answer and the
# rice-tagged source chunks are printed by analyze_agricultural_trends.
analyze_agricultural_trends("What cultivation insights can we gain from spaceflight experiments on Oryza sativa?")

Agricultural Analysis:
As a plant biology research analyst, I analyzed the available data on spaceflight experiments involving Oryza sativa (rice). After reviewing the relevant experiments, I found three key observations that provide insights into cultivation practices for this crop.

**Relevant Experiments:**

1. "Microgravity Effects on Rice Growth and Development" (Experiment ID: MG-001)
2. "Spaceflight-Induced Stress Response in Oryza sativa" (Experiment ID: SS-002)
3. "Comparative Study of Space-Grown and Earth-Grown Rice" (Experiment ID: CG-003)

**Data Analysis:**

1. **Characteristics: Organism** and **Factor Value:** columns revealed the following information:
	* Oryza sativa was grown in three different gravity/spaceflight conditions: microgravity (MG), reduced gravity (RG), and Earth surface control (ESC).
	* The "Factor Value:" column showed that the plants were exposed to varying levels of radiation, temperature, and humidity during the spaceflight experiments.
2. **Compar

In [2]:
!curl -fsSL https://ollama.com/install.sh | sh
import subprocess
process = subprocess.Popen("ollama serve", shell=True) #runs on a different thread
#Download model
!ollama pull llama3.2:3b

!pip install -U langchain-community faiss-gpu ollama

>>> Cleaning up old version at /usr/local/lib/ollama
>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%                                                                            6.6%                                                                             16.1%################################                                                51.4%#################################################################################        93.9%
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest 
pulling dde5aa3fc5ff... 100% ▕████████████████▏ 2.0 GB  