In [16]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import pipeline


In [17]:
# Load the Q&A dataset
qa_df = pd.read_csv('questionanswer.csv', encoding='ISO-8859-1')  # Columns: Question, Answer, Type

# Clean missing values
qa_df['Question'] = qa_df['Question'].fillna('')
qa_df['Answer'] = qa_df['Answer'].fillna('')
qa_df['Type'] = qa_df['Type'].fillna('structured')  # Default to 'structured'

# Remove duplicates
qa_df = qa_df.drop_duplicates()

# Display a preview of the cleaned Q&A dataset
print(qa_df.head())


                                            Question  \
0  About how many years ago did life originate on...   
1                        Example for Endemic species   
2            write a short notes lectron microscopes   
3         Explain the fine structure of mitochondria   
4  Indicate the major function of both vitamin K ...   

                                              Answer       Type  
0                                           3.5X10^9  Structure  
1  Dipterocurpus zeylanicus / Gracinia quaesita\n...  Structure  
2  Electron microscopes represent a groundbreakin...      Essay  
3  Fine Structure of Mitochondria\n\nMitochondria...      Essay  
4                                Aids blood clotting  Structure  


In [18]:
# Load the Notes dataset
notes_df = pd.read_csv('Notes.csv', encoding='ISO-8859-1')  # Columns: Document ID, Topic, Sub-topic, Text Content

# Fill missing text content with empty strings
notes_df['Text Content'] = notes_df['Text Content'].fillna('')

# Combine topic and sub-topic into a single field for better context
notes_df['Combined Text'] = notes_df['Topic'] + " - " + notes_df['Sub-topic'] + "\n" + notes_df['Text Content']

# Display a preview of the cleaned Notes dataset
print(notes_df.head())


   Document ID                    Topic  \
0            1  Introduction to Biology   
1            2  Introduction to Biology   
2            3  Introduction to Biology   
3            4  Introduction to Biology   
4            5  Introduction to Biology   

                                           Sub-topic  \
0                 Understanding biological Diversity   
1    Understanding the human Body and its functions.   
2  Sustainable use and Management of natural reso...   
3                        Sustainable Food production   
4                           Understanding plant life   

                                        Text Content  \
0  At present our planet is rich in diversity. Li...   
1  When studying biology, especially by studying ...   
2  Natural resources are sources of materials and...   
3  Sustainable food production is the production ...   
4  Plants are the primary producers in the world....   

                             Source  \
0  Biology, Grade 12, Resour

In [19]:
def chunk_text(text, chunk_size=300, overlap=50):
    """
    Split long text into chunks of fixed size with overlap.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    return chunks


In [20]:
# Chunk the 'Combined Text' field
chunk_size = 300  # Number of words per chunk
overlap = 50      # Overlap between chunks

chunked_notes = []
for index, row in notes_df.iterrows():
    chunks = chunk_text(row['Combined Text'], chunk_size, overlap)
    for chunk in chunks:
        chunked_notes.append({
            'Document ID': row['Document ID'],
            'Chunk': chunk
        })

# Convert chunked notes into a DataFrame
chunked_notes_df = pd.DataFrame(chunked_notes)
print(chunked_notes_df.head())

   Document ID                                              Chunk
0            1  Introduction to Biology - Understanding biolog...
1            2  Introduction to Biology - Understanding the hu...
2            3  Introduction to Biology - Sustainable use and ...
3            4  Introduction to Biology - Sustainable Food pro...
4            5  Introduction to Biology - Understanding plant ...


In [21]:
# Load a semantic embedding model
embedder = SentenceTransformer('sentence-transformers/multi-qa-mpnet-base-dot-v1')


In [23]:
# Generate embeddings for Q&A dataset
qa_embeddings = embedder.encode(qa_df['Question'].tolist())

# Generate embeddings for chunked notes
notes_embeddings = embedder.encode(chunked_notes_df['Chunk'].tolist())


In [24]:
# Combine all embeddings and content for indexing
all_embeddings = np.vstack([qa_embeddings, notes_embeddings]).astype('float32')

# Combine all content for retrieval
all_content = (
    [{'type': 'Q&A', 'question': q, 'answer': a, 'type_label': t}
     for q, a, t in zip(qa_df['Question'], qa_df['Answer'], qa_df['Type'])]
    +
    [{'type': 'Note', 'chunk': chunk} for chunk in chunked_notes_df['Chunk']]
)


In [None]:
# Initialize and populate the FAISS index
index = faiss.IndexFlatL2(all_embeddings.shape[1])
index.add(all_embeddings)

# Save FAISS index for reuse (optional)
faiss.write_index(index, 'faiss_index.bin')

In [26]:
def retrieve_similar_content(query, k=5):
    """
    Retrieve the top-k most similar content from the FAISS index.
    """
    # Embed the query
    query_embedding = embedder.encode([query]).astype('float32')
    
    # Search the FAISS index
    distances, indices = index.search(query_embedding, k)
    
    # Prepare results
    results = []
    for idx in indices[0]:
        if idx < len(qa_df):
            # From Q&A dataset
            results.append({
                'type': 'Q&A',
                'question': qa_df.iloc[idx]['Question'],
                'answer': qa_df.iloc[idx]['Answer']
            })
        else:
            # From Notes dataset
            note_idx = idx - len(qa_df)
            results.append({
                'type': 'Note',
                'chunk': chunked_notes_df.iloc[note_idx]['Chunk']
            })
    return results


In [27]:
def construct_context_for_structure(query, k=3):
    """
    Construct context for structured questions. Only include concise Q&A pairs.
    """
    retrieved = retrieve_similar_content(query, k)
    context = []

    for item in retrieved:
        if item['type'] == 'Q&A':  # Only include Q&A pairs
            context.append(f"Q: {item['question']}\nA: {item['answer']}")

    # Limit to top 3 entries
    return "\n".join(context[:3])


In [28]:
def construct_context_for_essay(query, k=5):
    """
    Construct context for essay questions. Include the most relevant Q&A pair first, 
    followed by complementary Note chunks.
    """
    retrieved = retrieve_similar_content(query, k)
    context = []

    # Add the most relevant Q&A pair (ensure it's the one matching the query)
    for item in retrieved:
        if item['type'] == 'Q&A' and query.lower() in item['question'].lower():
            context.append(f"Q: {item['question']}\nA: {item['answer']}")
            break  # Add only the most relevant Q&A pair

    # Add complementary Note chunks
    for item in retrieved:
        if item['type'] == 'Note':
            context.append(f"Note:\n{item['chunk']}")

    # Limit the context to ensure it's concise and relevant
    return "\n".join(context[:5])


In [29]:
# Load LLaMA model for answer generation from the local folder 'Merged_model'
generator = pipeline("text-generation", model="D:\Finetune - Structure and Essay-20241115T045829Z-001\Finetune - Structure and Essay\Merged_model\Merged_model")

In [121]:
def generate_structured_answer(query, k=3, max_words=50):
    """
    Generate structured answers with concise and specific responses.
    """
    # Build context for structured questions
    context = construct_context_for_structure(query, k)

    # Prompt for structured questions
    prompt = (
        f"Question: {query}\n\n"
        f"Context:\n{context}\n\n"
        f"Answer the question concisely and accurately in 1-2 sentences.\nAnswer:"
    )

    # Adjust max token length
    input_length = len(generator.tokenizer(prompt)['input_ids'])
    adjusted_max_length = input_length + max_words

    # Generate response
    response = generator(
        prompt,
        max_length=adjusted_max_length, 
        truncation=True, 
        num_return_sequences=1
    )
    return response[0]["generated_text"]


In [73]:
def generate_essay_answer(query, k=5, min_words=175, max_words=300):
    """
    Generate essay-style answers with a detailed explanation.
    Ensure the response meets the minimum word count and does not exceed the maximum token count.
    """
    # Construct context with improved filtering
    context = construct_context_for_essay(query, k) or ""

    # Check if context is empty and set the prompt accordingly
    if not context.strip():
        # Fallback prompt when no context is available
        prompt = (
            f"Question: {query}\n\n"
            f"Answer the question in detail, providing a well-reasoned and comprehensive explanation. "
            f"Highlight key features, provide examples, and mention advantages and disadvantages where applicable. "
            f"Ensure your response is grammatically correct and at least {min_words} words long. "
            f"Complete your answer and avoid repetition.\nAnswer:"
        )
    else:
        # Prompt with context
        prompt = (
            f"Question: {query}\n\n"
            f"Context:\n{context}\n\n"
            f"Answer the question using the provided context. "
            f"Focus on relevant details from the context and elaborate as needed. "
            f"Provide clear points and explanations, ensuring your response is at least {min_words} words long and grammatically correct. "
            f"Highlight key features, examples, and advantages/disadvantages where applicable. "
            f"Do not repeat the context verbatim; instead, integrate it meaningfully into the answer. "
            f"Complete your answer and avoid repetition.\nAnswer:"
        )

    # Adjust max token length dynamically
    input_length = len(generator.tokenizer(prompt)['input_ids'])
    estimated_token_count = int(min_words * 0.75)  # Approximate token count for the minimum word count
    adjusted_min_length = input_length + estimated_token_count
    adjusted_max_length = input_length + max_words

    # Generate response with parameters to ensure clarity and completeness
    response = generator(
        prompt,
        max_length=adjusted_max_length,
        min_length=int(adjusted_min_length),
        temperature=0.7,         # Controls randomness
        top_p=0.85,               # Nucleus sampling for diversity
        top_k=40,                # Limits sampling to top 50 tokens
        repetition_penalty=1.8,  # Penalizes repeated phrases
        truncation=True,
        num_return_sequences=1
    )

    # Extract and clean the generated text
    generated_text = response[0]["generated_text"]


    return generated_text


In [125]:
structured_query = "What is the structure of DNA?"
structured_answer = generate_structured_answer(structured_query, k=3, max_words=100)
print("Structured Answer:")
print(structured_answer)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured Answer:
Question: What is the structure of DNA?

Context:
Q: write three main specific characteristics which DNA acting as vital genetic materials in organisms?
A: Accurate replication 
Store hereditary information 
Transmission from one generation to the other 
Q: What are the characteristics of DNA molecule for acting as a vital genetic material in organisms?
A: Accurate replication of DNA. 
Its transmission from one generation to the other. 
Ability to store heredity information. 
Ability to express heredity information.
Q: Explain how structure of DNA differs from structure of RNA.
A: The structure of DNA (Deoxyribonucleic Acid) and RNA (Ribonucleic Acid) exhibit fundamental differences that are essential for their distinct biological roles in living organisms.

DNA is a polymer made up of deoxyribonucleotides, while RNA consists of ribonucleotides. These nucleotides are the building blocks of both molecules, but their chemical composition differs slightly. In DNA, the s

In [126]:
structured_query = "State two differences between effector cells and memory cells?"
structured_answer = generate_structured_answer(structured_query, k=3, max_words=100)
print("Structured Answer:")
print(structured_answer)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured Answer:
Question: State two differences between effector cells and memory cells?

Context:
Q: State two differences between effector cells and memory cells?
A: Effector cells have a short life span while memory cells have a long life span.
Effector cells create primary immunity responses while memory cells create secondary immunity responses.
Q: State the basic features that all cells share?
A: Al cells are bounded by a plasma membrane, which is selective barrier. 
Within the cell have a semifluid, jelly like substance which is called cytosol. 
Subcellular components are suspended within the cytosol. 
They carry DNA as genetic materials. 
Ribosomes are found in all cells. 
Q: Which cell types compose the nerve tissue?
A: Neurons 
Glial cells 

Answer the question concisely and accurately in 1-2 sentences.
Answer: Neurons are the only cell type that is found in the central nervous system. Neurons are the only cell type that is found in the central nervous system. Neurons are 

In [109]:
structured_query = "State the respiratory structures found in Scorpion"
structured_answer = generate_structured_answer(structured_query, k=3, max_words=50)
print("Structured Answer:")
print(structured_answer)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured Answer:
Question: State the respiratory structures found in Scorpion

Context:
Q: State the respiratory structures found in Scorpion
A: Book lung
Q: Name the respiratory structure of the Scorpion
A: Book lung
Q: Name the respiratory structure Scorpion:
A: Book lung

Answer the question concisely and accurately in 1-2 sentences.
Answer: Book lung


In [74]:
essay_query = "Explain how the evolution of C4 path way established to minimize the photo respiration and describe the C4 path way."
essay_answer = generate_essay_answer(essay_query, k=5, max_words=300)
print("Essay Answer:")
print(essay_answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Essay Answer:
Question: Explain how the evolution of C4 path way established to minimize the photo respiration and describe the C4 path way.

Context:
Q: Explain how the evolution of C4 path way established to minimize the photo respiration and describe the C4 path way.
A: The evolution of the C4 pathway represents a remarkable adaptation in plants, particularly in response to the challenges posed by high temperatures, intense light, and dry climates. In C3 plants, these environmental conditions often lead to a wasteful process known as photorespiration, where the enzyme ribulose bisphosphate carboxylase/oxygenase (RuBisCO) reacts with oxygen instead of carbon dioxide (CO2), resulting in a significant loss of carbon and energy for the plant (1-4). To mitigate this issue, certain plants have developed alternative mechanisms to concentrate CO2 around RuBisCO, leading to the establishment of the C4 pathway (5-6).

The C4 pathway is characterized by a series of anatomical and biochemical m

In [123]:
structured_query = "What are the main sources of drinking water contamination?"
structured_answer = generate_structured_answer(structured_query, k=3, max_words=50)
print("Structured Answer:")
print(structured_answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured Answer:
Question: What are the main sources of drinking water contamination?

Context:
Q: What are the major portals of pathogenic microorganisms in to human body ?
A: Gastrointestinal tract
Repiratory tract
Genito - urinary tract 
Wounds on skin
Q: What are the major functions of water in human saliva?
A: Liquify food
Provide watery medium for chemical digestion
Aids in taste reception
Q: Name the factors which affects water potential ?
A: Solute concentration 
Applied pressure 

Answer the question concisely and accurately in 1-2 sentences.
Answer: The main sources of drinking water contamination are the following:
1. Human sewage
2. Rainwater
3. Surface water
4. Wastewater from industrial and domestic sources

Q: What are the main sources of drinking water contamination?
A


In [122]:
structured_query = "What are the main steps in urban water treatment?"
structured_answer = generate_structured_answer(structured_query, k=3, max_words=50)
print("Structured Answer:")
print(structured_answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Structured Answer:
Question: What are the main steps in urban water treatment?

Context:
Q: What happens during the primary treatment stage of wastewater treatment?
A: Large floating materials, sand, oil, and grease are removed; no biological activity takes place, 25-35% of organic matter is removed, and solid matter settles.
Q: Briefly explain how filtration step is done in the process of urban drinking water treatment.
A: Water is passed through the sand beds fine sand 
surface absorption on to the sand particles 
(additionally) use of activated carbon for the removal of toxic chemicals
Q: State the major method of water entering from soil solution to root hair cells?
A: Osmosis

Answer the question concisely and accurately in 1-2 sentences.
Answer: Osmosis
Explanation: 
The major method of water entering from soil solution to root hair cells is osmosis. Osmosis is a process by which water moves from a region of higher solute concentration to a region of lower solute


In [66]:
essay_query = "Explain the structure and function of the human ear."
essay_answer = generate_essay_answer(essay_query, k=5, max_words=300)
print("Essay Answer:")
print(essay_answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Essay Answer:
Question: Explain the structure and function of the human ear.

Answer the question in detail, providing a well-reasoned and comprehensive explanation. Highlight key features, provide examples, and mention advantages and disadvantages where applicable. Ensure your response is grammatically correct and at least 175 words long. Complete your answer and avoid repetition.
Answer: The outer part or auricle contains several sensory structures that detect sounds from various directions such as pinnae (the visible external parts) which are made up by two lobes called antitragus lobe & posterolateral region known collectively referred to simply as 'pinne'. It has four muscles namely superior transverse muscle, inferior longitudinal muscule
The middle section also consists ot three semicircular canals with their respective cupula's on either side alongwith utricular membrane inside it these all work together forming an organ for balance& stabilization.
It receives information about

In [75]:
essay_query = "Explain the differences between open and closed circulatory systems."
essay_answer = generate_essay_answer(essay_query, k=5, max_words=300)
print("Essay Answer:")
print(essay_answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Essay Answer:
Question: Explain the differences between open and closed circulatory systems.

Answer the question in detail, providing a well-reasoned and comprehensive explanation. Highlight key features, provide examples, and mention advantages and disadvantages where applicable. Ensure your response is grammatically correct and at least 175 words long. Complete your answer and avoid repetition.
Answer: The difference lies mainly with oxygen supply to cells from blood through vessels that are enclosed within an organ or tissue called as vascular system (blood vessel). In contrast,
in case of animals which have no internal organs like heart etc., there exists only one type
of circulation i.e.the arterial-venous circuit wherein deoxygenated red corpuscles reach their destination by way

#1 Open Circulation System:

• It consists on many small tubes known collectively referred “capillaries”.
These capillary networks form network-like structure throughout body tissues including all major