In [78]:
# !pip install ollama

In [80]:
# !ollama pull mistral

In [81]:
import os
import json
import glob
from rank_bm25 import *
from nltk.tokenize import word_tokenize


In [82]:
def parse_documents(documents_folder):
    """Read every ``*.txt`` file in a folder into a dict keyed by document id.

    The document id is the file name with only its final extension removed,
    so ``12.txt`` maps to ``"12"`` and ``report.v2.txt`` maps to
    ``"report.v2"``. Appending ``.txt`` to an id therefore always yields the
    original file name.

    Args:
        documents_folder: Directory to scan (not recursive).

    Returns:
        dict mapping document id (str) to the file's full text (str),
        inserted in sorted file-path order. Empty when nothing matches.
    """
    document_texts = {}
    # glob yields paths in an unspecified order; sorting keeps the id order
    # (and thus downstream tie-breaking) reproducible across runs and machines.
    for file_path in sorted(glob.glob(os.path.join(documents_folder, "*.txt"))):
        # splitext strips only the last extension, unlike split('.')[0],
        # which would collapse "a.b.txt" and "a.c.txt" into the same id "a".
        doc_id = os.path.splitext(os.path.basename(file_path))[0]
        # Explicit encoding avoids depending on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            document_texts[doc_id] = file.read()
    return document_texts

def build_index(documents_folder):
    """Build a BM25 index over the ``.txt`` documents found in a folder.

    Each document is lower-cased and tokenized with ``word_tokenize``; the
    resulting token lists form the corpus handed to ``BM25Okapi``.

    Args:
        documents_folder: Directory passed on to ``parse_documents``.

    Returns:
        dict with three entries:
            "bm25": the ``BM25Okapi`` instance built from the corpus,
            "document_ids": list of document ids,
            "corpus": list of token lists, position-aligned with
                "document_ids".

    Raises:
        ValueError: If the folder yields no documents.
    """
    documents = parse_documents(documents_folder)
    if not documents:
        # An empty corpus makes rank_bm25 fail with an opaque
        # ZeroDivisionError (it averages over the corpus size); a wrong
        # folder path should produce an actionable message instead.
        raise ValueError(
            f"No .txt documents found in {documents_folder!r}; cannot build a BM25 index."
        )

    # Derive ids and corpus from the same sequence so positions always match.
    document_ids = list(documents)
    corpus = [word_tokenize(documents[doc_id].lower()) for doc_id in document_ids]
    bm25 = BM25Okapi(corpus)
    return {"bm25": bm25, "document_ids": document_ids, "corpus": corpus}

def retrieve_documents(question, index, top_k=3):
    """Return the ids of the documents that score highest for a question.

    Args:
        question: Query text; it is lower-cased and tokenized with
            ``word_tokenize`` before scoring.
        index: dict providing "bm25" (an object exposing
            ``get_scores(tokens)``) and "document_ids" (ids aligned
            position-by-position with the returned scores).
        top_k: Maximum number of ids to return. Defaults to 3, the value
            that used to be hard-coded.

    Returns:
        list of document ids ordered from highest to lowest score, holding
        at most ``top_k`` entries. Equal scores keep their index order.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        # A negative slice bound would silently drop items from the tail
        # instead of limiting the result size.
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    tokenized_query = word_tokenize(question.lower())
    scores = index['bm25'].get_scores(tokenized_query)
    document_ids = index['document_ids']

    # sorted() is stable, so ties are resolved by original document position.
    ranked_positions = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    return [document_ids[i] for i in ranked_positions[:top_k]]

In [83]:
# Build the BM25 index once; the `index` variable is reused by later cells.
index = build_index(documents_folder="renewable-energy/renewable-energy")

# Smoke-test retrieval: print the ids of the top-ranked documents for a sample question.
top_docs = retrieve_documents("Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?", index)
print(top_docs)

['12', '10', '11']


In [84]:
import ollama

def answer_question(question, index, documents_folder="renewable-energy/renewable-energy", model='mistral'):
    """Answer a question with an Ollama chat model, using retrieved documents as context.

    The top-ranked documents for the question are looked up with
    ``retrieve_documents``, their files are read from ``documents_folder``
    and concatenated, and the result is embedded in a single user prompt.

    Args:
        question: The question to answer.
        index: Retrieval index passed through to ``retrieve_documents``.
        documents_folder: Directory holding ``<doc_id>.txt`` files. The
            default is the path that used to be hard-coded here; pass the
            folder the index was built from when it differs.
        model: Name of the Ollama model to query. Defaults to 'mistral'.

    Returns:
        dict with "Answer" (the model's reply text, or a fallback message
        when no reply was produced or the call failed) and "SourceDoc" (the
        list of retrieved document ids).

    Raises:
        OSError: If a retrieved document's file cannot be opened.
    """
    # Retrieve top documents based on the query
    top_docs = retrieve_documents(question, index)

    # Concatenate the full text of every retrieved document, one after the
    # other. No truncation is applied, so very long documents produce a
    # correspondingly long prompt.
    doc_texts = []
    for doc_id in top_docs:
        doc_path = os.path.join(documents_folder, f"{doc_id}.txt")
        with open(doc_path, 'r', encoding='utf-8') as file:
            doc_texts.append(file.read() + "\n")
    combined_docs = "".join(doc_texts)

    prompt = (
        f"Please answer the following question based on the provided documents.\n\n"
        f"Question: {question}\n\n"
        f"Documents:\n{combined_docs}\n\n"
    )

    try:
        response = ollama.chat(
            model=model,
            messages=[{'role': 'user', 'content': prompt}]
        )
        # Extract answer content if response is correctly structured
        answer_text = response['message']['content'] if response and 'message' in response else "No answer generated."

    except Exception as e:
        # Deliberate best-effort: report the failure and still return the
        # retrieved sources rather than aborting the notebook run.
        print(f"An error occurred: {e}")
        answer_text = "Error in generating response."

    # Return answer along with source document references
    return {"Answer": answer_text, "SourceDoc": top_docs}

In [85]:
# Alternative sample questions; uncomment one of these lines to try it instead.
# response = answer_question("what is the impact of renewable energy?", index)
# response = answer_question("Which advancements demonstrate that hydropower is not a stagnant source?", index)
# response = answer_question("Where does the future of solar lie?", index)
response = answer_question("Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?", index)


# Serialize the result dict (answer text plus source document ids) as JSON for display.
print(json.dumps(response))


{"Answer": " The documents provided mention that Enhanced Geothermal Systems (EGS) are unlocking geothermal potential in areas with naturally low permeability. EGS works by injecting water through fractured rock formations, creating artificial reservoirs of hot water, enabling the extraction of geothermal energy even in locations previously deemed unsuitable.", "SourceDoc": ["12", "10", "11"]}


In [86]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_answer(reference_answer, generated_answer, close_threshold=0.2):
    """Score a generated answer against a reference answer with sentence-level BLEU.

    Both answers are split on whitespace; the reference is passed to
    ``sentence_bleu`` as the single reference and the generated answer as
    the hypothesis.

    Args:
        reference_answer: The expected answer text.
        generated_answer: The answer text to evaluate.
        close_threshold: Minimum BLEU score for the answer to be reported
            as close to the reference. This is a heuristic cut-off, not a
            calibrated value; tune it for your data.

    Returns:
        dict with "Score" (the BLEU score) and "Feedback" (a short verdict
        that depends on whether the score reaches ``close_threshold``).
    """
    bleu_score = sentence_bleu([reference_answer.split()], generated_answer.split())

    # The feedback used to claim closeness unconditionally, even for a
    # score of 0; tie the wording to the score that was actually computed.
    if bleu_score >= close_threshold:
        feedback = "The answer is relatively close to the reference answer based on BLEU."
    else:
        feedback = "The answer differs substantially from the reference answer based on BLEU."
    return {"Score": bleu_score, "Feedback": feedback}

In [87]:
# Reference (expected) answer for the geothermal question asked above.
answer = "Enhanced Geothermal Systems (EGS) are unlocking geothermal potential in areas with naturally low permeability"

# Score the generated answer (`response` from the earlier cell) against the reference.
evaluation = evaluate_answer(answer, response["Answer"])
print(evaluation)

{'Score': 0.25728025143580546, 'Feedback': 'The answer is relatively close to the reference answer based on BLEU.'}
