In [46]:
# !wget "https://storage.googleapis.com/ds--tasks-datasets/renewable-energy.zip"

In [47]:
# import zipfile
# with zipfile.ZipFile("renewable-energy.zip","r") as zip_ref:
#     zip_ref.extractall("renewable-energy")

In [48]:
# !pip install rank_bm25
# !pip install nltk
# !pip install torch transformers accelerate
# !pip install sentencepiece

In [49]:
# import nltk
# nltk.download('punkt_tab')

In [50]:
import os
import json
import glob
from rank_bm25 import *
from nltk.tokenize import word_tokenize
import torch

In [51]:
def parse_documents(documents_folder):
    document_texts = {}
    for file_path in glob.glob(os.path.join(documents_folder, "*.txt")):
        with open(file_path, 'r') as file:
            doc_id = os.path.basename(file_path).split('.')[0]
            document_texts[doc_id] = file.read()
    return document_texts

def build_index(documents_folder):
    documents = parse_documents(documents_folder)
    corpus = [word_tokenize(doc.lower()) for doc in documents.values()]
    bm25 = BM25Okapi(corpus)
    index = {"bm25": bm25, "document_ids":list(documents.keys()), "corpus": corpus}
    return index

def retrieve_documents(question, index):
    tokenized_query = word_tokenize(question.lower())
    bm25 = index['bm25']
    scores = bm25.get_scores(tokenized_query)
    top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
    top_docs = [index['document_ids'][i] for i in top_indices]
    return top_docs

In [52]:
index = build_index(documents_folder="renewable-energy/renewable-energy")

top_docs = retrieve_documents("what is the impact of renewable energe?", index)
print(top_docs)

['30', '16', '28']


In [53]:
from transformers import AutoModelForCausalLM, AutoTokenizer 

# Specify a custom cache directory
model_path = "./Mistral-7B"

# # Download the model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.3", cache_dir=model_path)
# model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.3", cache_dir=model_path)


In [54]:

# Load Mistral model from local directory (once downloaded)
tokenizer = AutoTokenizer.from_pretrained('Mistral-7B/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff')
model = AutoModelForCausalLM.from_pretrained('Mistral-7B/models--mistralai--Mistral-7B-v0.3/snapshots/d8cadc02ac76bd617a919d50b092e59d2d110aff', torch_dtype="float16")

Loading checkpoint shards: 100%|██████████| 3/3 [00:32<00:00, 10.95s/it]


In [56]:
# Example usage with text generation
from transformers import pipeline

# Create a text generation pipeline
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device='cpu')

# Generate a response to a query
response = generator("Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?", max_length=100)
print(response[0]['generated_text'])


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?

The geothermal industry is growing rapidly, with the International Energy Agency (IEA) predicting that the global installed capacity of geothermal power plants will increase by 10% per year until 2020.

The IEA also predicts that geothermal power will account for 11% of the world’s electricity by 205


In [57]:
def answer_question(question, index):
    # Retrieve top documents based on the query
    top_docs = retrieve_documents(question, index)

    # Combine content of the top documents and truncate if necessary
    combined_docs = ""
    for doc_id in top_docs:
        with open(f"renewable-energy/renewable-energy/{doc_id}.txt", 'r') as file:
            combined_docs += file.read() + "\n"

    # Ensure combined_docs is not too long
    if len(combined_docs.split()) > 500:  # Adjust the number according to the model's capability
        combined_docs = ' '.join(combined_docs.split()[:500])  # Truncate to the first 500 words

    # Use Mistral 7B to generate an answer
    prompt = f"Question: {question}\nDocuments: {combined_docs}\nAnswer:"
    response = generator(prompt, max_new_tokens=100)  # Use max_new_tokens instead of max_length

    return {"Answer": response[0]['generated_text'], "SourceDoc": top_docs}



In [58]:
# response = answer_question("what is the impact of renewable energe?", index)
# response = answer_question("Which advancements demonstrate that hydropower is not a stagnant source?", index)
# response = answer_question("Where does the future of solar lie?", index)
response = answer_question("Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?", index)


print(json.dumps(response))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{"Answer": "Question: Which enhanced systems are unlocking geothermal potential in areas with naturally low permeability?\nDocuments: ## Geothermal: Unveiling Earth's Hidden Heat Deep beneath our feet lies a vast, untapped reservoir of renewable energy \u2013 geothermal heat. Unlocking this potential is where new exploration techniques and enhanced geothermal systems (EGS) are making a difference. **Drilling deeper, smarter:** Traditional drilling methods can be expensive and limited in reaching deeper, hotter geothermal resources. New techniques like directional drilling and slim-hole drilling are offering more cost-effective and precise ways to access these valuable sources. Additionally, advancements in seismic imaging are providing a clearer picture of the subsurface, allowing for more targeted drilling and reducing exploration risks. **Boosting the potential:** Enhanced geothermal systems (EGS) are unlocking geothermal potential in areas with naturally low permeability. By injecti

In [59]:
from nltk.translate.bleu_score import sentence_bleu

def evaluate_answer(reference_answer, generated_answer):
    bleu_score = sentence_bleu([reference_answer.split()], generated_answer.split())
    return {"Score": bleu_score, "Feedback": "The answer is relatively close to the reference answer based on BLEU."}

In [60]:
# Example usage
answer= """## Geothermal: Unveiling Earth's Hidden Heat
Deep beneath our feet lies a vast, untapped reservoir of renewable energy – geothermal heat. Unlocking this potential is where new exploration techniques and enhanced geothermal systems (EGS) are making a difference.
**Drilling deeper, smarter:** 
Traditional drilling methods can be expensive and limited in reaching deeper, hotter geothermal resources. New techniques like directional drilling and slim-hole drilling are offering more cost-effective and precise ways to access these valuable sources. Additionally, advancements in seismic imaging are providing a clearer picture of the subsurface, allowing for more targeted drilling and reducing exploration risks.
**Boosting the potential:** 
Enhanced geothermal systems (EGS) are unlocking geothermal potential in areas with naturally low permeability. By injecting water through fractured rock formations, EGS creates artificial reservoirs of hot water, enabling the extraction of geothermal energy even in locations previously deemed unsuitable.
These advancements are paving the way for a more sustainable future powered by geothermal energy. By efficiently accessing and utilizing Earth's natural heat, we can tap into a clean and reliable source of renewable energy for generations to come."""


evaluation = evaluate_answer(answer, response["Answer"])
print(evaluation)

{'Score': 0.3105806097102945, 'Feedback': 'The answer is relatively close to the reference answer based on BLEU.'}
