In [None]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
# NOTE(review): `datasets` is already installed by the first cell, so this cell looks redundant.
!pip install datasets



In [None]:
# Mount Google Drive at /content/drive so the corpus file stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pin `datasets` to 1.18.0.
# NOTE(review): per the recorded output this uninstalls the 3.1.0 build installed earlier;
# confirm the pin is still needed.
!pip install datasets==1.18.0

Collecting datasets==1.18.0
  Downloading datasets-1.18.0-py3-none-any.whl.metadata (22 kB)
Downloading datasets-1.18.0-py3-none-any.whl (311 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.3/311.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.1.0
    Uninstalling datasets-3.1.0:
      Successfully uninstalled datasets-3.1.0
Successfully installed datasets-1.18.0


In [None]:
# Fine-tuning BERT for extractive question answering

In [None]:
#Required libraries
import json
import torch
import numpy as np
import logging
import os
from torch.utils.data import Dataset, random_split
from transformers import BertForQuestionAnswering, BertTokenizerFast, TrainingArguments, Trainer, TrainerCallback
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score


# Set up logging: INFO-level root configuration plus the module-level `logger`
# that the functions below write to
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the corpus
def load_corpus(file_path):
    """Read the JSON corpus stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to load.

    Returns:
        The parsed JSON content, or ``None`` when the file cannot be read or
        parsed (the error is logged instead of being raised).
    """
    logger.info("Loading corpus...")
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            corpus = json.load(f)
        logger.info(f"Successfully loaded corpus with {len(corpus)} documents.")
        return corpus
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: invalid JSON or bad encoding;
        # TypeError: top-level JSON value without a length.
        logger.error(f"Error loading corpus: {str(e)}")
        return None

# Prepare the dataset
class PEDDataset(Dataset):
    """Extractive-QA training examples built from the corpus sections.

    One example is created for every section with a non-empty ``'section'`` name:
    the question is ``"What is <section name>?"``, the context is all of the
    document's ``'processed_text'`` values joined by spaces, and the answer is
    that section's own ``'processed_text'``. Examples whose answer span is not
    fully inside the truncated, tokenized context are dropped.
    """

    def __init__(self, corpus, tokenizer, max_length=512):
        """Tokenize every (question, context) pair and locate the answer tokens.

        Args:
            corpus: Iterable of documents; each must have a ``'sections'`` list
                whose items carry ``'section'`` and ``'processed_text'`` keys.
            tokenizer: Tokenizer that supports ``return_offsets_mapping`` and
                ``sequence_ids`` (i.e. a "fast" tokenizer).
            max_length: Length every encoded pair is padded/truncated to.
        """
        self.examples = []
        for doc in corpus:
            context = ' '.join([section['processed_text'] for section in doc['sections']])
            for section in doc['sections']:
                if not section['section']:
                    continue

                answer = section['processed_text']
                if not answer:
                    # An empty answer has no token span to point at.
                    continue
                question = f"What is {section['section']}?"

                inputs = tokenizer.encode_plus(
                    question, context,
                    max_length=max_length,
                    truncation='only_second',
                    return_offsets_mapping=True,
                    padding='max_length',
                    return_tensors='pt'
                )

                input_ids = inputs['input_ids'].squeeze()
                attention_mask = inputs['attention_mask'].squeeze()
                offset_mapping = inputs['offset_mapping'].squeeze().tolist()
                # 0 = question token, 1 = context token, None = special/padding token
                sequence_ids = inputs.sequence_ids(0)

                start_char = context.find(answer)
                end_char = start_char + len(answer)

                start_token, end_token = None, None
                for idx, (start, end) in enumerate(offset_mapping):
                    # Offsets of question tokens are relative to the question string,
                    # so only context tokens may be compared with context positions.
                    if sequence_ids[idx] != 1:
                        continue
                    if start <= start_char < end:
                        start_token = idx
                    if start < end_char <= end:
                        end_token = idx
                    if start_token is not None and end_token is not None:
                        break

                if start_token is not None and end_token is not None:
                    # NOTE(review): token_type_ids are not stored, so the model is
                    # trained with its default segment ids — confirm this is intended.
                    self.examples.append({
                        'input_ids': input_ids,
                        'attention_mask': attention_mask,
                        'start_positions': start_token,
                        'end_positions': end_token
                    })
        logger.info(f"Created dataset with {len(self.examples)} examples.")

    def __len__(self):
        """Number of examples that were kept."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the example dict stored at position ``idx``."""
        return self.examples[idx]

# Check if CUDA (GPU) is available, otherwise use CPU.
# `device` is a module-level global read by answer_question() and train_model().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Function to compute metrics
def compute_metrics(eval_pred):
    """Score predicted start/end token positions against the labelled ones.

    ``eval_pred`` unpacks into ``(predictions, labels)``, where predictions is
    ``(start_logits, end_logits)`` and labels is ``(start_true, end_true)``.
    The predicted position is the argmax of the logits along axis 1. Each
    position is treated as a class and scored with macro-averaged precision,
    recall and F1. ``"f1"`` is the mean of the start and end F1 scores.
    """
    (start_logits, end_logits), (start_true, end_true) = eval_pred

    predicted = {
        "start": np.argmax(start_logits, axis=1),
        "end": np.argmax(end_logits, axis=1),
    }
    expected = {"start": start_true, "end": end_true}
    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    # Insertion order matches the reported column order: all start metrics, then all end metrics.
    metrics = {}
    for side in ("start", "end"):
        for label, scorer in scorers:
            metrics[f"{label}_{side}"] = scorer(
                expected[side], predicted[side], average='macro', zero_division=0
            )

    metrics["f1"] = (metrics["f1_start"] + metrics["f1_end"]) / 2
    return metrics
# Helper functions for QA
def thematic_chunking(context, documents=None):
    """Return the ``'processed_text'`` of every section of every document.

    Args:
        context: Unused. The chunks come from the structured documents, not from
            this string; the parameter is kept so existing callers keep working.
        documents: Optional iterable of documents, each with a ``'sections'``
            list. When omitted, the module-level ``corpus`` is used (the previous
            behaviour), so that global must exist in that case.

    Returns:
        A list with one text chunk per section, in document order.
    """
    if documents is None:
        documents = corpus
    return [section['processed_text'] for doc in documents for section in doc['sections']]

# Function to return the top N passages
def filter_relevant_chunks(question, chunks, top_n=5):
    """Return up to ``top_n`` chunks, best keyword match first.

    The question is lower-cased and split on whitespace; punctuation is stripped
    from both ends of every word (so ``"sustainability?"`` matches
    ``"sustainability"``). A chunk's score is the number of distinct keywords
    that occur in it as substrings. Chunks with score 0 are dropped, the rest are
    ordered by score (ties keep their original order).

    Args:
        question: The question text.
        chunks: List of candidate text chunks.
        top_n: Maximum number of chunks to return.

    Returns:
        The ``top_n`` best-matching chunks, or ``chunks[:top_n]`` when no chunk
        matches any keyword.
    """
    import string

    keywords = {word.strip(string.punctuation) for word in question.lower().split()}
    keywords.discard("")  # an empty keyword would match every chunk

    scored = []
    for chunk in chunks:
        lowered = chunk.lower()  # lower-case once per chunk, not once per keyword
        hits = sum(1 for keyword in keywords if keyword in lowered)
        if hits:
            scored.append((hits, chunk))

    if not scored:
        return chunks[:top_n]

    # list.sort is stable, so equally scored chunks stay in corpus order.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [chunk for _, chunk in scored[:top_n]]

# Function to answer questions using the fine-tuned model
def answer_question(question, context, model, tokenizer):
    """Answer ``question`` with the best-scoring span over the relevant chunks.

    The chunks come from ``thematic_chunking`` and ``filter_relevant_chunks``.
    Every chunk is printed and encoded together with the question. The highest
    scoring span (start logit + end logit) is taken, subject to two constraints
    that an independent argmax over start and end logits does not enforce:
    the span must lie inside the chunk (not in the question or on ``[SEP]``),
    and it must not end before it starts.

    Args:
        question: The question text.
        context: Passed through to ``thematic_chunking``.
        model: Question-answering model returning ``start_logits`` / ``end_logits``.
        tokenizer: BERT-style tokenizer; its encoding must include
            ``token_type_ids`` (segment 1 marks the chunk tokens).

    Returns:
        The decoded text of the best span over all chunks, or ``""`` if no chunk
        produced a candidate.
    """
    thematic_chunks = thematic_chunking(context)
    relevant_chunks = filter_relevant_chunks(question, thematic_chunks)

    best_answer = ""
    best_score = float("-inf")

    print(f"Relevant passages for the question '{question}':\n")
    for idx, chunk in enumerate(relevant_chunks):
        print(f"Passage {idx + 1}: {chunk}\n")

        inputs = tokenizer.encode_plus(
            question, chunk,
            max_length=512,
            truncation=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        # Tokens allowed to start or end an answer: segment 1 without the closing [SEP].
        in_chunk = (inputs['token_type_ids'][0] == 1) & (inputs['input_ids'][0] != tokenizer.sep_token_id)
        if not in_chunk.any():
            continue  # empty chunk: nothing to extract

        start_logits = outputs.start_logits[0].masked_fill(~in_chunk, float("-inf"))
        end_logits = outputs.end_logits[0].masked_fill(~in_chunk, float("-inf"))

        # span_scores[i, j] = start_logits[i] + end_logits[j]; spans with i > j are invalid.
        seq_len = start_logits.size(0)
        positions = torch.arange(seq_len, device=start_logits.device)
        span_scores = start_logits[:, None] + end_logits[None, :]
        span_scores = span_scores.masked_fill(positions[:, None] > positions[None, :], float("-inf"))

        answer_start, answer_end = divmod(int(torch.argmax(span_scores)), seq_len)
        score = span_scores[answer_start, answer_end].item()

        if score > best_score:
            best_score = score
            best_answer = tokenizer.decode(inputs['input_ids'][0][answer_start:answer_end + 1])

    return best_answer

# Custom Trainer with epoch-wise logging of training loss
class CustomTrainer(Trainer):
    """Trainer that prints the logged training losses once training has finished."""

    def train(self, *args, **kwargs):
        """Run the parent ``train`` and then print every ``'loss'`` value in the log history.

        All arguments are forwarded unchanged and the parent's result is returned.
        """
        print("Starting training...")
        result = super().train(*args, **kwargs)

        # Keep only the log records that carry a training loss
        history = getattr(self.state, 'log_history', [])
        losses = [record['loss'] for record in history if 'loss' in record]

        print(f"Epoch-wise training loss: {losses}")
        return result

# Callback to log loss at the end of each epoch
class LogEpochLossCallback(TrainerCallback):
    """Logs the most recent training loss whenever an epoch ends."""

    def on_epoch_end(self, args, state, control, **kwargs):
        """Log the newest ``'loss'`` found in ``state.log_history``.

        The history is searched from the end. The last record is not necessarily
        a training record (evaluation records carry ``'eval_loss'`` instead), and
        the history may still be empty when the first epoch ends.
        ``'Not logged'`` is reported when no training loss exists yet.
        """
        last_loss = next(
            (record['loss'] for record in reversed(state.log_history) if 'loss' in record),
            'Not logged',
        )
        logger.info(f"Epoch {state.epoch} completed. Loss: {last_loss}")

# Fine-tune BERT
def train_model(train_dataset, val_dataset, model_save_path):
    """Fine-tune ``bert-base-uncased`` for question answering and save the result.

    Trains for 6 epochs with logging, evaluation and checkpointing once per
    epoch, reloads the best checkpoint at the end, runs a final evaluation and
    writes the model to ``model_save_path``.

    Args:
        train_dataset: Dataset used for training.
        val_dataset: Dataset used for the per-epoch and final evaluation.
        model_save_path: Directory the fine-tuned model is saved to.

    Returns:
        ``(model, trainer)`` — the fine-tuned model and the trainer that produced it.
    """
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
    model = model.to(device)

    # Logging, evaluation and checkpointing all happen once per epoch
    once_per_epoch = "epoch"
    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs',
        report_to="none",
        num_train_epochs=6,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        lr_scheduler_type="constant",
        warmup_steps=0,
        weight_decay=0.01,
        logging_strategy=once_per_epoch,
        logging_first_step=True,
        evaluation_strategy=once_per_epoch,
        save_strategy=once_per_epoch,
        load_best_model_at_end=True,
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )
    # Extra per-epoch loss line in the log
    trainer.add_callback(LogEpochLossCallback())

    print("Training the model...")
    train_result = trainer.train()
    print(f"Training completed. Metrics: {train_result.metrics}")

    # Final evaluation on the validation set
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    # Persist the fine-tuned weights
    model.save_pretrained(model_save_path)
    print(f"Trained model saved to {model_save_path}")

    return model, trainer

# Main execution
if __name__ == "__main__":
    # Load corpus
    corpus = load_corpus('/content/drive/MyDrive/tokenized_scopusNOGPT.json')
    if corpus is None:
        logger.error("Failed to load corpus.")
        # Stop here explicitly; `exit()` is an interactive-shell helper, not a flow-control tool.
        raise SystemExit(1)

    # Initialize tokenizer and create the dataset
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
    dataset = PEDDataset(corpus, tokenizer)

    if len(dataset) == 0:
        logger.error("The dataset is empty.")
        raise SystemExit(1)

    # Split the dataset into train (90%) and validation (10%).
    # A seeded generator makes the split, and so the reported metrics, reproducible.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42)
    )

    # Model saved to the path
    model_save_path = '/content/drive/MyDrive/fine_tuned_PED_modelx'
    model, trainer = train_model(train_dataset, val_dataset, model_save_path)
    # Save the tokenizer next to the model: the RAG cell loads both from this directory.
    tokenizer.save_pretrained(model_save_path)

    # Test questions
    context = ' '.join([section['processed_text'] for doc in corpus for section in doc['sections']])
    questions = [
    "How do Positive Energy Districts contribute to urban sustainability?",
    "What is the annual renewal generation district in Umea?",
    "What are the main challenges in implementing Positive Energy Districts?",
    "How do tools and methodologies contribute to the design of positive energy districts?",
    "In what ways do PEDs integrate data-driven design principles?",
    "Why are tools for engaging multiple stakeholders essential in PED planning?",
    "What distinguishes net-zero energy buildings from positive energy districts?",
    "How do varying scales and boundary definitions influence PED analysis?",
    "What are positive energy districts?",
    "How does building envelope retrofitting in PEDs work?",
    # The comma after the next entry was missing, which silently fused it with the
    # following question into a single string.
    "What are specific examples of how PEDs are implemented in Italy?",
    "What are the primary renewable energy technologies proposed for the transition to a Positive Energy District? in Riga?",
    "What is the purpose of multi-stakeholder engagement tools?",
    "What are the differences between net zero energy building and positive energy district?",
    "How do different scales and evaluation boundaries affect positive energy district analysis?",
    "What is involved in the planning of positive energy districts?",
    ]

    for question in questions:
        print(f"Question: {question}")
        print(f"Answer: {answer_question(question, context, model, tokenizer)}")
        print("\n" + "-"*50 + "\n")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the model...
Starting training...


Epoch,Training Loss,Validation Loss,Precision Start,Recall Start,F1 Start,Precision End,Recall End,F1 End,F1
1,5.751,4.965907,0.142857,0.061224,0.085714,0.166667,0.166667,0.166667,0.12619
2,4.489,3.810696,0.208333,0.178571,0.192308,0.272727,0.272727,0.272727,0.232517
3,3.5301,2.818832,0.285714,0.285714,0.285714,0.272727,0.272727,0.272727,0.279221
4,2.6584,1.970274,0.4375,0.5,0.466667,0.272727,0.272727,0.272727,0.369697
5,2.0142,1.869659,0.4375,0.5,0.466667,0.555556,0.555556,0.555556,0.511111
6,1.513,1.474523,0.4375,0.5,0.466667,0.4,0.4,0.4,0.433333


Epoch-wise training loss: [6.3587, 5.751, 4.489, 3.5301, 2.6584, 2.0142, 1.513]
Training completed. Metrics: {'train_runtime': 57.9281, 'train_samples_per_second': 6.629, 'train_steps_per_second': 0.829, 'total_flos': 100337954586624.0, 'train_loss': 3.3386014699935913, 'epoch': 6.0}


Evaluation results: {'eval_loss': 1.4745231866836548, 'eval_precision_start': 0.4375, 'eval_recall_start': 0.5, 'eval_f1_start': 0.4666666666666667, 'eval_precision_end': 0.4, 'eval_recall_end': 0.4, 'eval_f1_end': 0.4, 'eval_f1': 0.43333333333333335, 'eval_runtime': 0.2819, 'eval_samples_per_second': 28.378, 'eval_steps_per_second': 3.547, 'epoch': 6.0}
Trained model saved to /content/drive/MyDrive/fine_tuned_PED_modelx
Question: How do Positive Energy Districts contribute to urban sustainability?
Relevant passages for the question 'How do Positive Energy Districts contribute to urban sustainability?':

Passage 1: positive_energy_district expected play major role energy_transition city hence paper aims introducing novel methodology useful district energy environmental analysis intended support accomplishment targets positive_energy_district district community level proposed approach relies basic concepts underpinning baseline_emission_inventory encompasses ambitious challenging object

In [None]:
# Install the retrieval / evaluation packages.
# NOTE(review): per the recorded output, installing `evaluate` pulls in datasets>=2.0.0 and
# replaces the datasets==1.18.0 pin from the earlier cell — confirm which version is wanted.
!pip install transformers sentence-transformers scikit-learn numpy datasets
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached datasets-3.1.0-py3-none-any.whl (480 kB)
Installing collected packages: datasets, evaluate
  Attempting uninstall: datasets
    Found existing installation: datasets 1.18.0
    Uninstalling datasets-1.18.0:
      Successfully uninstalled datasets-1.18.0
Successfully installed datasets-3.1.0 evaluate-0.4.3


In [None]:
# Using the OpenAI GPT API

In [None]:
pip install --upgrade openai


Collecting openai
  Downloading openai-1.55.3-py3-none-any.whl.metadata (24 kB)
Downloading openai-1.55.3-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.6/389.6 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.54.4
    Uninstalling openai-1.54.4:
      Successfully uninstalled openai-1.54.4
Successfully installed openai-1.55.3


In [None]:
import openai
#### Access to API and Drive
# Read the OpenAI API key without echoing it, so the secret is not stored in the cell output
from getpass import getpass
openai.api_key = getpass("OpenAI-key")

# Mounting Google Drive:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import openai
print(openai.__version__)

!pip install openai==0.28
!pip install transformers datasets faiss-cpu sentence-transformers


0.28.0


In [None]:
#pip install --upgrade openai


In [None]:
# RAG with the chunked corpus

In [None]:
# Import required libraries
import openai
import torch
import json
from transformers import BertForQuestionAnswering, BertTokenizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Setting up the OpenAI API key: keep a key that was configured earlier in the session,
# otherwise read OPENAI_API_KEY from the environment. Never hard-code the key here:
# a literal would overwrite the key entered above and be saved with the notebook.
import os
openai.api_key = openai.api_key or os.environ.get("OPENAI_API_KEY")


# Loading the Sentence Transformer model (used to embed the corpus chunks and the queries)
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Loading the fine-tuned BERT model and tokenizer for extractive QA
# NOTE(review): the tokenizer is loaded from the same directory as the model, so that
# directory must contain the tokenizer files as well as the model weights — confirm.
model_path = "/content/drive/MyDrive/fine_tuned_PED_modelx"  # Path to the fine-tuned model
bert_model = BertForQuestionAnswering.from_pretrained(model_path)
bert_tokenizer = BertTokenizer.from_pretrained(model_path)

# Loading the corpus
with open('/content/drive/MyDrive/chunked_corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# Extracting texts and metadata for embedding.
# `texts[i]` is a chunk and `section_metadata[i]` is its (document title, section name).
# Both lists are always initialised, and a single document stored as a dict is treated
# as a one-element corpus, so the names below exist whatever the JSON layout is.
documents = corpus if isinstance(corpus, list) else [corpus]
texts = []
section_metadata = []

for document in documents:
    if isinstance(document, dict) and 'sections' in document:
        for section in document['sections']:
            for chunk in section.get('chunks', []):
                texts.append(chunk)  # Add chunk text
                section_metadata.append((document.get('title', 'Chunks'), section.get('name', 'Chunks')))  # Add metadata

# Computing the embeddings for chunked texts
if texts:
    corpus_embeddings = sentence_model.encode(texts, show_progress_bar=True)
else:
    print("No texts found for embedding.")
    # Keep the name defined even when there is nothing to embed.
    corpus_embeddings = np.empty((0, 0), dtype=np.float32)

# The function for retrieving the relevant passages based on chunks
def retrieve_relevant_passages(query, top_k=5, min_similarity=0.40):
    """Return the chunks most similar to ``query``.

    The query is embedded with ``sentence_model`` and compared with
    ``corpus_embeddings`` by cosine similarity. The ``top_k`` most similar chunks
    are kept, best first, and those at or below ``min_similarity`` are discarded.

    Args:
        query: The question text.
        top_k: Maximum number of passages to return.
        min_similarity: Similarity a passage must exceed to be returned.

    Returns:
        ``(passages, found)``. ``passages`` is a list of
        ``(text, section_metadata, similarity)`` tuples, and ``found`` is
        ``False`` exactly when that list is empty.
    """
    # Nothing to search, or nothing requested. Without this guard top_k == 0 would
    # slice `[-0:]`, which selects every index instead of none.
    if not texts or top_k <= 0:
        return [], False

    query_embedding = sentence_model.encode([query])
    similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]

    # Top k relevant passages based on similarity, most similar first
    top_indices = np.argsort(similarities)[-top_k:][::-1]

    # Get passages that have similarity above the threshold
    relevant_texts = [
        (texts[i], section_metadata[i], similarities[i])
        for i in top_indices
        if similarities[i] > min_similarity
    ]
    return relevant_texts, bool(relevant_texts)

# The function for the extractive QA using BERT
def answer_question_with_bert(question, context):
    """Extract the best answer span for ``question`` from ``context``.

    The highest scoring span (start logit + end logit) is chosen, subject to two
    constraints that an independent argmax over start and end logits does not
    enforce: the span must lie inside the context (not in the question or on
    ``[SEP]``), and it must not end before it starts.

    Args:
        question: The question text.
        context: The passage text to extract the answer from.

    Returns:
        The answer text, or ``""`` when the encoded input has no context tokens.
    """
    # A single sequence needs no padding; the pair is truncated to 512 tokens.
    inputs = bert_tokenizer.encode_plus(question, context, return_tensors='pt', max_length=512,
                                        truncation=True)
    input_ids = inputs['input_ids'].to(bert_model.device)
    attention_mask = inputs['attention_mask'].to(bert_model.device)

    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Tokens allowed to start or end an answer: segment 1 without the closing [SEP].
    segment_ids = inputs['token_type_ids'][0].to(input_ids.device)
    in_context = (segment_ids == 1) & (input_ids[0] != bert_tokenizer.sep_token_id)
    if not in_context.any():
        return ""

    start_logits = outputs.start_logits[0].masked_fill(~in_context, float("-inf"))
    end_logits = outputs.end_logits[0].masked_fill(~in_context, float("-inf"))

    # span_scores[i, j] = start_logits[i] + end_logits[j]; spans with i > j are invalid.
    seq_len = start_logits.size(0)
    positions = torch.arange(seq_len, device=start_logits.device)
    span_scores = start_logits[:, None] + end_logits[None, :]
    span_scores = span_scores.masked_fill(positions[:, None] > positions[None, :], float("-inf"))

    answer_start, answer_end = divmod(int(torch.argmax(span_scores)), seq_len)

    answer = bert_tokenizer.convert_tokens_to_string(
        bert_tokenizer.convert_ids_to_tokens(input_ids[0][answer_start:answer_end + 1])
    ).replace('[SEP]', '').replace('[CLS]', '').strip()

    return answer

# Function for the generative QA using GPT
def generate_answer_with_gpt(query, relevant_passages):
    """Ask the chat model to answer ``query`` using the retrieved passages.

    Args:
        query: The question text.
        relevant_passages: Iterable of 3-tuples whose first item is the passage
            text; the other two items are ignored here.

    Returns:
        The content of the first returned choice, stripped of surrounding whitespace.
    """
    # All passage texts, space-separated, form the context of the prompt
    passage_texts = [text for text, _, _ in relevant_passages]
    context = " ".join(passage_texts)
    input_text = f"Question: {query}\nContext: {context}\nAnswer:"

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": input_text},
    ]

    # One request to the OpenAI chat endpoint (answer capped at 700 tokens)
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=chat_messages,
        max_tokens=700,
        temperature=0.7,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

# List of questions to process.
# Every entry is sent through retrieval; the loop below prints "No relevant answer."
# for a question whose passages all fall at or below the similarity threshold.
questions = [
    "How do Positive Energy Districts contribute to urban sustainability?",
    "What are the main challenges in implementing Positive Energy Districts?",
    "which district in riga have used the wind power energy",
    "Whats the population size in borlänge?",
    "What is the annual renewal generation district in Umea?",
    "How do tools and methodologies contribute to the design of positive energy districts?",
    "In what ways do PEDs integrate data-driven design principles?",
    "Why are tools for engaging multiple stakeholders essential in PED planning?",
    "What distinguishes net-zero energy buildings from positive energy districts?",
    "How do varying scales and boundary definitions influence PED analysis?",
    "What are positive energy districts?",
    "How does building envelope retrofitting in PEDs work?",
    "What are specific examples of how PEDs are implemented in Italy?",
    "What are the primary renewable energy technologies proposed for the transition to a Positive Energy District in Riga?",
    "What is the purpose of multi-stakeholder engagement tools?",
    "What are the differences between net zero energy building and positive energy district?",
    "How do different scales and evaluation boundaries affect positive energy district analysis?",
    "What is involved in the planning of positive energy districts?",
    "What is the habitat of Polar Bears?"
]

# Answer every question with both models and show the supporting passages
for question in questions:
    relevant_passages, is_relevant = retrieve_relevant_passages(question, top_k=5)

    if not is_relevant:
        # No passage cleared the similarity threshold: report it and move on
        print(f"Question: {question}")
        print("No relevant answer.")
        print("-" * 50)
        continue

    # Extractive answer (BERT) over the concatenated passage texts
    passage_texts = [passage_text for passage_text, _, _ in relevant_passages]
    bert_context = " ".join(passage_texts[:5])
    bert_answer = answer_question_with_bert(question, bert_context)

    # Generative answer (GPT) over the same passages
    generated_answer = generate_answer_with_gpt(question, relevant_passages)

    # Both answers first, then the passages they were based on
    print(f"Question: {question}")
    print(f"Extractive Answer (BERT): {bert_answer}")
    print(f"Generated Answer (GPT): {generated_answer}")

    print("\nRelevant Passages:")
    for i, (passage, meta, similarity) in enumerate(relevant_passages, 1):
        print(f"Passage {i} (Parent Section: {meta[0]}, Section: {meta[1]}, Similarity: {similarity:.2f}): {passage}\n")

    print("-" * 50)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

Question: How do Positive Energy Districts contribute to urban sustainability?
Extractive Answer (BERT): how do positive energy districts contribute to urban sustainability ?  positive _ energy _ district could impact social economic energy environmental societal goals positive _ energy _ district integrates main paradigms smart sustainable city incrementally introduced integrated energy planning many city communities coming years suggested renovation wave environmental goals set community district level ambitious challenging characterizing positive _ energy _ district processes positive _ energy _ district designs dynamic multi scale multi disciplinary urban environment conclusions emergence positive _ energy _ district represents significant shift toward sustainable urbanism energy _ transition underscoring need comprehensive approach address multifaceted urban challenges publication delves intricate web factors influencing positive _ energy _ district development
Generated Answer (G