In [None]:
# Step 1: Install required libraries
!pip install transformers faiss-cpu sentence-transformers

# Step 2: Import necessary libraries
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering, pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Step 3: Load the context data
file_path = '/content/raw_data.txt'
# Decode explicitly instead of relying on the platform's default locale encoding.
# NOTE(review): assumes raw_data.txt is UTF-8 encoded — confirm for your data.
with open(file_path, 'r', encoding='utf-8') as file:
    context_data = file.read()

# Split the data into fixed-size character windows for retrieval.
# The split is purely positional, so a chunk may begin or end in the middle of a word.
chunk_size = 300
context_chunks = [context_data[i:i + chunk_size] for i in range(0, len(context_data), chunk_size)]

# Fail fast on an empty file: with no chunks there is nothing to embed or index,
# and the later `chunk_embeddings.shape[1]` lookup would fail with an obscure error.
if not context_chunks:
    raise ValueError(f"No text found in {file_path}; cannot build the retrieval index.")

# Step 4: Embed every context chunk with a SentenceTransformer model
embedding_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
chunk_embeddings = (
    embedding_model
    .encode(context_chunks, convert_to_tensor=True)
    .cpu()
    .detach()
    .numpy()
)

# Step 5: Build a flat L2-distance FAISS index sized to the embedding width
dimension = chunk_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
# Row i of the index corresponds to context_chunks[i].
faiss_index.add(chunk_embeddings)

# Step 6: Build the question-answering pipeline from the
# 'distilbert-base-uncased-distilled-squad' checkpoint (tokenizer + model share the same name)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-distilled-squad')
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')
qa_pipeline = pipeline(
    task='question-answering',
    model=model,
    tokenizer=tokenizer,
)

# Step 7: Define a function to retrieve and answer queries
def answer_query(query, k=3):
    """Retrieve the chunks nearest to ``query`` and extract an answer from them.

    Relies on the module-level ``embedding_model``, ``faiss_index``,
    ``context_chunks`` and ``qa_pipeline`` created earlier in the notebook.

    Args:
        query: The natural-language question to answer.
        k: Number of nearest chunks to retrieve as context (default 3).

    Returns:
        Whatever ``qa_pipeline`` returns for the question and retrieved
        context; callers in this notebook read its ``'answer'`` key.
    """
    # Encode the query and search in the FAISS index
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).cpu().detach().numpy()
    _, indices = faiss_index.search(query_embedding, k=k)

    # FAISS pads the result with -1 when the index holds fewer than k vectors.
    # Indexing a Python list with -1 would silently pull in the *last* chunk,
    # so drop the padding before building the context.
    hit_ids = [int(i) for i in indices[0] if i >= 0]

    # Combine the most relevant chunks as context
    retrieved_context = " ".join(context_chunks[i] for i in hit_ids)

    # Use DistilBERT to extract an answer. The keyword form is the documented
    # calling convention for the question-answering pipeline.
    return qa_pipeline(question=query, context=retrieved_context)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [None]:
# Step 1: Install necessary libraries for DataFrame processing
!pip install pandas openpyxl
!pip install rouge-score
!pip install memory_profiler

# Step 2: Import required libraries for DataFrame processing
import pandas as pd
import time
import psutil
from sklearn.metrics.pairwise import cosine_similarity
from rouge_score import rouge_scorer
from memory_profiler import memory_usage


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... done
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=a9ba9a8ffb9e4de0559f0571960a0be3a73f5df3194967f537731e99fa30d7bd
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Installing collected packages: memory_profiler
Successfully installed memory_profiler-0.61.0


In [None]:
# Step 3: Load the Excel file into a DataFrame
excel_file_path = '/content/Training_Dataset.xlsx'  # Update with the correct file path
df = pd.read_excel(excel_file_path)

# Fail fast if the sheet lacks the columns the evaluation loop reads
# ('Question' and 'Answer'), instead of hitting a KeyError inside that loop.
missing_columns = {'Question', 'Answer'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{excel_file_path} is missing required column(s): {sorted(missing_columns)}"
    )

In [None]:
# Function to compute ROUGE-L score between correct answer and model's answer
def compute_rouge_l(correct_answer, model_answer):
    """Return the ROUGE-L F-measure of ``model_answer`` against ``correct_answer``.

    Args:
        correct_answer: Reference text (passed to the scorer as the target).
        model_answer: Text produced by the model (passed as the prediction).

    Returns:
        The ``fmeasure`` field of the scorer's ``"rougeL"`` result.
    """
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    return rouge.score(correct_answer, model_answer)["rougeL"].fmeasure

# Function to calculate cosine similarity between two embeddings
def calculate_cosine_similarity(query_embedding, chunk_embeddings):
    """Return the cosine-similarity result for the query against the given chunks.

    Thin wrapper that forwards both arguments, in order, to
    ``cosine_similarity`` and returns its result unchanged.
    """
    similarity_matrix = cosine_similarity(query_embedding, chunk_embeddings)
    return similarity_matrix

# Function to compute cosine similarity between actual and model answers
def calculate_answer_cosine_similarity(actual_answer, model_answer, embedding_model):
    """Return the cosine similarity between the reference and model answers.

    Args:
        actual_answer: Reference answer text.
        model_answer: Answer text produced by the model.
        embedding_model: Object whose ``encode`` method embeds a list of texts.

    Returns:
        The ``[0][0]`` entry of the ``cosine_similarity`` result for the two
        single-text embeddings.
    """
    def to_vector(text):
        # Encode one text (passed as a one-element list) and convert the result to NumPy.
        return embedding_model.encode([text], convert_to_tensor=True).cpu().detach().numpy()

    reference_vec = to_vector(actual_answer)
    predicted_vec = to_vector(model_answer)
    similarity = cosine_similarity(reference_vec, predicted_vec)
    return similarity[0][0]

# Step 4: Define a function to process each question and get the model's response, including cosine similarity, ROUGE-L, time, and memory
def process_questions(df, embedding_model, faiss_index, chunk_embeddings, k=5):
    """Answer every question in ``df`` and attach quality, latency and memory metrics.

    For each row the question is embedded, the ``k`` nearest chunks are looked
    up in ``faiss_index``, and the module-level ``qa_pipeline`` extracts an
    answer from the joined module-level ``context_chunks`` entries.

    Args:
        df: DataFrame with 'Question' and 'Answer' columns. It is modified in
            place (new columns are added) and also returned.
        embedding_model: Object whose ``encode`` method embeds a list of texts.
        faiss_index: Index exposing ``search(query_embedding, k=...)``.
        chunk_embeddings: Array of chunk embeddings, indexable by the ids the
            index returns.
        k: Number of chunks to retrieve per question (default 5).

    Returns:
        ``df`` with the columns 'Model_Response', 'Cosine_Similarity',
        'ROUGE_L_Score', 'Answer_Cosine_Similarity', 'Time_Taken_Sec' and
        'Memory_Usage_MB' added.
    """
    def process_single_question(question):
        # Retrieve context for one question, run the QA model, and time only that work.
        started = time.perf_counter()

        query_embedding = embedding_model.encode([question], convert_to_tensor=True).cpu().detach().numpy()

        # Retrieve context (top k chunks)
        _, indices = faiss_index.search(query_embedding, k=k)
        # FAISS pads the result with -1 when the index holds fewer than k vectors.
        # A -1 would silently select the *last* chunk/embedding, so drop the padding.
        hit_ids = [int(i) for i in indices[0] if i >= 0]
        top_chunk_embeddings = chunk_embeddings[hit_ids]

        # Cosine similarity between the query and each retrieved chunk
        cosine_sim = calculate_cosine_similarity(query_embedding, top_chunk_embeddings)

        # Combine the retrieved chunks into a single context string
        retrieved_context = " ".join(context_chunks[i] for i in hit_ids)

        # The keyword form is the documented calling convention for the QA pipeline.
        answer = qa_pipeline(question=question, context=retrieved_context)
        return answer, cosine_sim, time.perf_counter() - started

    responses = []
    similarities = []
    answer_cosine_similarities = []
    rouge_l_scores = []
    times = []
    memory_usages = []

    # Loop through each question in the DataFrame
    for _, row in df.iterrows():
        question = row['Question']
        correct_answer = row['Answer']  # Correct answer from the 'Answer' column

        # Peak memory is sampled by memory_profiler around the call. Elapsed time is
        # measured inside the callable so it excludes the profiler's own overhead,
        # which the previous outer time.time() span included.
        mem_usage, (answer, cosine_sim, elapsed) = memory_usage(
            (process_single_question, (question,), {}),
            max_usage=True,
            retval=True
        )

        # Score the model's answer against the reference answer
        model_answer = answer['answer']
        rouge_l_score = compute_rouge_l(correct_answer, model_answer)
        answer_cosine_sim = calculate_answer_cosine_similarity(correct_answer, model_answer, embedding_model)

        # Store the results
        responses.append(model_answer)
        similarities.append(cosine_sim.mean())  # Average similarity over the retrieved chunks
        rouge_l_scores.append(rouge_l_score)
        answer_cosine_similarities.append(answer_cosine_sim)
        times.append(elapsed)  # Time spent retrieving context and answering
        memory_usages.append(mem_usage)  # Peak memory usage during the query

    # Add the results as new columns in the DataFrame
    df['Model_Response'] = responses
    df['Cosine_Similarity'] = similarities
    df['ROUGE_L_Score'] = rouge_l_scores
    df['Answer_Cosine_Similarity'] = answer_cosine_similarities
    df['Time_Taken_Sec'] = times
    df['Memory_Usage_MB'] = memory_usages
    return df


In [None]:
# Step 5: Process the questions and save the responses to the new columns.
# embedding_model, faiss_index, chunk_embeddings and context_chunks are reused from the
# setup cell at the top of the notebook. This cell used to rebuild them from the same
# inputs, which only repeated the model load and the indexing.
df_with_responses = process_questions(df, embedding_model, faiss_index, chunk_embeddings)

# Step 6: Save the updated DataFrame to a new Excel file
output_file_path = '/content/questions_with_responses_and_metrics.xlsx'
df_with_responses.to_excel(output_file_path, index=False)

# Optional: Display the DataFrame with the new columns.
# A bare expression is rendered only when it is the last statement of a cell,
# so call display() (provided by the notebook runtime) explicitly.
display(df_with_responses.head())

# Optional: Download the new Excel file if you're in Google Colab
try:
    from google.colab import files
except ImportError:
    # Outside Colab there is no browser download helper; the file is already on disk.
    print(f"Not running in Google Colab; results saved to {output_file_path}")
else:
    files.download(output_file_path)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Ask one ad-hoc question and print only the extracted answer text
sample_question = "What is the maximum fine for contravening the provisions of the Act?"
answer = answer_query(query=sample_question)
print(answer['answer'])

$300.
