In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found there.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Dependencies
--

In [None]:
!pip install langchain langchain-community faiss-cpu transformers pypdf langchain_groq
!pip install sentence_transformers
!pip install groq


API KEYS
--

In [None]:
from kaggle_secrets import UserSecretsClient
# Fetch the three API credentials from the Kaggle secret store, in the order
# Gemma -> Groq -> HuggingFace.
user_secrets = UserSecretsClient()
gemma, groq, huggingface = (
    user_secrets.get_secret(label) for label in ("Gemma", "Groq", "HuggingFace")
)

In [None]:
import os
# Expose the Hugging Face token through the process environment.
# NOTE(review): presumably this is what HuggingFaceHub in setup_llm() picks up,
# since it is constructed there without an explicit token argument - confirm.
os.environ['HUGGINGFACEHUB_API_TOKEN']=huggingface

PDF Vectorization
--

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

def load_documents(file_path):
    """Read the PDF at ``file_path`` and return the result of ``PyPDFLoader.load()``."""
    return PyPDFLoader(file_path).load()

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Chunk ``documents`` with a RecursiveCharacterTextSplitter.

    Args:
        documents: Loaded documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_documents`` returns for ``documents``.
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)

def create_embeddings():
    """Create the sentence-transformers embedding model used for indexing and retrieval.

    The model is ``all-MiniLM-l6-v2``, which is not a BGE model. The BGE wrapper
    prepends a BGE-specific instruction string to every query before embedding
    it, which skews query vectors for models that were not trained with that
    instruction. The generic ``HuggingFaceEmbeddings`` wrapper embeds documents
    and queries the same way, which is what this model expects.

    Returns:
        HuggingFaceEmbeddings: CPU-bound embedder that L2-normalizes its vectors.
    """
    # Imported locally so this function does not depend on an edit to the
    # notebook's import cell.
    from langchain.embeddings import HuggingFaceEmbeddings

    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-l6-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

def create_vectorstore(final_documents, embeddings):
    """Index ``final_documents`` in a new FAISS store built with ``embeddings``."""
    store = FAISS.from_documents(final_documents, embeddings)
    return store

def setup_llm():
    """Instantiate the hosted Mistral-7B-Instruct model through ``HuggingFaceHub``.

    Returns:
        The ``HuggingFaceHub`` object configured with temperature 0.5 and
        ``max_length`` 1000.
    """
    # Base (non-instruct) alternative that was tried: mistralai/Mistral-7B-v0.1
    generation_settings = {"temperature": 0.5, "max_length": 1000}
    return HuggingFaceHub(
        repo_id="mistralai/Mistral-7B-Instruct-v0.2",
        model_kwargs=generation_settings,
    )

def create_prompt_template():
    """Build the PromptTemplate handed to the RetrievalQA chain.

    The template text instructs the model to answer only from the supplied PDF
    content, then lays out the ``{context}`` and ``{question}`` placeholders and
    ends with an ``Answer:`` cue.

    Returns:
        PromptTemplate: template declared with the input variables
        ``context`` and ``question``.
    """
    # The double quotes and the leading indentation below are inside the
    # triple-quoted literal, so they are sent to the model verbatim.
    prompt_template = """
    "You are a knowledgeable assistant specialized in extracting information from PDF documents."
    "When answering questions, you must base your responses solely on the content of the PDF provided."
    "Do not include any information or context outside of this document."
    
    context: {context}
    Question: {question}
    Answer:
    """
    return PromptTemplate(template=prompt_template, input_variables=["context", "question"])

def setup_retrieval_qa(llm, retriever, prompt):
    """Assemble a "stuff"-type RetrievalQA chain.

    Args:
        llm: Language model passed to the chain.
        retriever: Retriever passed to the chain.
        prompt: Prompt forwarded to the chain via ``chain_type_kwargs``.

    Returns:
        The object produced by ``RetrievalQA.from_chain_type`` with
        ``return_source_documents`` enabled.
    """
    chain_options = dict(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt},
    )
    return RetrievalQA.from_chain_type(**chain_options)

def main(file_path):
    """Build the full question-answering pipeline for one PDF.

    Steps, in order: load the PDF, split it into chunks, create the embedding
    model, index the chunks in a vector store, derive a retriever from that
    store, create the LLM and the prompt, and wire them into a RetrievalQA chain.

    Args:
        file_path: Path of the PDF to index.

    Returns:
        The chain returned by ``setup_retrieval_qa``.
    """
    chunks = split_documents(load_documents(file_path))
    retriever = create_vectorstore(chunks, create_embeddings()).as_retriever()
    return setup_retrieval_qa(setup_llm(), retriever, create_prompt_template())



Model Dump/Load
--

In [None]:
# import pickle

# with open("retrieval_qa_Model.pkl", 'wb') as f:
#     pickle.dump(retrieval_qa, f)
# print(f"Model saved to {filename}")


In [None]:
# with open("retrieval_qa_Model.pkl", 'rb') as f:
#     loaded_retrieval_qa = pickle.load(f)

# print("Model loaded successfully")

In [None]:
# Report to analyse. The commented-out lines are alternative inputs (a Colab/Drive
# copy and a template PDF); only one file_path assignment should be active.
# file_path = "/content/drive/MyDrive/Colab Notebooks/CyberScan_Datahack/Cybersecurity Report Template.pdf"
file_path = "/kaggle/input/report/Cybersecurity Report Overview.pdf"
# file_path="/kaggle/input/report/Cybersecurity Report Template.pdf"
# Build the RetrievalQA chain over the selected PDF; later cells query it via retrieval_qa.invoke(...).
retrieval_qa = main(file_path)

In [None]:
# result=retrieval_qa.invoke({"query":"What measures are in place to ensure the confidentiality, integrity, and availability of data?"})
# result=result['result']
# print(result)

Displaying the code
--

In [None]:
import re

def arrange(res):
    """Extract the answer portion from raw model output.

    Looks for the marker ``"Answer:"`` first and ``"answer:"`` second. For the
    first marker found, everything after its first occurrence is returned with
    surrounding whitespace stripped. Splitting only once matters: if the model
    repeats the marker inside its own answer, the text after the second marker
    is kept instead of being cut off.

    Args:
        res: Raw text returned by the chain.

    Returns:
        str: The text following the marker, or ``res`` unchanged when neither
        marker is present.
    """
    for marker in ("Answer:", "answer:"):
        _, found, tail = res.partition(marker)
        if found:
            return tail.strip()
    # Fallback to the entire result if no marker is found
    return res

def print_with_newline_on_period(text):
    """Return ``text`` with a newline inserted after every period.

    Despite the name, nothing is printed; the reformatted string is returned.
    """
    return text.replace('.', '.\n')

In [None]:
import json

# Load the JSON file For Question_Answer
# Read the per-domain reference questions and answers.
with open('/kaggle/input/knowledge/ques_ans.json', 'r') as file:
    data = json.load(file)

# Parallel lists, all ordered like the keys of the JSON object:
#   domain[i]   -> domain name
#   question[i] -> question asked for that domain
#   answer[i]   -> reference answer, with a newline after each period
domain = list(data)
question = [data[name]['question'] for name in domain]
answer = [print_with_newline_on_period(data[name]['answer']) for name in domain]

# Lookup from domain name to its question.
question_domain = dict(zip(domain, question))
#     print(f"Domain: {domain_name}")
#     print(f"Question: {question}")
#     print(f"Answer: {answer}\n")


# Ask the RetrievalQA chain every domain question and keep only the extracted answer text.
answer_predicted = []
for query in question:
    result = arrange(retrieval_qa.invoke({"query": query})['result'])
    answer_predicted.append(result)


  # print("-"*50)
  # print(f"Domain: {domain[i]}")
  # print("-"*50)

  # print(f"Question: {query}")
  # print("-"*50)
  # print(f"Retrieved_Answer: {result}\n")
  # print("-"*50)

  # print(f"Acutal_Req: {answer[i]}\n")
  # print("-"*50)
  # print("-"*50)




Domain wise Scoring
--

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Calculate similarity between actual response and predicted answer with context
def evaluate_domain_response(question, actual_answer, predicted_answer):
    """Score how far the predicted answer drifts from the reference answer.

    Each answer is prefixed with the question, the two resulting texts are
    vectorized with a TF-IDF vectorizer fitted on just those two texts, and
    their cosine similarity is computed.

    Args:
        question: Question text shared by both answers.
        actual_answer: Reference answer.
        predicted_answer: Answer produced by the retrieval chain.

    Returns:
        tuple: ``(risk_score, similarity)`` where ``risk_score`` is
        ``1 - similarity``. A risk score of 0 therefore means the two texts are
        identical in TF-IDF space, and larger values mean less overlap.
    """
    texts = [f"{question} {actual_answer}", f"{question} {predicted_answer}"]
    tfidf = TfidfVectorizer().fit_transform(texts)
    similarity_score = cosine_similarity(tfidf[0], tfidf[1])[0][0]
    return 1 - similarity_score, similarity_score

# Evaluate responses for each domain
# Score every domain by comparing ITS reference answer with ITS predicted answer.
# (Passing the whole `answer` list here would compare each prediction against the
# string form of all reference answers at once.)
domain_scores = {}
for i in range(len(question)):
    # Calculate score with context
    risk_score, similarity_score = evaluate_domain_response(question[i], answer[i], answer_predicted[i])

    # Store the scores under the domain's name
    domain_scores[domain[i]] = {
        'risk_score': risk_score,
        'similarity_score': similarity_score
    }

# # Identify the weakest domain (highest risk)
# weakest_domain = min(domain_scores, key=lambda x: domain_scores[x]['risk_score'])
# print(f"Domain with highest risk: {weakest_domain}")
# print(f"Scores by domain: {domain_scores}")


In [None]:
def print_domain_scores(domain_scores):
    """Print each domain's risk score as a plain-text block framed by dashed rules.

    Only ``risk_score`` is displayed, so that is the only key read from each
    entry. Note that a later cell redefines ``print_domain_scores`` with a
    PrettyTable-based renderer, which replaces this version once it has run.

    Args:
        domain_scores: Mapping of domain name to a dict containing ``risk_score``.
    """
    rule = "-" * 50
    for domain, scores in domain_scores.items():
        risk_score = scores['risk_score']
        print(rule)
        print(f"Domain: {domain}")
        print(rule)
        print(f" Risk Score: {risk_score:.2f}")
        print(rule)


# Identify the weakest domain (highest risk)
# risk_score is computed as 1 - similarity, so the weakest domain is the one
# with the LARGEST risk score (least similar to its reference answer).
weakest_domain = max(domain_scores, key=lambda x: domain_scores[x]['risk_score'])
print(f"Domain with highest risk: {weakest_domain}")

# Rank domains from riskiest to safest and keep the top four for the follow-up step.
sorted_domains = sorted(domain_scores, key=lambda x: domain_scores[x]['risk_score'], reverse=True)
weakest_4_domains = sorted_domains[:4]
print(f"Domain with highest risk: {weakest_4_domains}")

# Print scores for each domain
# print_domain_scores(domain_scores)


In [None]:
from prettytable import PrettyTable

def print_domain_scores(domain_scores):
    """Print a two-column PrettyTable of domain names and their risk scores.

    Only ``risk_score`` is shown (formatted to two decimals), so that is the
    only key read from each entry.

    Args:
        domain_scores: Mapping of domain name to a dict containing ``risk_score``.
    """
    table = PrettyTable()
    table.field_names = ["Domain", "Risk Score"]

    # One row per domain, in the mapping's iteration order
    for domain, scores in domain_scores.items():
        table.add_row([domain, f"{scores['risk_score']:.2f}"])

    print(table)
# Render the per-domain risk table with the PrettyTable-based definition above.
print_domain_scores(domain_scores)


Groq_Follow_up
--

In [None]:
from groq import Groq

# Set up your Groq client with the correct API key
import re

# Set up the Groq client with the API key fetched from the Kaggle secret store
client = Groq(
    api_key=groq,
)

# System prompt TEMPLATE: {user_input} and {user_domain} are filled in with
# str.format() when the request is built below.
system_prompt =""" You are a highly intelligent assistant capable of generating insightful follow-up questions based on an initial input. Given a question and a specific domain, your task is to generate 10 follow-up questions that are directly related to the domain and provide a deeper or more detailed exploration of the subject within that domain.

The follow-up questions should:

1. Be highly relevant to the original question, exploring various aspects within the domain.
2. Encourage further clarification, elaboration, or exploration of the domain.
3. Provide valuable insights that help the user gain a deeper understanding of the domain in question.

Input Question: "{user_input}"
Domain: "{user_domain}"

Generate 10 thoughtful follow-up questions that delve into the topic of the original question, specifically focused on the domain of "{user_domain}":
"""
# User's input question
# NOTE(review): this question looks unrelated to the weakest-domain questions the
# follow-ups are later ranked against - confirm it is the intended seed.
user_input = "Explain the importance of fast language models"
user_domain="Network Security"

# Make the API call. The system prompt is formatted first; otherwise the model
# would receive the literal text "{user_input}" / "{user_domain}".
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "system",
            "content": system_prompt.format(user_input=user_input, user_domain=user_domain),
        },  # System message setting the behavior
        {"role": "user", "content": user_input},  # User's input
        {"role": "user", "content": user_domain}  # User's input
    ],
    model="llama3-8b-8192",  # Specify the model you're using
)

output_text = chat_completion.choices[0].message.content

# Pull out every numbered line ("1. ...?") up to its last question mark. Matching
# the rest of the line, instead of a whitelist of characters, keeps questions
# containing punctuation such as '/', ':', '.', or typographic apostrophes.
# Surrounding markdown emphasis asterisks are stripped.
follow_up_questions = [
    match.strip(' *')
    for match in re.findall(r'^\s*\d+\.\s+(.*\?)', output_text, flags=re.MULTILINE)
]



In [None]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT model and tokenizer
# Fetch the pretrained uncased BERT tokenizer and encoder from the same checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def get_sentence_embedding(text, model, tokenizer):
    """Embed ``text`` by mean-pooling the model's last hidden states.

    The text is tokenized with padding and truncation to 128 tokens and run
    through ``model`` without gradient tracking. Pooling is weighted by the
    tokenizer's attention mask so that padded positions do not contribute.
    For an unpadded input this equals the plain mean over the sequence.

    Args:
        text: Text to embed, passed to the tokenizer as-is.
        model: Callable accepting the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing a mapping of model inputs as PyTorch tensors.

    Returns:
        torch.Tensor: One pooled vector per input row (the sequence dimension
        is averaged away).
    """
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden_states = outputs.last_hidden_state

    attention_mask = inputs.get('attention_mask')
    if attention_mask is None:
        # No mask available: fall back to the unweighted mean over the sequence.
        return hidden_states.mean(dim=1)

    # Broadcast the mask over the hidden dimension and average only real tokens.
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (hidden_states * mask).sum(dim=1) / token_counts

# Save the model and tokenizer
# Write the encoder and then the tokenizer into one directory so that a later
# cell can reload both from disk.
save_directory = './bert_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_directory)
print("Model and tokenizer saved successfully.")




In [None]:
from transformers import BertModel, BertTokenizer

# Load the model and tokenizer from the saved directory
# Reload the encoder and the tokenizer (in that order) from the directory they were saved to.
load_directory = './bert_model'
model, tokenizer = (
    loader.from_pretrained(load_directory) for loader in (BertModel, BertTokenizer)
)
print("Model and tokenizer loaded successfully.")


In [None]:
# Rank the generated follow-up questions against the question of each weak domain.
if not follow_up_questions:
    raise ValueError("No follow-up questions were parsed, so there is nothing to rank.")

# The follow-up questions are the same for every domain, so embed them once.
question_embeddings = [
    get_sentence_embedding(candidate, model, tokenizer) for candidate in follow_up_questions
]

# Iterate over the domains that actually exist (at most four) instead of a
# hard-coded range(4), which fails when fewer domains were scored.
for weak_domain in weakest_4_domains[:4]:
    response = question_domain[weak_domain]
    # Get the embedding for the response
    response_embedding = get_sentence_embedding(response, model, tokenizer)

    cosine_similarities = [
        cosine_similarity(response_embedding, question_embedding)[0][0]
        for question_embedding in question_embeddings
    ]

    most_relevant_question_index = np.argmax(cosine_similarities)
    # The index refers to follow_up_questions, the list the similarities were computed from.
    most_relevant_question = follow_up_questions[most_relevant_question_index]
    print("Domain:", weak_domain)
    print("Response:", response)
    print("\nMost relevant question:", most_relevant_question)
    print("Cosine similarity score:", cosine_similarities[most_relevant_question_index])

    

In [None]:
# Calculate cosine similarity between response and each question

# Print results


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def get_most_relevant_question(weakest_4_domains, question_domain, follow_up_questions, questions, model, tokenizer):
    """
    Find, for each weak domain, the follow-up question most similar to that domain's text.

    Similarity is the cosine similarity between sentence embeddings produced by
    ``get_sentence_embedding``.

    Parameters:
    - weakest_4_domains: Sequence of domain names to process; at most the first four are used.
    - question_domain: Dictionary mapping each domain name to the text that the
      follow-up questions are compared against.
    - follow_up_questions: List of candidate follow-up questions that are embedded and ranked.
    - questions: List the winning entry is taken from. It is indexed with the
      position of the best match in ``follow_up_questions``, so it must be
      index-aligned with that list. ``None`` means "use follow_up_questions".
    - model: Pre-trained BERT model.
    - tokenizer: Pre-trained BERT tokenizer.

    Returns:
    - A list of tuples (domain, response, most relevant question, cosine similarity score),
      one per processed domain. Empty when no domains are given.

    Raises:
    - ValueError: if there are domains to process but ``follow_up_questions`` is empty.
    - KeyError: if a domain is missing from ``question_domain``.
    """
    if questions is None:
        questions = follow_up_questions

    # Use the domains actually supplied (capped at four) rather than assuming
    # that exactly four exist.
    domains_to_check = list(weakest_4_domains)[:4]
    if not domains_to_check:
        return []
    if not follow_up_questions:
        raise ValueError("follow_up_questions is empty; there is nothing to rank.")

    # The candidate questions do not depend on the domain, so embed them once.
    question_embeddings = [
        get_sentence_embedding(candidate, model, tokenizer) for candidate in follow_up_questions
    ]

    results = []
    for domain_name in domains_to_check:
        response = question_domain[domain_name]
        response_embedding = get_sentence_embedding(response, model, tokenizer)

        cosine_similarities = [
            cosine_similarity(response_embedding, question_embedding)[0][0]
            for question_embedding in question_embeddings
        ]

        # Position of the best-matching candidate within follow_up_questions
        best_index = int(np.argmax(cosine_similarities))

        results.append((
            domain_name,
            response,
            questions[best_index],
            cosine_similarities[best_index]
        ))

    return results

# Example usage
# The fourth argument is the list the winning question is picked from. No
# variable named `questions` exists in this notebook, so pass the same list
# that is being ranked.
results = get_most_relevant_question(
    weakest_4_domains, 
    question_domain, 
    follow_up_questions, 
    follow_up_questions, 
    model, 
    tokenizer
)

# Print results
# Each entry is (domain, response, most relevant question, cosine similarity score).
for picked_domain, picked_response, picked_question, picked_score in results:
    print("Domain:", picked_domain)
    print("Response:", picked_response)
    print("\nMost relevant question:", picked_question)
    print("Cosine similarity score:", picked_score)
    print("="*50)

