# **This Notebook contains code for utilizing the LLM-Sentry framework**

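To run this notebook successfully, you need the following API keys. The code cells read them through `google.colab.userdata`, so store each one as a Colab secret under the exact name given below: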


1.   [OPENAI](https://platform.openai.com/docs/quickstart/step-2-set-up-your-api-key) - name this key as OPENAI_API_KEY
2.   [HUGGINGFACE](https://www.nightfall.ai/ai-security-101/hugging-face-api-key) - name this key as HF_TOKEN
3.   [COHERE](https://cohere.com/pricing) - name this key as COHERE_API_KEY



**Install dependencies**

In [None]:
%%capture
!python -m pip install python-dotenv
!pip install openai
!pip install llama_index
!pip install llama-index-postprocessor-cohere-rerank
!python -m pip install cohere

## **Input Preparation**

**Import Dependencies**

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

False

In [None]:
import csv

**Change Working Directory**

In [None]:
# Every relative path used below (harmful.txt, harmless.txt, prompt.csv and the
# generated harmful/ and harmless/ folders) is resolved against this directory.
os.chdir("/path/to/HarmfulKB") # replace the placeholder with the folder where you saved HarmfulKB on your device

**Code to Prepare harmful files**

In [None]:
# Define the harmful input file path (one knowledge-base entry per line)
input_file_path = 'harmful.txt'

# Read lines from the input text file
with open(input_file_path, 'r') as file:
    lines = file.readlines()

# Create a directory to save all the new CSV files.
# exist_ok=True keeps the cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the folder exists from a previous run.
os.makedirs('harmful', exist_ok=True)

In [None]:
# Write each non-blank input line to its own single-column CSV file
# (header row 'KB', then the entry) so that every entry is loaded as a
# separate document later on.
for i, line in enumerate(lines):
    # Use strip() to remove any leading/trailing whitespace including newline characters
    entry = line.strip()
    if not entry:
        # A blank line would only produce a CSV with an empty KB row.
        continue
    output_file_path = f'harmful/harmful_{i}.csv'
    # The with-statement closes the file; no explicit close() is needed.
    with open(output_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['KB'])
        csv_writer.writerow([entry])

print(f'Lines from {input_file_path} have been written')

**Code to Prepare harmless files**

In [None]:
# Define the harmless input file path (one knowledge-base entry per line)
input_file_path = 'harmless.txt'

# Read lines from the input text file
with open(input_file_path, 'r') as file:
    lines = file.readlines()

# Create a directory to save all the new CSV files.
# exist_ok=True keeps the cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the folder exists from a previous run.
os.makedirs('harmless', exist_ok=True)

In [None]:
# Write each non-blank input line to its own single-column CSV file
# (header row 'KB', then the entry) so that every entry is loaded as a
# separate document later on.
for i, line in enumerate(lines):
    # Use strip() to remove any leading/trailing whitespace including newline characters
    entry = line.strip()
    if not entry:
        # A blank line would only produce a CSV with an empty KB row.
        continue
    output_file_path = f'harmless/harmless_{i}.csv'
    # The with-statement closes the file; no explicit close() is needed.
    with open(output_file_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['KB'])
        csv_writer.writerow([entry])

print(f'Lines from {input_file_path} have been written')

**Preparing the Knowledge Base**

In [None]:
# Configure the OpenAI key from Colab's secret store.
# `userdata` is imported here and reused by later cells for the other keys.
from google.colab import userdata
import openai

# Sets the key at module level on the `openai` package.
# NOTE(review): presumably this is what lets the llama_index calls below
# authenticate without an explicit key - confirm.
openai.api_key = userdata.get('OPENAI_API_KEY')

In [None]:
# Build a single vector index over every CSV file in the two knowledge-base folders.
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Harmful entries: only .csv files inside harmful/ are picked up.
harmful_reader = SimpleDirectoryReader(input_dir="harmful", required_exts=[".csv"])
harmful_documents = harmful_reader.load_data()

# Harmless entries: only .csv files inside harmless/ are picked up.
harmless_reader = SimpleDirectoryReader(input_dir="harmless", required_exts=[".csv"])
harmless_documents = harmless_reader.load_data()

# Both sets go into the same index, harmful documents first.
documents = [*harmful_documents, *harmless_documents]

index = VectorStoreIndex.from_documents(documents, show_progress=True)

## **The LLM-Sentry framework**

In [None]:
# import zero shot classifier
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

In [None]:
# define query engine for RAG
# import Reranker module
from llama_index.core.indices.postprocessor import SimilarityPostprocessor
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.llms.openai import OpenAI

# Cohere key comes from the Colab secret store, like the OpenAI key.
api_key = userdata.get("COHERE_API_KEY")
# NOTE(review): top_n=10 is larger than similarity_top_k=4 below, so the
# reranker presumably never has more than 4 nodes to return - confirm intent.
cohere_rerank = CohereRerank(api_key=api_key, top_n=10)
# NOTE(review): this similarity cutoff is constructed but never added to
# node_postprocessors, so it has no effect on the query engine.
postprocessor = SimilarityPostprocessor(similarity_cutoff = 0.80)

# Single query engine: retrieve the 4 most similar nodes, then rerank them
# with Cohere. (A default engine used to be built first and was immediately
# overwritten; that dead assignment has been removed.)
query_engine = index.as_query_engine(
    similarity_top_k=4,
    node_postprocessors=[cohere_rerank],
)

## **Code to test LLM-Sentry**

In [None]:
import pandas as pd

# provide a csv file containing your harmful/safe prompts
df = pd.read_csv('prompt.csv')

# Missing cells are read by pandas as NaN (a float), which is truthy and has
# no .split(); drop them here so the sentence-splitting cell does not crash.
sentences_list = df['Prompt in English'].dropna().astype(str).tolist()

# removing empty strings from the list of prompts
filtered_list = [string for string in sentences_list if string]

# Preview the loaded prompts (must be the last expression to be displayed).
df.head()

In [None]:
def iterate_nested_json_for_loop(json_obj, filepaths=None):
  """Collect every 'file_path' entry found in a (possibly nested) dict.

  The mapping is walked depth-first. Whenever a key named 'file_path' holds a
  non-dict value, the string 'file_path' + str(value) is appended to
  `filepaths`. The key-name prefix is kept for backward compatibility with
  existing callers.

  Args:
    json_obj: dict to traverse; nested dicts are searched recursively.
    filepaths: optional list that receives the results (mutated in place).
      A new list is created when omitted.

  Returns:
    The list of collected strings (the same object as `filepaths` if given).
  """
  if filepaths is None:
    filepaths = []
  for key, value in json_obj.items():
    if isinstance(value, dict):
      iterate_nested_json_for_loop(value, filepaths)
    elif key == 'file_path':
      # Convert the value before concatenating so that non-str paths
      # (e.g. pathlib objects) do not raise TypeError.
      filepaths.append(key + str(value))
  return filepaths

In [None]:
import re

# A sentence ends at '.', '?' or a line break.
# NOTE: this also splits inside decimals and abbreviations (e.g. "3.5").
SENTENCE_DELIMITERS = re.compile(r"[.?\n]")

splitted_sentences = []

# split each input prompt into its sentences
for prompt in sentences_list:
  fragments = SENTENCE_DELIMITERS.split(prompt)
  # Strip each fragment and drop the ones that are empty or whitespace-only,
  # so that stray spaces after a delimiter are not treated as sentences.
  sentence = [fragment.strip() for fragment in fragments if fragment.strip()]
  print(sentence)
  splitted_sentences.append(sentence)

In [None]:
from llama_index.core.response.pprint_utils import pprint_response
from openai import OpenAI
import time

# OpenAI SDK client used for paraphrasing. The key is read from the Colab
# secret store, so it is passed explicitly.
client = OpenAI(
    api_key = userdata.get("OPENAI_API_KEY")
)

# Tunable parameters of the two-stage check.
HARMFUL_SCORE_THRESHOLD = 0.6  # zero-shot 'harmful' score above which a window is flagged
NUM_PARAPHRASES = 5            # paraphrases generated per sentence in the RAG stage
MIN_HARMFUL_VOTES = 3          # paraphrases that must hit the harmful KB to flag a sentence

harmful_counter = 0
window_length = 2

for sentences in splitted_sentences:
  flag = 0
  # Stage 1: check shift of intent using a sliding window of `window_length` sentences.
  # NOTE(review): a prompt with fewer sentences than `window_length` yields no
  # window at all and goes straight to stage 2 - confirm this is intended.
  for i in range(len(sentences)-(window_length-1)):
    # concatenate each sentence in the window
    sequence_to_classify = "".join(sent + '. ' for sent in sentences[i:i+window_length])
    candidate_labels = ['harmful', 'safe']
    # use zero shot classifier to quantify the overall intent of the sentences inside the window
    output = classifier(sequence_to_classify, candidate_labels)
    print(output)
    # Look the 'harmful' score up by label instead of by position in the result.
    harmful_score = dict(zip(output["labels"], output["scores"])).get("harmful", 0.0)
    # if the harmful score is above the threshold, mark the prompt as harmful
    if harmful_score > HARMFUL_SCORE_THRESHOLD:
      harmful_counter += 1
      flag = 1
      break
  # Stage 2: if the prompt was not flagged by the zero shot classifier,
  # further check each sentence using the RAG method
  if flag == 0:
    for i in range(len(sentences)):
      majority_vote = 0
      # paraphrase each sentence NUM_PARAPHRASES times and look each version up in the vector database
      print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", sentences[i])
      for _ in range(NUM_PARAPHRASES):
        rephrase_query = "Rephrase the following: " + sentences[i]
        completion = client.chat.completions.create(
          model="gpt-3.5-turbo",
          messages=[
            {"role": "user", "content": rephrase_query}
          ]
        )
        paraphrased_sent = completion.choices[0].message.content

        # Skip missing/empty completions instead of sending them to the query engine.
        if paraphrased_sent and paraphrased_sent != "Empty Response":
          query_template2 = paraphrased_sent
          print("...................................", query_template2)
          # If the Cohere API rate limit is hit (the original note cites
          # 10 calls/minute), add a delay here, e.g. time.sleep(6).
          response = query_engine.query(query_template2)

          filepaths = []
          harmful_source = 0
          # check whether the first retrieved file comes from the harmful knowledge base
          if response.metadata is not None:
            iterate_nested_json_for_loop(response.metadata, filepaths)
            # Guard against metadata without any file_path entry (would raise IndexError).
            # NOTE(review): this is a substring test on the whole path, so a parent
            # folder whose name contains "harmful" would also match.
            if filepaths and "harmful" in filepaths[0]:
              harmful_source += 1
          print("harmful_source: ", harmful_source)
          # a paraphrase whose first retrieved file is from the harmful knowledge base counts as one vote
          if harmful_source > 0:
            majority_vote += 1
      # a sentence is harmful when at least MIN_HARMFUL_VOTES of its paraphrases voted harmful
      if majority_vote >= MIN_HARMFUL_VOTES:
        harmful_counter += 1
        break
  print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")


# Fraction of prompts flagged as harmful; guard against an empty prompt list.
total_prompts = len(splitted_sentences)
if total_prompts:
  print("Detection Accuracy: ", harmful_counter/total_prompts)
else:
  print("Detection Accuracy: ", "n/a (no prompts to evaluate)")