In [None]:
!pip install datasets huggingface_hub predictionguard langchain

Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting predictionguard
  Downloading predictionguard-1.9.1-py2.py3-none-any.whl (6.4 kB)
Collecting langchain
  Downloading langchain-0.1.8-py3-none-any.whl (816 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m816.1/816.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloa

In [None]:
import os
import predictionguard as pg
import huggingface_hub
import datasets

In [None]:
huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# data = datasets.load_dataset('csv', data_files = '/content/bot_database.csv')

In [None]:
# data.push_to_hub('RCODI/llm_bot_dataset')

In [None]:
data = datasets.load_dataset("RCODI/llm_bot_dataset")

Downloading readme:   0%|          | 0.00/354 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/459k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2688 [00:00<?, ? examples/s]

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['Context', 'Baseline Response', 'bot'],
        num_rows: 2688
    })
})

In [None]:
data_copy = data['train']

In [None]:
data_copy

Dataset({
    features: ['Context', 'Baseline Response', 'bot'],
    num_rows: 2688
})

In [None]:
from getpass import getpass

In [None]:
# Prompt for the token interactively (getpass does not echo the input) so it
# never has to be hardcoded in the notebook, then expose it through the
# PREDICTIONGUARD_TOKEN environment variable.
# NOTE(review): presumably the predictionguard client reads this variable — confirm.
pg_access_token = getpass('Enter your Prediction Guard access token: ')
os.environ['PREDICTIONGUARD_TOKEN'] = pg_access_token

Enter your Prediction Guard access token: ··········


In [None]:
from langchain import PromptTemplate, FewShotPromptTemplate

In [None]:
def chat_prompt(messages, demo_template, prefix, suffix):
  """Render a chat history as a few-shot prompt ending at the latest user turn.

  System turns are skipped. Consecutive assistant turns are joined with a
  single space and paired with the user turn that preceded them; each pair
  becomes one demonstration rendered through `demo_template`. The most recent
  user turn (or 'Continue' when there is none) fills `{input}` in the suffix.

  Args:
    messages: iterable of dicts with 'role' and 'content' keys.
    demo_template: template text with `{user}` and `{assistant}` placeholders.
    prefix: text placed before the demonstrations; either a string or a
      sequence whose first element is that string.
    suffix: text placed after the demonstrations, containing `{input}`;
      same accepted forms as `prefix`.

  Returns:
    The formatted prompt string.
  """
  # Stand-in user turn for assistant messages that have no user message.
  placeholder_user = 'Continue'

  def _first_text(part):
    # The prepare_prompt_* helpers pass 1-tuples; a bare string is accepted
    # as well so that indexing cannot silently keep only its first character.
    return part if isinstance(part, str) else part[0]

  # Define a prompt template for the demonstrations.
  demo_prompt = PromptTemplate(
    input_variables=["user", "assistant"],
    template=demo_template,
  )

  examples = []
  user_entry = None
  assistant_messages = []

  for turn in messages:
    # Skip system messages
    if turn['role'] == 'system':
      continue

    if turn['role'] == 'user':
      # A new user turn closes the demonstration built from the previous
      # user turn and the assistant messages accumulated since then.
      if assistant_messages:
        examples.append({
          # Assistant turns that come before any user turn get the
          # placeholder instead of being rendered with the text "None".
          'user': placeholder_user if user_entry is None else user_entry,
          'assistant': ' '.join(assistant_messages),
        })
        assistant_messages = []
      user_entry = turn['content']
    else:
      # Accumulate assistant messages
      assistant_messages.append(turn['content'])

  # Assistant messages left over after the last user turn still form a
  # demonstration; fall back to the placeholder when no user turn exists.
  if assistant_messages:
    examples.append({
      'user': user_entry if user_entry else placeholder_user,
      'assistant': ' '.join(assistant_messages),
    })

  # Determine the latest message for prompt continuation
  latest_message = user_entry if user_entry else placeholder_user

  # NOTE(review): prefix and suffix are used as template text, so literal
  # braces in a system message would presumably be parsed as placeholders —
  # confirm against the dataset.
  few_shot_prompt = FewShotPromptTemplate(
    # Demonstration data inserted into the prompt.
    examples=examples,
    example_prompt=demo_prompt,
    example_separator="",
    # Task instructions placed ahead of the demonstrations.
    prefix=_first_text(prefix),
    # Output indicator and slot for the "on-the-fly" user input.
    suffix=_first_text(suffix),
    input_variables=["input"],
  )

  return few_shot_prompt.format(input=latest_message)

In [None]:
## First model to use: Zephyr-7B-Beta

In [None]:
import ast

In [None]:
## Helper function to prepare prompt into zephyr form
def prepare_prompt_zephyr(context):
  """Build a Zephyr-style completion prompt from a serialized chat history.

  Args:
    context: string containing a Python literal list of
      {'role': ..., 'content': ...} dicts; parsed with ast.literal_eval.

  Returns:
    The prompt string produced by chat_prompt.
  """
  # Few-shot demonstration template. Written with explicit "\n" escapes so the
  # indentation of this function cannot leak into the prompt as leading
  # spaces in front of the role markers.
  demo_template = "<|user|>\n{user}</s>\n<|assistant|>\n{assistant}</s>\n"

  tcontext = ast.literal_eval(context)

  # The last system turn (if any) supplies the task instructions.
  system = ''
  for turn in tcontext:
    if turn['role'] == 'system':
      system = turn['content']

  # chat_prompt takes element [0] of prefix and suffix, so both are passed
  # as explicit 1-tuples.
  prefix = ("<|user|>\n" + system + "</s>\n",)

  # The suffix carries the output indicator and the slot for the latest
  # user input.
  suffix = ("<|user|>\n{input}</s>\n<|assistant|>\n",)

  return chat_prompt(
      tcontext,
      demo_template,
      prefix,
      suffix
  )

In [None]:
## Helper function for getting the model's response to the prompt
def get_zephyr_response(context):
  """Return the text of the first Zephyr-7B-Beta completion for `context`.

  The context is converted to a prompt with prepare_prompt_zephyr and sent to
  the Prediction Guard completion endpoint with a 300-token limit.
  """
  zephyr_prompt = prepare_prompt_zephyr(context)
  completion = pg.Completion.create(
      model="Zephyr-7B-Beta",
      prompt=zephyr_prompt,
      max_tokens=300
  )
  return completion['choices'][0]['text']

In [None]:
## Second model to use: Nous-Hermes-Llama2-13B

In [None]:
def prepare_prompt_llama(context):
  """Build a Nous-Hermes-Llama2 instruction prompt from a serialized chat history.

  Args:
    context: string containing a Python literal list of
      {'role': ..., 'content': ...} dicts; parsed with ast.literal_eval.

  Returns:
    The prompt string produced by chat_prompt.
  """
  # Few-shot demonstration template. Written with explicit "\n" escapes so the
  # indentation of this function cannot leak into the prompt (the old
  # triple-quoted form produced "  ASSISTANT:" and "  ### Input:" lines).
  demo_template = "USER: {user}\nASSISTANT: {assistant}\n\n"

  tcontext = ast.literal_eval(context)

  # The last system turn (if any) supplies the task instructions.
  system = ''
  for turn in tcontext:
    if turn['role'] == 'system':
      system = turn['content']

  # chat_prompt takes element [0] of prefix and suffix, so both are passed
  # as explicit 1-tuples.
  prefix = ("### Instruction:\n" + system + "\n\n",)

  # The suffix carries the output indicator and the slot for the latest
  # user input.
  suffix = ("### Input:\nUSER: {input}\n\n### Response:\nASSISTANT: ",)

  return chat_prompt(
      tcontext,
      demo_template,
      prefix,
      suffix
  )

In [None]:
def get_llama_response(context):
  """Return the Nous-Hermes-Llama2-13B reply for `context`.

  Only the text before the first '###' in the completion is returned, which
  drops any further section header the model goes on to generate.
  """
  llama_prompt = prepare_prompt_llama(context)
  completion = pg.Completion.create(
    model="Nous-Hermes-Llama2-13B",
    prompt=llama_prompt,
    max_tokens=300
  )
  raw_text = completion['choices'][0]['text']
  return raw_text.split('###')[0]

In [None]:
## Third model to use: Neural-Chat-7B

In [None]:
def prepare_prompt_neural(context):
  """Build a Neural-Chat style prompt from a serialized chat history.

  Args:
    context: string containing a Python literal list of
      {'role': ..., 'content': ...} dicts; parsed with ast.literal_eval.

  Returns:
    The prompt string produced by chat_prompt.
  """
  # Few-shot demonstration template. Written with explicit "\n" escapes so the
  # indentation of this function cannot leak into the prompt as leading
  # spaces in front of the "###" section markers.
  demo_template = "### User:\n{user}\n### Assistant:\n{assistant}\n"

  tcontext = ast.literal_eval(context)

  # The last system turn (if any) supplies the task instructions.
  system = ''
  for turn in tcontext:
    if turn['role'] == 'system':
      system = turn['content']

  # chat_prompt takes element [0] of prefix and suffix, so both are passed
  # as explicit 1-tuples.
  prefix = ("### System:\n" + system + "\n",)

  # The suffix carries the output indicator and the slot for the latest
  # user input.
  suffix = ("### User:\n{input}\n### Assistant:\n",)

  return chat_prompt(
      tcontext,
      demo_template,
      prefix,
      suffix
  )

In [None]:
def get_neural_response(context):
  """Return the Neural-Chat-7B reply for `context`.

  Only the text before the first '###' in the completion is returned, which
  drops any further section header the model goes on to generate.
  """
  neural_prompt = prepare_prompt_neural(context)
  completion = pg.Completion.create(
    model="Neural-Chat-7B",
    prompt=neural_prompt,
    max_tokens=300
  )
  raw_text = completion['choices'][0]['text']
  return raw_text.split('###')[0]

In [None]:
## Metrics Helper Function

In [None]:
## Text Similarity

In [None]:
! pip install sacrebleu rouge_score rapidfuzz

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m102.4/106.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rapidfuzz
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-no

In [None]:
## BLEU score
from sacrebleu import corpus_bleu

def get_bleu_score(response, baseline):
    """Score `response` against a single reference string with corpus_bleu.

    Args:
        response: a hypothesis string, or a non-empty list of strings.
        baseline: the non-empty reference string.

    Returns:
        The `.score` attribute of the corpus_bleu result.

    Raises:
        ValueError: if `response` is an empty list or contains a non-string,
            or if `baseline` is not a non-empty string.
    """
    # A bare value is treated as a one-element hypothesis list.
    hypotheses = response if isinstance(response, list) else [response]
    if len(hypotheses) == 0 or any(not isinstance(item, str) for item in hypotheses):
        raise ValueError("Response must be a list of non-empty strings.")

    if not (isinstance(baseline, str) and baseline):
        raise ValueError("Baseline must be a non-empty string.")

    # corpus_bleu takes a list of reference streams, hence the nested list.
    references = [[baseline]]
    return corpus_bleu(hypotheses, references).score

In [None]:
## ROUGE score (ROUGE-L)
from rouge_score import rouge_scorer
def get_rouge_score(response, baseline):
  """Return the ROUGE-L F-measure of `response` with `baseline` as the target.

  A new stemming RougeScorer is created on every call.
  """
  rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
  all_scores = rouge_l.score(baseline, response)
  return all_scores['rougeL'].fmeasure

In [None]:
## Levenshtein distance
from rapidfuzz import fuzz
def get_leven_distance(response, baseline):
  """Return rapidfuzz's fuzz.ratio between `baseline` and `response`.

  NOTE(review): despite the name, the value is whatever fuzz.ratio returns,
  presumably a similarity ratio rather than a raw edit distance — confirm.
  """
  return fuzz.ratio(baseline, response)

In [None]:
## Semantic Similarity

In [None]:
! pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m122.9/132.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.3.1


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def get_cosine_similarity(response, baseline, model):
  """Return the cosine similarity between the embeddings of two texts.

  Both texts are encoded in one `model.encode` call; the single entry of the
  resulting 1x1 similarity matrix is returned.
  """
  vectors = model.encode([response, baseline])
  response_vec = vectors[0]
  baseline_vec = vectors[1]
  similarity_matrix = cosine_similarity([response_vec], [baseline_vec])
  return similarity_matrix[0][0]

In [None]:
## Factual Consistency

In [None]:
from transformers import pipeline

In [None]:
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")

config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer

In [None]:
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")

In [None]:
def nil_truncate_input(baseline, response, max_length=486):
    """Encode the (baseline, response) pair, truncate it, and decode it to text.

    The pair is tokenized with the module-level `roberta_tokenizer`, truncated
    to `max_length` tokens, and the resulting token ids are converted back to
    one string so it can be handed to a text pipeline.
    """
    encoded_pair = roberta_tokenizer.encode_plus(
        baseline,
        response,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )

    # Only one pair is encoded, so the first row holds all of its token ids.
    pair_ids = encoded_pair['input_ids'][0]
    pair_tokens = roberta_tokenizer.convert_ids_to_tokens(pair_ids)
    return roberta_tokenizer.convert_tokens_to_string(pair_tokens)

In [None]:
def get_nli_result(response, baseline, nli_pipeline):
  """Map the NLI label for (baseline, response) to a score in {-1, 0, 1}.

  Returns 1 for 'ENTAILMENT', -1 for 'CONTRADICTION', and 0 for any other
  label. The pair is first shortened with nil_truncate_input.
  """
  label_to_score = {'ENTAILMENT': 1, 'CONTRADICTION': -1}

  # Truncate so the pair fits within the model's sequence length limits.
  pair_text = nil_truncate_input(baseline, response)
  prediction = nli_pipeline(pair_text)

  return label_to_score.get(prediction[0]['label'], 0)

In [None]:
## Empathy Consistency

In [None]:
from transformers import pipeline

In [None]:
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [None]:
def split_text(text, max_length):
    """Cut `text` into consecutive pieces of at most `max_length` characters."""
    pieces = []
    for start in range(0, len(text), max_length):
        pieces.append(text[start:start + max_length])
    return pieces

In [None]:
def get_sentiment_score(text, sentiment_pipeline, max_length=486):
    """Average signed sentiment confidence over character chunks of `text`.

    The text is cut into chunks of at most `max_length` characters. Every
    prediction contributes +score when labelled 'POSITIVE' and -score
    otherwise. Returns 0 when there are no predictions (e.g. empty text).
    """
    signed_scores = []
    for piece in split_text(text, max_length):
        for prediction in sentiment_pipeline(piece):
            confidence = prediction['score']
            if prediction['label'] == 'POSITIVE':
                signed_scores.append(confidence)
            else:
                signed_scores.append(-confidence)

    if not signed_scores:
        return 0
    return sum(signed_scores) / len(signed_scores)

In [None]:
# def get_sentiment_score(response, sentiment_pipeline):
#   sentiment_result = sentiment_pipeline(response)
#   sentiment_score = sentiment_result[0]['score'] if sentiment_result[0]['label'] == 'POSITIVE' else 0
#   return sentiment_score

In [None]:
! pip install nltk



In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import re

# Ensure the necessary NLTK data is downloaded
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Sentiment Intensity Analyzer for affective statements
sia = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
## Empathy Rules
# Person form
def score_person_form(text):
    """Count first-person-plural and second-person words in `text`.

    Counts case-insensitive whole-word occurrences of "we", "us", "our",
    "you" and "your". Matching on word boundaries (rather than splitting on
    whitespace) also counts words that touch punctuation, such as "you." or
    "your,".
    """
    person_forms = re.findall(r"\b(?:we|us|our|you|your)\b", text.lower())
    return len(person_forms)

# Pronouns
def score_pronouns(text):
    """Count the tokens in `text` that pos_tag labels 'PRP'."""
    tagged_tokens = pos_tag(word_tokenize(text))
    return sum(1 for _, tag in tagged_tokens if tag == 'PRP')

# Tense
def score_tense(text):
    """Count the tokens in `text` that pos_tag labels 'VBP' or 'VBZ'."""
    tagged_tokens = pos_tag(word_tokenize(text))
    present_count = 0
    for _, tag in tagged_tokens:
        if tag == 'VBP' or tag == 'VBZ':
            present_count += 1
    return present_count

# Exclamations
def score_exclamations(text):
    """Return how many '!' characters appear in `text`."""
    return text.count('!')

# Stimulating Dialogue
def score_stimulating_dialogue(text):
    """Count occurrences of dialogue-stimulating phrases in lowercased `text`."""
    stimulating_phrases = (
        r"\bshall we\b",
        r"\bhow about\b",
        r"could you please share",
        r"what are your thoughts on\b",
        r"\bwhat do you think about\b",
        r"\bwhy don't we\b",
        r"\bhave you considered\b",
    )
    lowered = text.lower()
    hits = 0
    for pattern in stimulating_phrases:
        hits += len(re.findall(pattern, lowered))
    return hits


def score_acknowledging(text):
    """Count occurrences of acknowledging phrases in `text`, ignoring case.

    Matching is case-insensitive on the original text. Lowercasing the text
    first would make the patterns containing a capital "I" ("I appreciate",
    "I understand") impossible to match.
    """
    acknowledging_phrases = [
        r"\bthank you for\b", r"\bthis is helpful\b", r"\bI appreciate\b",
        r"\bgood point\b", r"\bthat's a great idea\b", r"\bI understand\b",
        r"\bthanks for sharing\b"
    ]
    # Use regex to find matches and count them
    return sum(
        len(re.findall(phrase, text, flags=re.IGNORECASE))
        for phrase in acknowledging_phrases
    )


def score_collective_reasoning(text):
    """Count occurrences of collective-reasoning phrases in lowercased `text`."""
    reasoning_phrases = (
        r"\bthinking together\b",
        r"\blet us think this through\b",
        r"\bas a team\b",
        r"\bworking together\b",
        r"\bjoin our heads\b",
        r"\bcollectively consider\b",
        r"\bmutual understanding\b",
    )
    lowered = text.lower()
    hits = 0
    for pattern in reasoning_phrases:
        hits += len(re.findall(pattern, lowered))
    return hits

# Imperative Statements
def score_imperative_statements(text):
    """Count tokens tagged 'VB', plus the token 'please' when tagged 'VBP'.

    This is a heuristic: every token with the 'VB' tag is counted.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    imperative_count = 0
    for word, tag in tagged_tokens:
        if tag == 'VB':
            imperative_count += 1
        elif tag == 'VBP' and word == 'please':
            imperative_count += 1
    return imperative_count

# Interim Questioning
def score_interim_questioning(text):
    """Return how many '?' characters appear in `text`."""
    return text.count('?')

# Caring Statements
def score_caring_statements(text):
    """Return the 'pos' component of the module-level analyzer's polarity scores.

    Positive sentiment is used here as a proxy for affective statements.
    """
    polarity = sia.polarity_scores(text)
    return polarity['pos']

In [None]:
# count the number of sentences
def count_sentences(text):
    """Count '.', '!' or '?' characters followed by whitespace or the end of text.

    The lookahead keeps punctuation inside a token (for example the dot in a
    decimal number) from being counted as a sentence end.
    """
    endings = re.findall(r'[.!?](?=\s|$)', text)
    return len(endings)

In [None]:
# Models read as globals by PromptandMetrics. `semantic_model` uses the same
# checkpoint name that was loaded into `model` in an earlier cell, and the two
# pipelines are created again here with the same arguments as before.
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")
sentiment_pipeline = pipeline("sentiment-analysis")

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [None]:
def _empathy_features(text, label):
  """Return sentiment and empathy-rule scores for `text`, keyed by `label`.

  Count-based rule scores are divided by the sentence count plus one. Keys
  are prefixed with `label` ("LLM" or "baseline") so the dicts from two calls
  can be merged into one flat result row.
  """
  # +1 keeps the divisor non-zero when no sentence-ending punctuation is found.
  length = count_sentences(text) + 1
  return {
      f"{label} response length": length,
      f"{label} sentiment": get_sentiment_score(text, sentiment_pipeline),
      f"{label} person form": score_person_form(text) / length,
      f"{label} pronoun": score_pronouns(text) / length,
      f"{label} tense": score_tense(text) / length,
      f"{label} exclamation": score_exclamations(text) / length,
      f"{label} stimulating dialogue": score_stimulating_dialogue(text) / length,
      f"{label} acknowledging": score_acknowledging(text) / length,
      f"{label} collective reasoning": score_collective_reasoning(text) / length,
      f"{label} imperative statement": score_imperative_statements(text) / length,
      f"{label} interim questions": score_interim_questioning(text) / length,
      # Already a proportion, so it is not normalised by length.
      f"{label} caring statement": score_caring_statements(text),
  }


def PromptandMetrics(row, model='llama'):
  """Generate a model response for one dataset row and score it.

  Args:
    row: mapping with 'Context' (serialized chat history) and
      'Baseline Response' (reference reply; may be missing).
    model: which generator to use: 'zephyr', 'llama' or 'neural'.

  Returns:
    A dict with the generated response, its similarity / consistency metrics
    against the baseline, and the empathy-rule features of both the response
    ("LLM ..." keys) and the baseline ("baseline ..." keys).

  Raises:
    ValueError: if `model` is not one of the supported names.
  """
  context = row['Context']
  baseline = row['Baseline Response']
  # A missing or empty baseline would be rejected by get_bleu_score, so a
  # neutral placeholder is substituted and the substitution is reported.
  if not baseline:
    print('Empty baseline')
    baseline = 'Thank you.'

  generators = {
      'zephyr': get_zephyr_response,
      'llama': get_llama_response,
      'neural': get_neural_response,
  }
  # Fail with a clear message instead of an unbound `response` further down.
  if model not in generators:
    raise ValueError(f"Unknown model {model!r}; expected one of {sorted(generators)}")
  response = generators[model](context)

  # text similarity
  bleu_score = get_bleu_score(response, baseline)
  rouge_score = get_rouge_score(response, baseline)
  leven_distance = get_leven_distance(response, baseline)
  # NOTE(review): the three components do not appear to share a scale
  # (sacrebleu and fuzz.ratio look like 0-100, the ROUGE F-measure 0-1), so
  # ROUGE barely moves this average — confirm. Formula kept unchanged so new
  # rows stay comparable with results that were already saved.
  text_similarity = (bleu_score + rouge_score + leven_distance)/3

  # semantic similarity
  cos_sim = get_cosine_similarity(response, baseline, semantic_model)

  # factual consistency
  nli_score = get_nli_result(response, baseline, nli_pipeline)

  metrics = {
      "response": response,
      "text similarity": text_similarity,
      "BLEU score": bleu_score,
      "ROUGE score": rouge_score,
      "LEVEN distance": leven_distance,
      "semantic similarity": cos_sim,
      "factual consistency": nli_score,
  }
  # The baseline features are computed from the baseline text itself, not
  # from the generated response.
  metrics.update(_empathy_features(response, "LLM"))
  metrics.update(_empathy_features(baseline, "baseline"))
  return metrics



In [None]:
shuffled_data = data_copy.shuffle(seed=42)

In [None]:
total_length = len(shuffled_data)

In [None]:
total_length

2688

In [None]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Score the shuffled dataset in slices of `batch_length` rows and write one CSV
# per model per slice to Google Drive. `init` is the row offset to start from
# and `batch_index` is the number used in the output file names; both are
# module-level variables that this loop advances on every iteration.
init = 2000
batch_index = 20
batch_length = 100
while init < total_length:
  # The final slice is clipped to the end of the dataset.
  subset_data = shuffled_data.select(range(init, min(init + batch_length, total_length)))

  # With batch_index starting at 20 this guard is always true in this run.
  # NOTE(review): looks like a leftover from resuming an earlier run in which
  # the llama results for the first batches already existed — confirm.
  if batch_index > 5:
    llama_results = subset_data.map(lambda x: PromptandMetrics(x, model='llama'))
    llama_save_path = f"/content/drive/My Drive/results1/llama_results_{batch_index}.csv"
    llama_results_df = pd.DataFrame(llama_results)
    llama_results_df.to_csv(llama_save_path)

  neural_results = subset_data.map(lambda x: PromptandMetrics(x, model='neural'))
  neural_save_path = f"/content/drive/My Drive/results1/neural_results_{batch_index}.csv"
  neural_results_df = pd.DataFrame(neural_results)
  neural_results_df.to_csv(neural_save_path)

  # Advance to the next slice and the next output file number.
  init += batch_length
  batch_index += 1

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Empty baseline


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Empty baseline


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Empty baseline


Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Empty baseline


In [None]:
print(prepare_prompt_llama(shuffled_data['Context'][200]))

### Instruction:
You are a chatbot specializing in legal advice for landlords and tenants regarding leasing issues in the Chicago area. You provide factual and direct guidance on local housing laws, lease agreements, tenant rights, and dispute resolution, without focusing on the emotional aspects of these issues.

USER: Continue
  ASSISTANT: Rentervention is a virtual assistant that's designed to help tenants with housing issues. What can I help you with?

  USER: There's trouble with my security deposit.
  ASSISTANT: Security deposit law varies based on where the property is located. Provide a zip code.

  USER: 60606
  ASSISTANT: The tenant is in Chicago.

  USER: Continue
  ASSISTANT: The protections for security deposits depend on the type of property rented.

  USER: Continue
  ASSISTANT: Select the type of rental.

  USER: Single family home
  ASSISTANT: Does/did the landlord live in the building?

  ### Input:
USER: Yes

### Response:
ASSISTANT: 


In [None]:
print(shuffled_data['Baseline Response'][2050])

When did the tenant receive the estimate for repairs?


In [None]:
# context = data_copy['Context'][1]

In [None]:
# data_copy['Context'][1]

In [None]:
# print(prepare_prompt_llama(context))

In [None]:
# respond_choices = pg.Completion.create(
#     model="Nous-Hermes-Llama2-13B",
#     prompt=prepare_prompt_llama(context),
#     max_tokens=300
# )

In [None]:
# print(respond_choices)

In [None]:
# result = respond_choices['choices'][0]['text']