In [15]:
import torch
import os
import gc
import pandas as pd
import re

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from huggingface_hub import InferenceClient

!huggingface-cli login --token YOUR_TOKEN

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /u/sy63/.cache/huggingface/token
Login successful


In [16]:
!nvidia-smi

Fri Apr 26 15:16:59 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:21:00.0 Off |                    0 |
| N/A   60C    P0            294W /  300W |   23949MiB /  81920MiB |    100%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          On  |   00

In [17]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)

In [18]:
data = load_dataset ("CAII-NCSA/RAG_SelfCritique_Mixtral_2")['train']

question = data["Question"]
RAG = data["RAG"]
first_answer = data['Answer first attempt']
factuality = data['Factuality Critique first attempt']
usefulness = data["Usefulness Critique first attempt"]
helpfulness = data["Helpfulness Critique first attempt"]
freshness = data["Freshness Critique first attempt"]
examples = data["Examples Critique first attempt"]
final_answer = data["Final answer"]

data_dic = {
    'question': question,
    'RAG': RAG,
    "first_answer": first_answer,
    "factuality": factuality,
    "usefulness": usefulness,
    "helpfulness": helpfulness,
    "freshness": freshness,
    "examples": examples,
    "answer": final_answer
       }

final_data = pd.DataFrame(data_dic)
final_data

Unnamed: 0,question,RAG,first_answer,factuality,usefulness,helpfulness,freshness,examples,answer
0,Explain how tiling helps with global memory ba...,[{'text': '  ...,\n\n1. Tiling is a strategy to partition the d...,\n\nCritique:\n\n1. The response correctly ide...,\n\nCritique:\n\n1. The response is informativ...,\n\nCritique:\n\nThe response provides a clear...,\n\nCritique:\n\n1. The content of the respons...,\n\nCritique:\n\n1. The response effectively i...,\n\nTiling is a strategy to partition the data...
1,Show an example of [CUDA API] function usage.,[{'text': ' Returns the requested driver API f...,\n\nStep 1: Identify the CUDA API function.\nT...,\n\nStep 4: Check the documentation for the fu...,\n\nStep 4: Provide additional resources.\nFor...,\n\nStep 4: Understand the response options pr...,\n\nStep 4: Freshness of the content.\nThe con...,\n\nStep 4: Explain the relevance of the examp...,\n\nStep 1: Identify the CUDA API function.\nT...
2,what is my chance to get into ece master at ui...,"[{'text': ' Find degree requirements, visi...","\n\n1. According to the document ""Graduate Adm...",\n\nCritique:\n\n1. The initial response accur...,\n\nCritique:\n\n1. The response is written in...,\n\nCritique:\n\nThe response provided is gene...,\n\nCritique:\n\n1. The content in the respons...,\n\nCritique:\n\n* The response does incorpora...,"\n\n1. According to the document ""Graduate Adm..."
3,Explain how tiling helps with global memory ba...,[{'text': '  ...,\n\nStep 1: Understand the concept of tiling.\...,\n\nSources:\n<https://www.intel.com/content/w...,\n\nCritique of initial response:\n\nThe initi...,\n\nCritique:\nThe response provides a clear a...,\n\nCritique of the response:\n\nThe response ...,\n\nStep 5: Provide relevant examples.\nFor ex...,\n\nTiling is a technique used in computer gra...
4,When is the first midterm exam?,[{'text': ' What is this? | Let Me ChatGPT!...,\n\nStep 1: Let's first understand that you a...,\n\nCritique:\nThe response provided is factu...,\n\nCritique:\n\nStep 1: The response is infor...,\n\nCritique: \nThe response provided is hel...,\n\nCritique of freshness of content:\n\nStep ...,\n\nResponse critique: \n\nThe response effe...,\nTo find out when the first midterm exam is...
...,...,...,...,...,...,...,...,...,...
163,explain how the tiled matrix multiplication co...,[{'text': ' What does matrix multiplication m...,\n\n1. Tiled matrix multiplication is a techni...,\n\nCritique:\n\n1. The response accurately de...,\n\nCritique:\n\n* The response does not provi...,\n\nCritique:\nThe response provides a clear a...,\n\nCritique:\n\n* The content of the response...,\n\nCritique:\n\nThe response effectively inco...,\n\n1. Tiled matrix multiplication is a techni...
164,How many global memory data bytes TOTAL are be...,[{'text': '4.1. Importance of Memory Access Ef...,\n\n1. We know that each element of Matrix A i...,\n\nCritique:\n\nThe response is generally fac...,\n\nCritique:\n\nThe response is mostly correc...,\n\nCritique:\nThe response is helpful in unde...,\n\nCritique:\n\nThe content in this response ...,\n\nCritique:\n\nThe response effectively inco...,\n\nTo calculate the total number of bytes rea...
165,Each streaming multiprocessor (SM) in an NVIDI...,[{'text': ' FREE TOOLS AND TRAININGS FOR DEVE...,\n\nStep 1: Identify the compute capability of...,\n\nFinal answer: We cannot determine the numb...,\n\nFinal response:\n\nBased on the provided d...,\n\nCritique:\nThe initial response provides a...,\n\nFreshness of content critique:\n\nThe cont...,\n\nCritique:\nThe response effectively incorp...,"\n\nBased on the provided documents, we cannot..."
166,is it possible for theadidx.x>31,[{'text': ' This is an ID format used by the ...,"\n\nFirst, let's consider the ID64 format. The...","\n\nNext, let's consider the UDID format. The ...","\n\nNext, let's consider the UDID format. The ...","\n\nNext, let's consider the UDID format. The ...","\n\nNow, let's consider the UDID format. The U...","\n\nNow, let's consider the UDID format. The U...",\n\nThe ID64 format is a structure with differ...


In [19]:
JUDGE_PROMPT = """
You're an expert critic. You will be given a full_prompt and first_query pair. 
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Review the user’s question and the corresponding response using a 5-point scoring system based on the following criteria:

1. Completeness: Add 1 point if the response addresses all aspects of the question in a holistic manner.
2. Factuality: Add 1 point if the response is accurate and based on factual information, and add 0 points if any part of the response is loosely inferred or isn't supported by the context. 
3. Helpfulness: Add 1 point if the response is likely to be useful to the user in addressing their concern or question.
4. Supported: Add 1 point if the response provides evidence or sources supporting the claims or information presented.
5. Includes examples: Add 1 point if the response includes examples or analogies that help clarify the answer.
6. Format-wise: Subtract 1 point if the format is violated; add 0 points if the format is correctly followed.

After reviewing, list your scoring process:

- Completeness: (+1 if met, 0 otherwise)
- Factuality: (+1 if met, 0 otherwise)
- Helpfulness: (+1 if met, 0 otherwise)
- Supported: (+1 if met, 0 otherwise)
- Includes examples: (+1 if met, 0 otherwise)
- Format-wise: (-1 if violated, 0 if correctly followed)

Provide your feedback as follows strictly in the format: Evaluation between double square brackets and Total Rating between double curly braces. 

Evaluation:
[[ your rationale for each point added or subtracted, showing how each criterion was evaluated ]]

Total rating: {{ (Add points from the prompt, as a number between -1 and 5) }}

You MUST provide values for 'Evaluation:' and 'Score:' in your answer STRICTLY in the requested format. 
Conclude with the score using the format: “Score: <Total rating>”

Now here are the question and answer.
Question: {question}
Answer: {answer}
"""

In [20]:
def generate_text(row):
    prompt = JUDGE_PROMPT.format(
        question=row['question'],
        answer=row['answer'],
        factuality=row['factuality'],
        helpfulness=row['helpfulness'],
        freshness=row['freshness'],
        examples=row['examples'],
        usefulness=row['usefulness']
    )
    generated_text = llm_client.text_generation(
        prompt=prompt,
        max_new_tokens=1000,
        temperature=0.7
    ).strip()
    return generated_text

final_data["llm_judge"] = final_data.apply(generate_text, axis=1)

def extract_judge_score(answer: str, split_str: str = "Score:") -> int:
    try:
        if split_str in answer:
            rating = answer.split(split_str)[1]
        else:
            rating = answer
        digit_groups = [el.strip() for el in re.findall(r"\d+(?:\.\d+)?", rating)]
        return float(digit_groups[0])
    except Exception as e:
        print(e)
        return None

final_data["llm_judge_score"] = final_data["llm_judge"].apply(extract_judge_score)

final_data

list index out of range
list index out of range
list index out of range
list index out of range


Unnamed: 0,question,RAG,first_answer,factuality,usefulness,helpfulness,freshness,examples,answer,llm_judge,llm_judge_score
0,Explain how tiling helps with global memory ba...,[{'text': '  ...,\n\n1. Tiling is a strategy to partition the d...,\n\nCritique:\n\n1. The response correctly ide...,\n\nCritique:\n\n1. The response is informativ...,\n\nCritique:\n\nThe response provides a clear...,\n\nCritique:\n\n1. The content of the respons...,\n\nCritique:\n\n1. The response effectively i...,\n\nTiling is a strategy to partition the data...,"Evaluation:\n[[ Completeness: (+1 if met, 0 ot...",5.0
1,Show an example of [CUDA API] function usage.,[{'text': ' Returns the requested driver API f...,\n\nStep 1: Identify the CUDA API function.\nT...,\n\nStep 4: Check the documentation for the fu...,\n\nStep 4: Provide additional resources.\nFor...,\n\nStep 4: Understand the response options pr...,\n\nStep 4: Freshness of the content.\nThe con...,\n\nStep 4: Explain the relevance of the examp...,\n\nStep 1: Identify the CUDA API function.\nT...,"Score:\n\nEvaluation:\n[[ 1, 'Completeness: Th...",1.0
2,what is my chance to get into ece master at ui...,"[{'text': ' Find degree requirements, visi...","\n\n1. According to the document ""Graduate Adm...",\n\nCritique:\n\n1. The initial response accur...,\n\nCritique:\n\n1. The response is written in...,\n\nCritique:\n\nThe response provided is gene...,\n\nCritique:\n\n1. The content in the respons...,\n\nCritique:\n\n* The response does incorpora...,"\n\n1. According to the document ""Graduate Adm...","Evaluation:\n[[\n- Completeness: (+1 if met, 0...",4.0
3,Explain how tiling helps with global memory ba...,[{'text': '  ...,\n\nStep 1: Understand the concept of tiling.\...,\n\nSources:\n<https://www.intel.com/content/w...,\n\nCritique of initial response:\n\nThe initi...,\n\nCritique:\nThe response provides a clear a...,\n\nCritique of the response:\n\nThe response ...,\n\nStep 5: Provide relevant examples.\nFor ex...,\n\nTiling is a technique used in computer gra...,"Evaluation:\n[[ +1 for completeness, as the re...",4.0
4,When is the first midterm exam?,[{'text': ' What is this? | Let Me ChatGPT!...,\n\nStep 1: Let's first understand that you a...,\n\nCritique:\nThe response provided is factu...,\n\nCritique:\n\nStep 1: The response is infor...,\n\nCritique: \nThe response provided is hel...,\n\nCritique of freshness of content:\n\nStep ...,\n\nResponse critique: \n\nThe response effe...,\nTo find out when the first midterm exam is...,"Evaluation:\n[[\n- Completeness: (+1 if met, 0...",
...,...,...,...,...,...,...,...,...,...,...,...
163,explain how the tiled matrix multiplication co...,[{'text': ' What does matrix multiplication m...,\n\n1. Tiled matrix multiplication is a techni...,\n\nCritique:\n\n1. The response accurately de...,\n\nCritique:\n\n* The response does not provi...,\n\nCritique:\nThe response provides a clear a...,\n\nCritique:\n\n* The content of the response...,\n\nCritique:\n\nThe response effectively inco...,\n\n1. Tiled matrix multiplication is a techni...,Evaluation:\n\n- Completeness: (+1) The answer...,4.0
164,How many global memory data bytes TOTAL are be...,[{'text': '4.1. Importance of Memory Access Ef...,\n\n1. We know that each element of Matrix A i...,\n\nCritique:\n\nThe response is generally fac...,\n\nCritique:\n\nThe response is mostly correc...,\n\nCritique:\nThe response is helpful in unde...,\n\nCritique:\n\nThe content in this response ...,\n\nCritique:\n\nThe response effectively inco...,\n\nTo calculate the total number of bytes rea...,"Evaluation:\n[[1, ""The response fully addresse...",3.0
165,Each streaming multiprocessor (SM) in an NVIDI...,[{'text': ' FREE TOOLS AND TRAININGS FOR DEVE...,\n\nStep 1: Identify the compute capability of...,\n\nFinal answer: We cannot determine the numb...,\n\nFinal response:\n\nBased on the provided d...,\n\nCritique:\nThe initial response provides a...,\n\nFreshness of content critique:\n\nThe cont...,\n\nCritique:\nThe response effectively incorp...,"\n\nBased on the provided documents, we cannot...","Evaluation:\n[[\n- Completeness: (+1 if met, 0...",5.0
166,is it possible for theadidx.x>31,[{'text': ' This is an ID format used by the ...,"\n\nFirst, let's consider the ID64 format. The...","\n\nNext, let's consider the UDID format. The ...","\n\nNext, let's consider the UDID format. The ...","\n\nNext, let's consider the UDID format. The ...","\n\nNow, let's consider the UDID format. The U...","\n\nNow, let's consider the UDID format. The U...",\n\nThe ID64 format is a structure with differ...,[[\nCompleteness: +1 (The response provides a ...,5.0


In [21]:
def format_sentences(text):
    sentences = re.split(r'\.\s+', text) 
    formatted_text = "\n".join(sentence for sentence in sentences if sentence)  
    return formatted_text

final_data["llm_judge"] = final_data["llm_judge"].apply(format_sentences)

In [22]:
final_data.to_csv('final_judge.csv')