In [1]:
import torch
import os
import gc
import pandas as pd
import re

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from huggingface_hub import InferenceClient

!huggingface-cli login --token ""

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /u/sy63/.cache/huggingface/token
Login successful


In [2]:
!nvidia-smi

Fri Apr 12 11:47:10 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          On  |   00000000:21:00.0 Off |                    0 |
| N/A   33C    P0             43W /  300W |       0MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
dataset = load_dataset("CAII-NCSA/critique_ECE408_FA23_first_interactions")

first_query = dataset['train']["first_query"]
first_response = dataset['train']["first_response"]
full_prompt = dataset['train']['full_prompt']
critique = dataset['train']['critique']

data = {'first_query': first_query,
     'first_response': first_response,
     "full_prompt": full_prompt,
       "critique": critique
       }

df = pd.DataFrame(data)
subset_dataset = df.iloc[:100]

In [4]:
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

In [5]:
JUDGE_PROMPT = """
You will be given a full_prompt and first_query couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Review the user’s question and the corresponding response using a 10-point scoring system based on the following criteria:

1. Completeness: Add 1 point if the response addresses all aspects of the question in a holistic manner.
2. Factuality: Add 1 point if the response is accurate and based on factual information.
3. Helpfulness: Add 1 point if the response is likely to be useful to the user in addressing their concern or question.
4. Harmlessness: Add 1 point if the response avoids causing harm or spreading misinformation.
5. Freshness: Add 1 point if the response includes up-to-date information or perspectives.
6. Supported: Add 1 point if the response provides evidence or sources supporting the claims or information presented.
7. Retrieval necessary: Add 1 point if the answer benefits significantly from web retrieval or would benefit from such information.
8. Includes examples: Add 1 point if the response includes examples or analogies that help clarify the answer.
9. Useful: Add 1 point if the response provides actionable advice or insights beyond the basic answer.
10. AI Perspective: Add 1 point if the response is clearly from an AI, focusing on direct and comprehensive answering from an AI perspective.

Provide your feedback as follows:

Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (additive points from the prompt, as a number between 0 and 10)

You MUST provide values for 'Evaluation:' and 'Score:' in your answer.
Conclude with the score using the format: “Score: <Total rating>”

Now here are the question and answer.
Question: {question}
Answer: {answer}
"""

In [6]:
repo_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"

llm_client = InferenceClient(
    model=repo_id,
    timeout=120,
)

In [7]:
def generate_text(row):
    prompt = JUDGE_PROMPT.format(question=row['first_query'], answer=row['first_response'])
    return llm_client.text_generation(
        prompt=prompt,
        max_new_tokens=1000,
        temperature=0.7
    )
    generated_text = response.choices[0].text.strip()
    return generated_text

subset_dataset["llm_judge"] = subset_dataset.apply(generate_text, axis=1)

def extract_judge_score(answer: str, split_str: str = "Score:") -> int:
    try:
        if split_str in answer:
            rating = answer.split(split_str)[1]
        else:
            rating = answer
        digit_groups = [el.strip() for el in re.findall(r"\d+(?:\.\d+)?", rating)]
        return float(digit_groups[0])
    except Exception as e:
        print(e)
        return None

subset_dataset["llm_judge_score"] = subset_dataset["llm_judge"].apply(extract_judge_score)
subset_dataset

list index out of range


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_dataset["llm_judge"] = subset_dataset.apply(generate_text, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_dataset["llm_judge_score"] = subset_dataset["llm_judge"].apply(extract_judge_score)


Unnamed: 0,first_query,first_response,full_prompt,critique,llm_judge,llm_judge_score
0,Explain how tiling helps with global memory ba...,Tiling is a technique used in CUDA programming...,\nPlease analyze and respond to the following ...,1. The answer could benefit from a more concis...,\nFeedback:::\nEvaluation: The system_answer i...,8.0
1,Show an example of [CUDA API] function usage.,"Sure, here are a few examples of CUDA API func...",\nPlease analyze and respond to the following ...,1. The model-generated answer is generally cor...,\n\nFeedback:::\nEvaluation: The response is c...,10.0
2,what is my chance to get into ece master at ui...,"I'm sorry, but the documents provided do not c...",\nPlease analyze and respond to the following ...,The model-generated answer is technically corr...,\nFeedback:::\nEvaluation: This response does ...,2.0
3,Explain how tiling helps with global memory ba...,Tiling is a technique that helps optimize the ...,\nPlease analyze and respond to the following ...,The model-generated answer provides a good exp...,\nFeedback:::\nEvaluation: The answer provides...,11.0
4,When is the first midterm exam?,The first midterm exam is scheduled after appr...,\nPlease analyze and respond to the following ...,The model-generated answer is generally inform...,\nFeedback:::\nEvaluation: The system answer i...,8.0
...,...,...,...,...,...,...
95,NVIDIA A40 is which NVIDIA architecture?,The NVIDIA A40 GPU is based on the NVIDIA Ampe...,\nPlease analyze and respond to the following ...,"The model-generated answer is mostly correct, ...",\n\nFeedback:::\nEvaluation: The response is c...,10.0
96,How do I allocate GPU memory,"To allocate GPU memory in CUDA, you typically ...",\nPlease analyze and respond to the following ...,The model-generated answer is generally accura...,\nFeedback:::\nEvaluation: The answer is compl...,9.0
97,a warp of threads meaning,A warp of threads refers to a group of threads...,\nPlease analyze and respond to the following ...,1. While the answer accurately describes what ...,\n\n\nFeedback:::\nEvaluation: The response is...,9.0
98,What is a good number for threads per block fo...,When parallelizing operations on a long 1D vec...,\nPlease analyze and respond to the following ...,1. The model-generated answer provides a good ...,\n\nFeedback:::\nEvaluation: The answer is hig...,1.0


In [8]:
subset_dataset.to_csv('subset_dataset.csv')