## Installs & HF Logins

In [1]:
!pip install unsloth && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

from tqdm import tqdm
from datasets import load_dataset
from unsloth import FastLanguageModel
import torch

Collecting unsloth
  Downloading unsloth-2024.10.5-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.8/56.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting unsloth-zoo (from unsloth)
  Downloading unsloth_zoo-2024.10.4-py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting tyro (from unsloth)
  Downloading tyro-0.8.14-py3-none-any.whl.metadata (8.4 kB)
Collecting transformers<4.45.0 (from unsloth)
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Collecting wheel>=0.42.0 (from unsloth)
  Using cached wheel-0.44.0-py3-none-any.whl.metadata (2.3 kB)
Collecting trl!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.11.1,>=0.7.9 (from unsloth)
  Downloading trl-0.11.1-py3-none-an

In [2]:
# Can change the HF token based on FT model access

!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `test1` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `test1`


## Loading the FT model

In [3]:

max_seq_length = 3072 
dtype = None 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "gdevakumar/llama3_1_immigration_qlora", # Your MODEL_PATH YOU saved after training
    max_seq_length = max_seq_length,
    dtype = dtype,
)


==((====))==  Unsloth 2024.10.5: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA L4. Max memory: 21.951 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.0+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Unsloth 2024.10.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Inference on test data - To get Generated Responses

In [4]:
# Modify the prompt for DOMAIN

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction: You are an expert in immigration law with real-world experience in visa applications, green card processes, asylum claims, and naturalization procedures.. Based on the given input question, return a simple, clear, accurate and professional response that answers the question.

### Input:
{}

### Response:
{}"""

In [5]:
test_dataset = load_dataset('SJSU-SP24-DATA298-T3/ImmigrationQA', split='test') # Modify test dataset path
test_dataset

README.md:   0%|          | 0.00/3.92k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/2.58M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/28.4k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 126
})

### Sample Inference (dont have to run this)

In [6]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "If i have 5 publications in top journals, can I surely get approved for EB1 visa as an international student pursuing masters in Data science in the US?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 200, use_cache = True)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
result = response.split('Response:')[1]

In [7]:
print(result)


Yes, with 5 publications in top journals, you can confidently apply for the EB1 visa as an international student.


### Inference on whole Test dataset

In [6]:
def get_response(question, max_tokens=256):
    FastLanguageModel.for_inference(model)
    inputs = tokenizer(
    [
        alpaca_prompt.format(
            question, "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")
    
    outputs = model.generate(**inputs, max_new_tokens = max_tokens, use_cache = True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    result = response.split('Response:')[1]
    return result

In [7]:
generated_responses = []

for question in tqdm(test_dataset['question']):
    gen_ans = get_response(question)
    generated_responses.append(gen_ans)

100%|██████████| 126/126 [03:09<00:00,  1.50s/it]


In [8]:
test_dataset = test_dataset.add_column('Generated Answer', generated_responses)
test_dataset

Dataset({
    features: ['question', 'answer', 'Generated Answer'],
    num_rows: 126
})

In [9]:
# To save the inference results on HF or to csv (MODIFY NAMES OF PATH/FILE)

test_dataset.push_to_hub('sainathv02/llama3_1_imm_qlora_eval', token='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')
test_dataset.to_csv('llama3_1_imm_qlora_eval.csv')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/303 [00:00<?, ?B/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

40885

## LLM-as-a-judge

### FlowJudge as Judge

Install from github (https://github.com/flowaicom/flow-judge)

In [15]:
!pip uninstall flow-judge[llamafile,dev,hf,vllm] -y

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: flow-judge 0.1.0
Uninstalling flow-judge-0.1.0:
  Successfully uninstalled flow-judge-0.1.0
[0m

In [13]:
!pip install flow-judge[llamafile,dev,hf,vllm]
pip install -e ".[llamafile,dev,hf,vllm]"
# Remove top_p parameter line from github code in common.py and llamafile.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [15]:
import sys
import os

# Add the path to the directory containing flow_judge
sys.path.append('flow-judge')

# Now you can import the module
import flow_judge


In [16]:
import json
from flow_judge.models import Llamafile
from flow_judge.flow_judge import EvalInput, FlowJudge
from flow_judge.metrics import RESPONSE_CORRECTNESS_BINARY, RESPONSE_CORRECTNESS_3POINT, RESPONSE_CORRECTNESS_5POINT, \
RESPONSE_FAITHFULNESS_BINARY, RESPONSE_FAITHFULNESS_3POINT, RESPONSE_FAITHFULNESS_5POINT, \
RESPONSE_RELEVANCE_BINARY, RESPONSE_RELEVANCE_3POINT, RESPONSE_RELEVANCE_5POINT

from IPython.display import Markdown, display

# Create a model using ModelFactory
model = Llamafile()

In [17]:
# Initialize the judge
faithfulness_judge = FlowJudge(
    metric=RESPONSE_FAITHFULNESS_5POINT,
    model=model
)

# Load data
from datasets import load_dataset
data = load_dataset('csv', data_files='llama3_1_imm_qlora_eval.csv')['train']

Generating train split: 0 examples [00:00, ? examples/s]

In [18]:
data

Dataset({
    features: ['question', 'answer', 'Generated Answer'],
    num_rows: 126
})

In [20]:

# Create a list of EvalInput
eval_inputs_batch = [EvalInput(inputs=[{"query": sample["question"]}, {"context": sample["answer"]}], output={"response": sample['Generated Answer']}) for sample in data]

# Run the batch evaluation
results = faithfulness_judge.batch_evaluate(eval_inputs_batch, save_results=False)

INFO:root:Starting llamafile server...
INFO:root:Downloading llamafile to /root/.cache/flow-judge/flow-judge.llamafile
Downloading: 100%|██████████| 2.45G/2.45G [01:04<00:00, 40.9MiB/s]
INFO:root:Llamafile path: /root/.cache/flow-judge/flow-judge.llamafile
INFO:root:Quantized KV enabled
INFO:root:Flash Attention enabled
INFO:root:Starting llamafile server with command: sh -c '/root/.cache/flow-judge/flow-judge.llamafile --server --host 127.0.0.1 --port 8085 -c 8192 -ngl 34 --temp 0.1 -n 1000 --threads 64 --nobrowser -b 32 --parallel 1 --cont-batching -ctk q4_0 -ctv q4_0 -fa'
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
INFO:root:Subprocess started with PID: 1286
INFO:openai._base_client:Retrying request to /models in 0.457792 seconds
INFO:root:import_c

In [30]:
serializable_results = []
for result in results:
    serializable_results.append({
        "feedback": result.feedback,  # Feedback from the EvalOutput
        "score": result.score          # Score from the EvalOutput
    })

# Save the results to a file
with open("evaluation_results.json", "w") as file:
    import json
    json.dump(serializable_results, file, indent=4)

In [31]:

# Convert results to a serializable format and calculate aggregate score
serializable_results = []
total_score = 0
num_results = len(results)

for result in results:
    serializable_results.append({
        "feedback": result.feedback,  # Feedback from the EvalOutput
        "score": result.score          # Score from the EvalOutput
    })
    total_score += result.score

# Calculate average score
average_score = total_score / num_results if num_results > 0 else 0

# Prepare the final output including the aggregate score
output_data = {
    "results": serializable_results,
    "aggregate": {
        "total_score": total_score,
        "average_score": average_score
    }
}


In [32]:
output_data

{'results': [{'feedback': 'The response provided is highly consistent with the given context. The definition given in the response accurately reflects the information provided in the context, which states that a Green Card is a permanent resident card that allows foreign nationals to live and work in the U.S. indefinitely. The response correctly identifies the Green Card as a document that grants permanent residency and mentions that it allows individuals to live, work, and study in the U.S., which aligns with the concept of indefinite stay and work rights.\n\nThe response does not introduce any hallucinated or fabricated information. It stays true to the context by mentioning that the Green Card is a document and allows for permanent residency, which implies the ability to live, work, and study indefinitely. There are no significant deviations or additions that contradict the context.\n\nThe only minor addition is the mention of studying, which, while not explicitly stated in the cont

### Nemotron 340B as Judge

In [None]:
!pip install openai

In [83]:
JUDGE_PROMPT = """
You will be given a user_question, actual_answer and system_answer.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question. Use actual_answer as a ground truth for given user_question but its not necessary for system_answer to contain complete actual_answer to be perfect.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question accurately.

Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

Provide your feedback as follows:

Feedback:::
Total rating: (your rating, as a number between 1 and 4)

You MUST ONLY provide value for 'Total rating:' in your answer and nothing more.

Now here are the user_question and answers.

User Question: {user_question}
Actual Answer: {actual_answer}
System Answer: {system_answer}

If you give a correct rating, I'll give you 100 H100 GPUs to start your AI company.
Feedback::: 
Total rating: """


In [82]:
import re
from openai import OpenAI

client = OpenAI(
  base_url = "https://integrate.api.nvidia.com/v1",
  api_key = "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" # API can be generated at https://build.nvidia.com/nvidia/nemotron-4-340b-instruct 
)


def extract_rating(response):
    match = re.search(r"Total rating: (\d+)", response)
    if match:
        rating = int(match.group(1))
    else:
        rating = None
    return rating


def rate_responses(example):
    completion = client.chat.completions.create(
      model="nvidia/nemotron-4-340b-instruct",
      messages=[{"role": "user", "content": JUDGE_PROMPT.format(user_question=example['question'], 
                                                               actual_answer=example['answer'],
                                                               system_answer=example['Generated Answers'])}],
      temperature=0.15,
      top_p=0.8,
      max_tokens=1024
    )
    
    judge_response = completion.choices[0].message.content
    return extract_rating(judge_response)


In [80]:
test_dataset

Dataset({
    features: ['question', 'answer', 'Generated Answers'],
    num_rows: 126
})

In [84]:
ratings = []

for example in tqdm(test_dataset):
    ratings.append(rate_responses(example))

100%|██████████| 126/126 [05:28<00:00,  2.61s/it]


In [86]:
sum(ratings)/len(ratings)

3.3412698412698414

## MMLU

In [51]:
!pip install -U deepeval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting deepeval
  Downloading deepeval-1.4.3-py3-none-any.whl.metadata (977 bytes)
Collecting pytest (from deepeval)
  Downloading pytest-8.3.3-py3-none-any.whl.metadata (7.5 kB)
Collecting tabulate (from deepeval)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting typer (from deepeval)
  Downloading typer-0.12.5-py3-none-any.whl.metadata (15 kB)
Collecting pydantic (from deepeval)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.4/149.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentry-sdk (from deepeval)
  Downloading sentry_sdk-2.17.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pytest-repeat (from deepeval)
  Downloading pytest_repeat-0.9.3-py3-none-any.whl.metadata (4.9 kB)
Collecting pytest-xdist (from deepeval)
  Downloading pytest_xdist-3.6.1-py3-none-any.whl.metadata (4.3 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-2.10.

In [68]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from deepeval.models.base_model import DeepEvalBaseLLM
from typing import List
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

class LlaMA8b(DeepEvalBaseLLM):
    def __init__(
        self,
        model,
        tokenizer
    ):
        self.model = model
        self.tokenizer = tokenizer

    def load_model(self):
        return self.model

    def generate(self, prompt: str) -> str:
        model = self.load_model()

        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=64, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)[0]

    async def a_generate(self, prompt: str) -> str:
        return self.generate(prompt)

    # This is optional.
    def batch_generate(self, promtps: List[str]) -> List[str]:
        model = self.load_model()
        device = "cuda" # the device to load the model onto

        model_inputs = self.tokenizer(promtps, return_tensors="pt").to(device)
        model.to(device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=64, do_sample=True)
        return self.tokenizer.batch_decode(generated_ids)

    def get_model_name(self):
        return "LlaMA 3.1 8B"

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "gdevakumar/llama3_1_immigration_lora", # Your MODEL_PATH YOU saved after training
#     max_seq_length = max_seq_length,
#     dtype = dtype,
# )


In [69]:
FastLanguageModel.for_inference(model)

llama = LlaMA8b(model=model, tokenizer=tokenizer)

In [70]:

# Define benchmark with specific tasks and shots
benchmark = MMLU(
    tasks=[MMLUTask.INTERNATIONAL_LAW, MMLUTask.PUBLIC_RELATIONS, MMLUTask.US_FOREIGN_POLICY ], # , MMLUTask.PROFESSIONAL_LAW],
    n_shots=5
)

In [71]:
# Replace 'llama' with your own custom model

benchmark.evaluate(model=llama)
print(benchmark.overall_score)

Processing international_law: 100%|██████████| 121/121 [00:15<00:00,  8.07it/s]


MMLU Task Accuracy (task=international_law): 0.0


Processing public_relations: 100%|██████████| 110/110 [00:13<00:00,  8.35it/s]


MMLU Task Accuracy (task=public_relations): 0.0


Processing us_foreign_policy: 100%|██████████| 100/100 [00:12<00:00,  8.30it/s]

MMLU Task Accuracy (task=us_foreign_policy): 0.0
Overall MMLU Accuracy: 0.0
0.0





In [72]:
benchmark.task_scores

Unnamed: 0,Task,Score
0,international_law,0.0
1,public_relations,0.0
2,us_foreign_policy,0.0


### Manually trying MMLU

In [32]:
# Modify the prompt for DOMAIN

mmlu_prompt = """Below is a question paired with four options. Choose the correct response and return only the option, that is A or B or C or D. Only return the option and nothing more. Choose only 1 correct option out of below. Return choice like option A or B or C or D.

### Instruction: You are an expert in {} and have extensive real world experience in this area.

### Input:
{}

### Choices: 
{}

### Response:
{}"""

def get_response(question, choices, subject = 'us_foreign_policy', max_tokens=256):
    FastLanguageModel.for_inference(model)
    inputs = tokenizer(
    [
        mmlu_prompt.format(
            subject, question, choices, "", # output - leave this blank for generation!
        )
    ], return_tensors = "pt").to("cuda")
    
    outputs = model.generate(**inputs, max_new_tokens = max_tokens, use_cache = True)
    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    result = response.split('Response:')[1]
    return result

In [17]:
mmlu_us = load_dataset('cais/mmlu', 'us_foreign_policy')
mmlu_us

DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 11
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

In [18]:
test = mmlu_us['test']
val = mmlu_us['validation']
dev = mmlu_us['dev']

In [31]:
get_response(question='Which is not a nonstate actor that poses a threat to the United States?', 
            choices=['Terrorists', 'Organized crime', 'Drug traffickers', 'China'],
            subject='us_foreign_policy')

'\nChina'

In [33]:
mmlu_results = []

for q, c, s in tqdm(zip(dev['question'], dev['choices'], dev['subject'])):
    result = get_response(question=q, choices=str(c), subject=s)
    mmlu_results.append(result)

5it [00:07,  1.55s/it]


In [34]:
mmlu_results

['\nA: It damaged support for the US model of political economy and capitalism.',
 "\n['It globalized containment.', 'It militarized containment.', 'It called for the development of the hydrogen bomb.']",
 '\nforeign policy.',
 '\nDefensive realists place greater emphasis on the role of international institutions, place less emphasis on geographical factors, give more priority to the national interest than Offensive realists, and believe states to be security maximizers rather than power maximizers.',
 '\n[\'Liberal elites had encouraged globalization, while \'ordinary Americans\' lost jobs because of it\', \'Globalization only benefited certain American states, such as New York\', \'Globalization had made men like him too rich\', "Globalization encouraged damaging trade wars"]']