In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("cais/mmlu", "college_mathematics")

Downloading readme:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/138k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [4]:
dataset

DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 11
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 5
    })
})

In [5]:
import requests
import time
import re
import pickle

In [6]:
url = "https://api.together.xyz/v1/chat/completions"

In [7]:
def PerformInference(prompt, model_name):
  payload = {
      "messages": [
          {
              "role": "system",
              "content": "You are a helpful assistant"
          },
          {
              "role": "user",
              "content": prompt
          }
      ],
      "model": model_name
  }
  headers = {
      "accept": "application/json",
      "content-type": "application/json",
      "authorization": "Bearer 57454ea184ce418edb5157490a09688755316fbffa89ed71b71b281f72042bb0"
  }

  response = requests.post(url, json=payload, headers=headers)

  return (response.json()['choices'][0]['message']['content'])

In [8]:
# Regular expression to match different patterns
pattern = r'Answer: Option (\d)|The correct answer is:? Option (\d)|Answer: (\d)|The correct answer is:? (\d)'

In [9]:
# Define the zero-shot prompt format
def create_zeroshot_prompt(question, options):
    return ("Choose the answer to the given question from below options(0, 1, 2, 3). \n"
            "Give me the answer in the below format: \n 'The correct answer is: Option X', here X is correct option number. \n"
            f"Question: {question}\n"
            f"Option 0: {options[0]}\n"
            f"Option 1: {options[1]}\n"
            f"Option 2: {options[2]}\n"
            f"Option 3: {options[3]}")


In [10]:
# Define the Chain of Thought prompt format
def create_chain_of_thought_prompt(question, options):
    return (f"Choose the answer to the given question from below options(0, 1, 2, 3).\n"
            "Give me the answer in the below format: \n 'The correct answer is: Option X', here X is correct option number. \n"
            f"Question: {question}\n"
            f"Option 0: {options[0]}\n"
            f"Option 1: {options[1]}\n"
            f"Option 2: {options[2]}\n"
            f"Option 3: {options[3]}\n"
            "Think step by step.")

In [11]:
# Define the ReAct prompt format
def create_react_prompt(question, options):
    return (f"Question: {question}\n"
            f"Options:\n"
            f"0. {options[0]}\n"
            f"1. {options[1]}\n"
            f"2. {options[2]}\n"
            f"3. {options[3]}\n"
            "Let's think step by step and act.\n"
            "Reasoning: [Your reasoning goes here].\n"
            "Action: Choose the answer to the given question from below options(0, 1, 2, 3).\n"
            "Give me the answer in the below format: \n 'The correct answer is: Option \n")

In [12]:
def find_accuracy(model_predictions , dataset_ans):
    correct_predictions = 0
    total_questions = len(dataset_ans)

    # Compare predictions to the actual answers
    for i in range(total_questions):
        if model_predictions[i] == dataset_ans[i]:
            correct_predictions += 1

    # Calculate accuracy as percentage
    accuracy = (correct_predictions / total_questions) * 100

    # Print result
    return (f"Accuracy: {accuracy:.2f}%")

In [13]:
# Iterate through the dataset and make predictions
questions = dataset['test'].select(range(20))

In [14]:
def inference_with_models(prompt_function, model_name, prompting_method, model_type):
    total_time = 0
    answers_list = []
    inference_list = []

    for example in questions:
        question = example['question']
        options = example['choices']
        prompt = prompt_function(question, options)

        start = time.time()
        prediction = PerformInference(prompt, model_name)
        end = time.time()

        total_time = total_time + (end-start)

        # Find all matches in the text
        matches = re.findall(pattern, prediction)
        # Extract non-empty matches and store in a list
        answers = [match[0] or match[1] or match[2] or match[3] for match in matches]
        answers_list.append(-1 if not answers else int(answers[0]))

        inference_list.append(prediction)

        time.sleep(1)

    print(f"Total_time : {total_time}\n")

    accuracy = find_accuracy(answers_list, questions['answer'])
    print(f"Accuracy : {accuracy}\n")

    with open(f'inference_list_{model_type}_{prompting_method}.pkl', 'wb') as f:
        pickle.dump(inference_list, f)

In [17]:
inference_with_models(create_zeroshot_prompt, "google/gemma-2b-it", "zero_shot","Gemma")

Total_time : 70.5777039527893



Accuracy : Accuracy: 28.00%




In [None]:
inference_with_models(create_chain_of_thought_prompt, "google/gemma-2b-it", "CoT","Gemma")

Total_time : 148.6077733039856



Accuracy : Accuracy: 20.00%




In [None]:
inference_with_models(create_react_prompt, "google/gemma-2b-it", "ReAct","Gemma")

Total_time : 129.6320505142212



Accuracy : Accuracy: 33.00%




In [None]:
inference_with_models(create_zeroshot_prompt, "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "zero_shot","Llama")

Total_time : 185.82176899909973



Accuracy : Accuracy: 42.00%




In [None]:
inference_with_models(create_chain_of_thought_prompt, "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "CoT","Llama")

Total_time : 312.6945879459381



Accuracy : Accuracy: 46.00%




In [None]:
inference_with_models(create_react_prompt, "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", "ReAct","Llama")

Total_time : 349.09365916252136



Accuracy : Accuracy: 46.00%




In [15]:
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-5gub3167
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-5gub3167
  Resolved https://github.com/huggingface/transformers.git to commit 78b2929c0554b79e0489b451ce4ece14d265ead2
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.45.0.dev0-py3-none-any.whl size=9786808 sha256=77555ee502e36b78eb33b223821e997cafad828819c4fa7a5ce9a5b9cfa02b6b
  Stored in directory: /tmp/pip-ephem-wheel-cache-ceody18q/wheels/e7/9c/5b/e1a9c8007c343041e61cc484433d512ea9274272e3fcbe7c16
Successfully built tr

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login

In [17]:
# Load the model and tokenizer
model_name = "microsoft/Phi-3.5-mini-instruct"

access_token = "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"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=access_token,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [18]:
# Perform inference using the model
def perform_inference(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
    outputs = model.generate(inputs["input_ids"], max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [19]:
def generate_response(prompt_function, model_name, prompting_method, model_type):
    total_time_Phi = 0
    answers_Phi = []
    inference_list_Phi  = []

    for example in questions:
        question = example['question']
        options = example['choices']
        prompt = prompt_function(question, options)

        start = time.time()
        prediction = perform_inference(model, tokenizer, prompt)
        end = time.time()

        total_time_Phi = total_time_Phi + (end-start)

        # Find all matches in the text
        matches = re.findall(pattern, prediction)
        # Extract non-empty matches and store in a list
        answers = [match[0] or match[1] or match[2] or match[3] for match in matches]
        answers_Phi.append(-1 if not answers else int(answers[0]))

        inference_list_Phi.append(prediction)

#         time.sleep(1)

    print(f"Total_time : {total_time_Phi}\n")

    accuracy = find_accuracy(answers_Phi, questions['answer'])
    print(f"Accuracy : {accuracy}\n")

    with open(f'inference_list_{model_type}_{prompting_method}.pkl', 'wb') as f:
        pickle.dump(inference_list_Phi, f)

In [28]:
generate_response(create_zeroshot_prompt, model_name, "zero_shot","Phi")

Total_time : 2364.094246864319



Accuracy : Accuracy: 34.00%




In [20]:
generate_response(create_chain_of_thought_prompt, model_name, "CoT","Phi")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Total_time : 401.0253806114197

Accuracy : Accuracy: 35.00%



In [21]:
generate_response(create_react_prompt, model_name, "ReAct","Phi")

Total_time : 378.76421070098877

Accuracy : Accuracy: 25.00%

