In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

def load_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer

In [5]:
import torch
print(torch.cuda.is_available()) 

True


In [6]:
from huggingface_hub import login
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
def zero_shot_prompt(question, options):
    return f"Give the answer to the given question from below options.\nQuesion :- {question}\nOption-1 :- {options[0]}\nOption-2 :- {options[1]}\nOption-3 :- {options[2]}\nOption-4 :- {options[3]}"


In [8]:
def cot_prompt(question, options):
    return f"Give the answer of the given question from below options.\nQuestion :- {question}\n{options[0]}\n{options[1]}\n{options[2]}\n{options[3]}. Think step by step."


In [9]:
def perform_inference(model, tokenizer, prompt):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    
    start_time = time.time()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)  
    
    outputs = model.generate(**inputs, max_new_tokens=120)
    end_time = time.time()
    return tokenizer.decode(outputs[0], skip_special_tokens=True), end_time - start_time


In [10]:
from datasets import load_dataset

dataset = load_dataset("cais/mmlu", "college_mathematics")
questions = dataset['test']['question']
options = dataset['test']['choices']
answers = dataset['test']['answer']

Downloading readme:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/138k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [11]:
def evaluate_models(model,tokenizer,func):
    output = []
    time = []
    i=0
    for question, option_set in zip(questions, options):
        prompt = func(question, option_set)
        out, t = perform_inference(model, tokenizer, prompt)
        
        output.append(out)
        time.append(t)
        
        i+=1
        if i%10==0:
            print(i)
        
    return output, time

In [12]:
gemma_model, gemma_tokenizer = load_model("google/gemma-2b-it")

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [13]:
gemma_zero_output, gemma_zero_time = evaluate_models(gemma_model, gemma_tokenizer, zero_shot_prompt)

10
20
30
40
50
60
70
80
90
100


In [14]:
gemma_avg_zero_time = sum(gemma_zero_time)/len(gemma_zero_time)
print("Average time taken for one prompt of Gemma for Zero Shot :",gemma_avg_zero_time)

Average time taken for one prompt of Gemma for Zero Shot : 3.647759895324707


In [15]:
gemma_cot_output, gemma_cot_time = evaluate_models(gemma_model, gemma_tokenizer, cot_prompt)

10
20
30
40
50
60
70
80
90
100


In [16]:
gemma_avg_cot_time = sum(gemma_cot_time)/len(gemma_cot_time)
print("Average time taken for one prompt of Gemma for Chain of Thought :",gemma_avg_cot_time)

Average time taken for one prompt of Gemma for Chain of Thought : 4.350267882347107


In [17]:
def accuracy_calculation(model_answer,answers,options):
    c=0
    for i in range(len(model_answer)):
        v=model_answer[i].split('\n')
        for j in range(1,len(v)):
            if 'answer' in v[j] or 'Answer' in v[j]:
                w=v[j].split()
#                 print(v[j])
#                 print(answers[i])
#                 print(options[i])
                if 'Option' in v[j]:
                    ind=v[j].find('Option')
                    ans=v[j][ind+7]
                    if int(ans)==answers[i]+1:
                        c+=1
                else:
                    for k in range(len(options[i])):
                        if options[i][k] in v[j]:
                            if k==answers[i]:
                                c+=1
    return c/len(model_answer)
#                 elif w[1] in options[i]:
#                     ans=options[answers[i]]
#                     if w[1]==ans:
#                         c+=1
#                 else:
#                     ans=w[-1]
#                     if int(ans)==answers[i]:
#                         c+=1
                
#     return c/len(model_answer)
    

In [18]:
print("Accuracy of Gemma for Zero Shot :",accuracy_calculation(gemma_zero_output,answers,options))

Accuracy of Gemma for Zero Shot : 0.2376237623762376


In [19]:
print("Accuracy of Gemma for Chain of Thought :",accuracy_calculation(gemma_cot_output,answers,options))

Accuracy of Gemma for Chain of Thought : 0.38461538461538464


In [20]:
import gc
del gemma_model
del gemma_tokenizer
torch.cuda.empty_cache()
gc.collect()

21

In [21]:
phi_model, phi_tokenizer = load_model("microsoft/Phi-3.5-mini-instruct")

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [22]:
phi_zero_output, phi_zero_time = evaluate_models(phi_model, phi_tokenizer, zero_shot_prompt)

You are not running the flash-attention implementation, expect numerical differences.


10
20
30
40
50
60
70
80
90
100


In [23]:
phi_avg_zero_time = sum(phi_zero_time)/len(phi_zero_time)
print("Average time taken for one prompt of Phi for Zero Shot :",phi_avg_zero_time)

Average time taken for one prompt of Phi for Zero Shot : 11.004309170246124


In [24]:
phi_cot_output, phi_cot_time = evaluate_models(phi_model, phi_tokenizer, cot_prompt)

10
20
30
40
50
60
70
80
90
100


In [25]:
phi_avg_cot_time = sum(phi_cot_time)/len(phi_cot_time)
print("Average time taken for one prompt of Phi for Chain of Thought :",phi_avg_cot_time)

Average time taken for one prompt of Phi for Chain of Thought : 11.003227984905243


In [26]:
print("Accuracy of Phi for Zero Shot :",accuracy_calculation(phi_zero_output,answers,options))

Accuracy of Phi for Zero Shot : 0.5


In [27]:
print("Accuracy of Phi for Chain of Thought :",accuracy_calculation(phi_cot_output,answers,options))

Accuracy of Phi for Chain of Thought : 0.27692307692307694


In [29]:
print(gemma_zero_output[:10])

['Give the answer to the given question from below options.\nQuesion :- Let k be the number of real solutions of the equation e^x + x - 2 = 0 in the interval [0, 1], and let n be the number of real solutions that are not in [0, 1]. Which of the following is true?\nOption-1 :- k = 0 and n = 1\nOption-2 :- k = 1 and n = 0\nOption-3 :- k = n = 1\nOption-4 :- k > 1 and n = 0\n\nAnswer: Option-3\n\nExplanation: The equation e^x + x - 2 = 0 has one real solution in the interval [0, 1] since the exponential function is always positive in this interval. Therefore, k = 1 and n = 1.', 'Give the answer to the given question from below options.\nQuesion :- Up to isomorphism, how many additive abelian groups G of order 16 have the property that x + x + x + x = 0 for each x in G ?\nOption-1 :- 0\nOption-2 :- 1\nOption-3 :- 2\nOption-4 :- 3\n\nAnswer: 1\n\nExplanation: An abelian group G of order 16 has the property that x + x + x + x = 0 for each x in G if and only if G is isomorphic to Z_4.', 'Give

In [28]:
import gc
del phi_model
del phi_tokenizer
torch.cuda.empty_cache()
gc.collect()

99