# 🚨 How it works:

READ ME:
1.  Run the 🌀 *Setting Up* section to download and install the required modules. Enter your own Hugging Face token in the token cell (3rd cell in 🌀 *Setting Up*). Make sure you have requested access to, and signed the agreements for, every LLM you want to test.
2. In the 🌀 *TESTING* section, run whichever 🔆 *Model* section of the LLM you want to test.

#  🌀 Setting Up

In [None]:
! pip install transformers -q
! pip install sentencepiece -q
! pip install accelerate -q
! pip install ipywidgets -q

In [None]:
# Import all needed packages

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline #fastest way to use pre-trained models for interferance
import torch
import os
import gc
import pandas as pd
import ast
import time
import re

In [None]:
# Create an access token in Hugging Face, and copy it instead of YOUR TOKEN.
# NOTE: the token is a secret -- remove it from this cell before sharing or committing the notebook.

!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('YOUR TOKEN')"

# 🌀 TESTING

Choose your model within the range of 🔆 *Model* section offered (or make a new one and copy the pattern). Running all the cells within a 🔆 *Model* section will generate at the end 2 benchmark tables (code + math) for the chosen LLM model.

Each 🔆 *Model* section includes 11 cells:
- Downloading/loading the LLM model and tokenizer needed for the Code Evaluation*
- CODE PROMPTING = the actual evaluation of the LLM for code*
- the head() call to view the final result of CODE PROMPTING in a cleaner way
- freeing the GPU memory that the LLM model for code occupied, using gc (garbage collection)*
- checking with nvidia-smi that the GPU memory has been released and we have space
- Downloading/loading the LLM model and tokenizer needed for the Math Evaluation*
- MATH PROMPTING = the actual evaluation of the LLM for math*
- the head() call to view the final result of MATH PROMPTING in a cleaner way
- freeing the GPU memory that the LLM model_pipeline for math occupied, using gc (garbage collection)*
- checking with nvidia-smi that the GPU memory has been released and we have space
- Formatting the Code and Math benchmark tables into a clean viewing experience**

Symbol * = Needed, must do

Symbol ** = Not mandatory but highly recommended if all needed cells have been executed

# 🔆 Yi 1.5, by 01.AI (6B)

01-ai/Yi-1.5-6B

In [None]:
# Load the tokenizer and the model needed for the Code Evaluation.

# Tokenizer of the checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "01-ai/Yi-1.5-6B",
    trust_remote_code=True,
)

# Model weights in bfloat16, moved to the GPU afterwards
model = AutoModelForCausalLM.from_pretrained(
    "01-ai/Yi-1.5-6B",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.cuda()

In [None]:
# CODE PROMPTING


# Disable the flash and memory-efficient attention kernels.
# This prevents the error: "Error: cutlassF: no kernel found to launch!"
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)


# Prompts for Code/Python testing: one instruction template, one task per test
prompts = [
    f"#You are an expert Python programmer, and here is your task: {task} Print just the function without any further informations."
    for task in (
        "Write a Python function to print 'Hello, World!'.",
        "Write a Python function to count the number of vowels in a given string.",
        "Write a Python function to reverse a given list in-place.",
        "Write a Python function to remove duplicates from a given list.",
        "Write a Python function to check if a given number is even or odd.",
    )
]


# Test names (used as table columns), in the same order as the prompts
test_names = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]

# Empty single-row benchmark table: one column per test
benchmarkCodeYi6 = pd.DataFrame(columns=test_names, index=["01-ai/Yi-1.5-6B"])



# Run every code prompt: generate a completion, extract the first function it
# defines, and score that function with the check that belongs to the prompt.
import contextlib
import io

for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate (and decode) the code
        start_time = time.time()
        # Use the pre-loaded model
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=128)
        code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #print(code) #IF NEED GENERATED CODE PRINTED
        generation_time = time.time() - start_time

        # Extract the first function definition from the generated text
        try:
            tree = ast.parse(code)
            function_def = next((node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)), None)
        except SyntaxError:
            function_def = None

        # A completion without a parsable function is recorded as a failed
        # test instead of leaving the table cell empty.
        result = False

        if function_def is not None:
            # NOTE(security): exec() runs model-generated code; only do this in a
            # disposable environment. Executing inside a copy of the globals keeps
            # the generated definition from overwriting notebook variables.
            namespace = dict(globals())
            exec(ast.unparse(function_def), namespace)
            candidate = namespace[function_def.name]
            test_name = test_names[i]  # test names are listed in prompt order

            if test_name == "Print Hello World test":
                # Capture what the function prints; comparing return values
                # would be None == None for any function that returns nothing.
                captured = io.StringIO()
                with contextlib.redirect_stdout(captured):
                    candidate()
                result = captured.getvalue().strip() == "Hello, World!"

            elif test_name == "Vowel Count test":
                result = (candidate("hello") == 2) and (candidate("Maggie Rogers") == 5)

            elif test_name == "Reverse List test":
                result = True
                for original, expected in (
                    ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
                    ([16, 5, 1, 10, 0, 7], [7, 0, 10, 1, 5, 16]),
                    ([3], [3]),
                    ([], []),
                ):
                    working = list(original)
                    returned = candidate(working)
                    # An in-place reversal mutates its argument and usually
                    # returns None, so accept either the mutated list or a
                    # returned reversed list.
                    if working != expected and returned != expected:
                        result = False
                        break

            elif test_name == "Remove Duplicates test":
                result = (candidate([1, 1, 2, 2, 3, 3, 4, 5, 5]) == [1, 2, 3, 4, 5]) and (candidate([1, 1, 1, 1, 1, 1]) == [1])

            elif test_name == "Even/Odd test":
                result = (candidate(10) == True) and (candidate(11) == False)

            print(f"{test_name}: {result}")

        # Store the result in the DataFrame
        benchmarkCodeYi6.at["01-ai/Yi-1.5-6B", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions (including GPU crashes and errors raised by the
        # generated function) so the remaining prompts still run
        benchmarkCodeYi6.at["01-ai/Yi-1.5-6B", test_names[i]] = f"Error: {e}"

# Print the benchmark table
print(benchmarkCodeYi6)

In [None]:
# Display the code benchmark table as the cell output (cleaner than print)
benchmarkCodeYi6.head()

In [None]:
# Drop the reference to the code model so it can be garbage collected
model = None
# Run the garbage collector, then release PyTorch's cached GPU memory
gc.collect()
torch.cuda.empty_cache()

In [None]:
! nvidia-smi

In [None]:
# Load the tokenizer and build the pipeline needed for the Math Evaluation.

tokenizer = AutoTokenizer.from_pretrained(
    "01-ai/Yi-1.5-6B",
    trust_remote_code=True,
)

model_pipeline = pipeline(
    model="01-ai/Yi-1.5-6B",
    device_map="auto",  # device placement is chosen automatically (GPU/CPU)
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # numeric precision of the weights
)

In [None]:
# Print the pipeline object to confirm it was created
print(model_pipeline)

In [None]:
! nvidia-smi

In [None]:
# MATH PROMPTING


# Disable the flash and memory-efficient attention kernels.
# This prevents the error: "Error: cutlassF: no kernel found to launch!"
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)


# Prompts for Math testing: one instruction template, one problem per test
prompts = [
    f"You are an expert Mathematician. You are good at performing and explaining Mathematics concepts in simple words. Help as much as you can. Here is your problem: What is the solution of {problem} ? Print just the result without any further informations."
    for problem in (
        "(3+8)-2",
        "(6*8)/2",
        "4 to the power of 3",
        "squareroot 144",
        "the fractions 5/7 + 10/11",
    )
]



# define function that extracts the numeric answer from the generated text

def extract_math_answer(text):
    """Return the numeric answer found in ``text`` as a float, or None.

    The number following "answer is" is preferred, so that an intermediate
    statement such as "3+8 is 11" is not mistaken for the final answer. When
    that phrase is absent, the first number following "is" is used. A fraction
    such as "125/77" is evaluated to its quotient.

    Args:
        text: The text to search (prompt and/or model response).

    Returns:
        The answer as a float, or None when no number follows "is" or when
        the answer is a fraction with a zero denominator.
    """
    # A number (optionally negative, optionally decimal), optionally followed
    # by "/ <denominator>"
    value = r'(-?\d+(?:\.\d+)?)(?:\s*/\s*(\d+(?:\.\d+)?))?'
    match = re.search(r'answer\s+is\s+' + value, text, re.IGNORECASE)
    if match is None:
        # Fall back to the first "is <number>" anywhere in the text
        match = re.search(r'is\s+' + value, text)
    if match is None:
        return None

    numerator, denominator = match.groups()
    if denominator is None:
        return float(numerator)
    if float(denominator) == 0:
        return None
    return float(numerator) / float(denominator)


# define function prompting the LLM, takes the prompt as parameter

def get_completion_model(text):
    """Send ``text`` to the math pipeline and return the extracted numeric answer.

    The user text is wrapped with a fixed system message, passed to
    ``model_pipeline`` with up to 500 new tokens, and the generated text is
    handed to ``extract_math_answer``.

    Args:
        text: The user prompt (the math problem).

    Returns:
        Whatever ``extract_math_answer`` returns for the generated text.
    """
    system = """
    You are an expert Mathematician.
    You are good at performing and explaining Mathematics concepts in simple words.
    Always end your response with 'The answer is X' where X is the final numerical result.
    Help as much as you can.
    """
    full_prompt = "#### System: {}\n#### User : \n {} \n\n#### Response from Yi:".format(system, text)
    generations = model_pipeline(full_prompt, max_new_tokens=500)
    return extract_math_answer(generations[0]["generated_text"])


# Define the test names (for columns), in the same order as the prompts

test_names = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions"
]

# Expected value and allowed absolute error for each prompt, in prompt order.
# The fraction answer is usually given as a rounded decimal, so it is compared
# with a tolerance instead of exact float equality.
expected_answers = [
    (9, 0),
    (24, 0),
    (64, 0),
    (12, 0),
    (125 / 77, 0.01),
]

# Short labels used in the progress printout
test_labels = ["Add/Sub", "Mult/Div", "Power", "SQRT", "Fractions"]

# Create the DataFrame with the test column names
benchmarkMathYi6 = pd.DataFrame(index=["01-ai/Yi-1.5-6B"], columns=test_names)


# Prompt and test to evaluate the LLM

for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate the answer
        torch.manual_seed(0)
        start_time = time.time()
        answer = get_completion_model(prompt)
        generation_time = time.time() - start_time

        # No extracted number counts as a failed test; the result of a
        # previous prompt must never be reused. 0.0 is a valid answer, so
        # test against None rather than truthiness.
        result = False
        if answer is not None:
            expected, tolerance = expected_answers[i]
            result = abs(answer - expected) <= tolerance
        print(f"{test_labels[i]} test: {result}")

        # Store the result in the DataFrame using the test name
        benchmarkMathYi6.at["01-ai/Yi-1.5-6B", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions, including GPU crashes
        benchmarkMathYi6.at["01-ai/Yi-1.5-6B", test_names[i]] = f"Error: {e}"

# Print the benchmark table
print(benchmarkMathYi6)

In [None]:
# Display the math benchmark table as the cell output (cleaner than print)
benchmarkMathYi6.head()

In [None]:
# Drop the reference to the math pipeline so it can be garbage collected
model_pipeline = None
# Run the garbage collector, then release PyTorch's cached GPU memory
gc.collect()
torch.cuda.empty_cache()
# Wait 5 seconds -- presumably so the memory is released before nvidia-smi is run; TODO confirm
time.sleep(5)

In [None]:
! nvidia-smi

In [None]:
class color:
    """ANSI escape sequences used to style printed text."""

    # Text colours
    PURPLE = '\x1b[95m'
    CYAN = '\x1b[96m'
    DARKCYAN = '\x1b[36m'
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'
    # Text attributes
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'
    # Resets the styling
    END = '\x1b[0m'



# tabulate renders the tables below. Import it in this cell so the Yi section
# can be run on its own (otherwise tabulate is an undefined name here).
from tabulate import tabulate


# Define the test names for Code
test_names_Code = [
   "Print Hello World test",
   "Vowel Count test",
   "Reverse List test",
   "Remove Duplicates test",
   "Even/Odd test"
]


# Define the test names for Math
test_names_Math = [
   "Addition/Subtraction",
   "Multiplication/Division",
   "Power",
   "Square Root",
   "Fractions"
]



# YI 6B
print(color.BOLD + "\n \n EVALUATION for Yi 6B: \n \n" + color.END)

#Show the Code Benchmark
print("Code Evaluations for Yi 6B:")
print(tabulate(benchmarkCodeYi6, headers=test_names_Code, tablefmt='fancy_grid'))

#Show the Math Benchmark
print("\n \n Math Evaluations for Yi 6B:")
print(tabulate(benchmarkMathYi6, headers=test_names_Math, tablefmt='fancy_grid'))

# 🔆 Gemma, by Google (2B)

google/gemma-2b

In [None]:
# Load the tokenizer and the model needed for the Code Evaluation.

# Tokenizer of the checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2b",
    trust_remote_code=True,
)

# Model weights in bfloat16, moved to the GPU afterwards
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.cuda()

In [None]:
# CODE PROMPTING


# Disable the flash and memory-efficient attention kernels.
# This prevents the error: "Error: cutlassF: no kernel found to launch!"
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)


# Prompts for Code/Python testing: one instruction template, one task per test
prompts = [
    f"#You are an expert Python programmer, and here is your task: {task} Print just the function without any further informations."
    for task in (
        "Write a Python function to print 'Hello, World!'.",
        "Write a Python function to count the number of vowels in a given string.",
        "Write a Python function to reverse a given list in-place.",
        "Write a Python function to remove duplicates from a given list.",
        "Write a Python function to check if a given number is even or odd.",
    )
]


# Test names (used as table columns), in the same order as the prompts
test_names = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]


# Empty single-row benchmark table: one column per test
benchmarkCodeGem2 = pd.DataFrame(columns=test_names, index=["google/gemma-2b"])



# Run every code prompt: generate a completion, extract the first function it
# defines, and score that function with the check that belongs to the prompt.
import contextlib
import io

for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate (and decode) the code
        start_time = time.time()
        # Use the pre-loaded model
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=128)
        code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #print(code) #IF NEED GENERATED CODE PRINTED
        generation_time = time.time() - start_time

        # Extract the first function definition from the generated text
        try:
            tree = ast.parse(code)
            function_def = next((node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)), None)
        except SyntaxError:
            function_def = None

        # A completion without a parsable function is recorded as a failed
        # test instead of leaving the table cell empty.
        result = False

        if function_def is not None:
            # NOTE(security): exec() runs model-generated code; only do this in a
            # disposable environment. Executing inside a copy of the globals keeps
            # the generated definition from overwriting notebook variables.
            namespace = dict(globals())
            exec(ast.unparse(function_def), namespace)
            candidate = namespace[function_def.name]
            test_name = test_names[i]  # test names are listed in prompt order

            if test_name == "Print Hello World test":
                # Capture what the function prints; comparing return values
                # would be None == None for any function that returns nothing.
                captured = io.StringIO()
                with contextlib.redirect_stdout(captured):
                    candidate()
                result = captured.getvalue().strip() == "Hello, World!"

            elif test_name == "Vowel Count test":
                result = (candidate("hello") == 2) and (candidate("Maggie Rogers") == 5)

            elif test_name == "Reverse List test":
                result = True
                for original, expected in (
                    ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
                    ([16, 5, 1, 10, 0, 7], [7, 0, 10, 1, 5, 16]),
                    ([3], [3]),
                    ([], []),
                ):
                    working = list(original)
                    returned = candidate(working)
                    # An in-place reversal mutates its argument and usually
                    # returns None, so accept either the mutated list or a
                    # returned reversed list.
                    if working != expected and returned != expected:
                        result = False
                        break

            elif test_name == "Remove Duplicates test":
                result = (candidate([1, 1, 2, 2, 3, 3, 4, 5, 5]) == [1, 2, 3, 4, 5]) and (candidate([1, 1, 1, 1, 1, 1]) == [1])

            elif test_name == "Even/Odd test":
                result = (candidate(10) == True) and (candidate(11) == False)

            print(f"{test_name}: {result}")

        # Store the result in the DataFrame
        benchmarkCodeGem2.at["google/gemma-2b", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions (including GPU crashes and errors raised by the
        # generated function) so the remaining prompts still run
        benchmarkCodeGem2.at["google/gemma-2b", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkCodeGem2)

In [None]:
# Display the code benchmark table as the cell output (cleaner than print)
benchmarkCodeGem2.head()

In [None]:
# Drop the reference to the code model so it can be garbage collected
model = None
# Run the garbage collector, then release PyTorch's cached GPU memory
gc.collect()
torch.cuda.empty_cache()

In [None]:
! nvidia-smi

In [None]:
# Load the tokenizer and build the pipeline needed for the Math Evaluation.

tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2b",
    trust_remote_code=True,
)

model_pipeline = pipeline(
    model="google/gemma-2b",
    device_map="auto",  # device placement is chosen automatically (GPU/CPU)
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,  # numeric precision of the weights
)

In [None]:
# MATH PROMPTING

import torch
import pandas as pd
import ast
import time
import re


# Disable the flash and memory-efficient attention kernels.
# This prevents the error: "Error: cutlassF: no kernel found to launch!"
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)


# Prompts for Math testing: one instruction template, one problem per test
prompts = [
    f"You are an expert Mathematician. You are good at performing and explaining Mathematics concepts in simple words. Help as much as you can. Here is your problem: What is the solution of {problem} ? Print just the result without any further informations."
    for problem in (
        "(3+8)-2",
        "(6*8)/2",
        "4 to the power of 3",
        "squareroot 144",
        "the fractions 5/7 + 10/11",
    )
]


# define function that extracts the numeric answer from the generated text

def extract_math_answer(text):
    """Return the numeric answer found in ``text`` as a float, or None.

    The number following "answer is" is preferred, so that an intermediate
    statement such as "3+8 is 11" is not mistaken for the final answer. When
    that phrase is absent, the first number following "is" is used. A fraction
    such as "125/77" is evaluated to its quotient.

    Args:
        text: The text to search (prompt and/or model response).

    Returns:
        The answer as a float, or None when no number follows "is" or when
        the answer is a fraction with a zero denominator.
    """
    # A number (optionally negative, optionally decimal), optionally followed
    # by "/ <denominator>"
    value = r'(-?\d+(?:\.\d+)?)(?:\s*/\s*(\d+(?:\.\d+)?))?'
    match = re.search(r'answer\s+is\s+' + value, text, re.IGNORECASE)
    if match is None:
        # Fall back to the first "is <number>" anywhere in the text
        match = re.search(r'is\s+' + value, text)
    if match is None:
        return None

    numerator, denominator = match.groups()
    if denominator is None:
        return float(numerator)
    if float(denominator) == 0:
        return None
    return float(numerator) / float(denominator)


# define function prompting the LLM, takes the prompt as parameter

def get_completion_model(text):
   """Send ``text`` to the math pipeline and return the extracted numeric answer.

   The user text is wrapped with a fixed system message, passed to
   ``model_pipeline`` with up to 500 new tokens, and the generated text is
   handed to ``extract_math_answer``.

   Args:
       text: The user prompt (the math problem).

   Returns:
       Whatever ``extract_math_answer`` returns for the generated text.
   """
   system = """
   You are an expert Mathematician.
   You are good at performing and explaining Mathematics concepts in simple words.
   Always end your response with 'The answer is X' where X is the final numerical result.
   Help as much as you can.
   """
   full_prompt = "#### System: {}\n#### User : \n {} \n\n#### Response from Gemma:".format(system, text)
   generations = model_pipeline(full_prompt, max_new_tokens=500)
   return extract_math_answer(generations[0]["generated_text"])



# Define the test names (for columns), in the same order as the prompts

test_names = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions"
]

# Expected value and allowed absolute error for each prompt, in prompt order.
# The fraction answer is usually given as a rounded decimal, so it is compared
# with a tolerance instead of exact float equality.
expected_answers = [
    (9, 0),
    (24, 0),
    (64, 0),
    (12, 0),
    (125 / 77, 0.01),
]

# Short labels used in the progress printout
test_labels = ["Add/Sub", "Mult/Div", "Power", "SQRT", "Fractions"]


# Create the DataFrame with the test column names
benchmarkMathGem2 = pd.DataFrame(index=["google/gemma-2b"], columns=test_names)


# Prompt and test to evaluate the LLM

for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate the answer
        torch.manual_seed(0)
        start_time = time.time()
        answer = get_completion_model(prompt)
        generation_time = time.time() - start_time

        # No extracted number counts as a failed test; the result of a
        # previous prompt must never be reused. 0.0 is a valid answer, so
        # test against None rather than truthiness.
        result = False
        if answer is not None:
            expected, tolerance = expected_answers[i]
            result = abs(answer - expected) <= tolerance
        print(f"{test_labels[i]} test: {result}")

        # Store the result in the DataFrame using the test name
        benchmarkMathGem2.at["google/gemma-2b", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions, including GPU crashes
        benchmarkMathGem2.at["google/gemma-2b", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkMathGem2)

In [None]:
# Display the math benchmark table as the cell output (cleaner than print)
benchmarkMathGem2.head()

In [None]:
# Drop the reference to the math pipeline so it can be garbage collected
model_pipeline = None
# Run the garbage collector, then release PyTorch's cached GPU memory
gc.collect()
torch.cuda.empty_cache()

In [None]:
! nvidia-smi

In [None]:
from tabulate import tabulate


class color:
    """ANSI escape sequences used to style printed text."""

    # Text colours
    PURPLE = '\x1b[95m'
    CYAN = '\x1b[96m'
    DARKCYAN = '\x1b[36m'
    BLUE = '\x1b[94m'
    GREEN = '\x1b[92m'
    YELLOW = '\x1b[93m'
    RED = '\x1b[91m'
    # Text attributes
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'
    # Resets the styling
    END = '\x1b[0m'



# Column headers of the Code benchmark table
test_names_Code = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]

# Column headers of the Math benchmark table
test_names_Math = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions",
]


# Gemma 2B: bold title, then the two benchmark tables
print(f"{color.BOLD}\n \n EVALUATION for Gemma 2B: \n \n{color.END}")

# Code benchmark
print("Code Evaluations for Gemma 2B:")
print(
    tabulate(
        benchmarkCodeGem2,
        headers=test_names_Code,
        tablefmt='fancy_grid',
    )
)

# Math benchmark
print("\n \n Math Evaluations for Gemma 2B:")
print(
    tabulate(
        benchmarkMathGem2,
        headers=test_names_Math,
        tablefmt='fancy_grid',
    )
)

# 🔆 Vicuna, by LMSYS Org (7B)

lmsys/vicuna-7b-v1.3

In [None]:
# Load the tokenizer and the model needed for the Code Evaluation.

# Tokenizer of the checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    "lmsys/vicuna-7b-v1.3",
    trust_remote_code=True,
)

# Model weights in bfloat16, moved to the GPU afterwards
model = AutoModelForCausalLM.from_pretrained(
    "lmsys/vicuna-7b-v1.3",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.cuda()

In [None]:
# CODE PROMPTING


import pandas as pd
import ast
import time


# Disable the flash and memory-efficient attention kernels.
# This prevents the error: "Error: cutlassF: no kernel found to launch!"
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)


# Prompts for Code/Python testing: one instruction template, one task per test
prompts = [
    f"#You are an expert Python programmer, and here is your task: {task} Print just the function without any further informations."
    for task in (
        "Write a Python function to print 'Hello, World!'.",
        "Write a Python function to count the number of vowels in a given string.",
        "Write a Python function to reverse a given list in-place.",
        "Write a Python function to remove duplicates from a given list.",
        "Write a Python function to check if a given number is even or odd.",
    )
]


# Test names (used as table columns), in the same order as the prompts
test_names = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]


# Empty single-row benchmark table: one column per test
benchmarkCodeVi7 = pd.DataFrame(columns=test_names, index=["lmsys/vicuna-7b-v1.3"])






# Run every code prompt: generate a completion, extract the first function it
# defines, and score that function with the check that belongs to the prompt.
import contextlib
import io

for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate (and decode) the code
        start_time = time.time()
        # Use the pre-loaded model
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=128)
        code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #print(code) #IF NEED GENERATED CODE PRINTED
        generation_time = time.time() - start_time

        # Extract the first function definition from the generated text
        try:
            tree = ast.parse(code)
            function_def = next((node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)), None)
        except SyntaxError:
            function_def = None

        # A completion without a parsable function is recorded as a failed
        # test instead of leaving the table cell empty.
        result = False

        if function_def is not None:
            # NOTE(security): exec() runs model-generated code; only do this in a
            # disposable environment. Executing inside a copy of the globals keeps
            # the generated definition from overwriting notebook variables.
            namespace = dict(globals())
            exec(ast.unparse(function_def), namespace)
            candidate = namespace[function_def.name]
            test_name = test_names[i]  # test names are listed in prompt order

            if test_name == "Print Hello World test":
                # Capture what the function prints; comparing return values
                # would be None == None for any function that returns nothing.
                captured = io.StringIO()
                with contextlib.redirect_stdout(captured):
                    candidate()
                result = captured.getvalue().strip() == "Hello, World!"

            elif test_name == "Vowel Count test":
                result = (candidate("hello") == 2) and (candidate("Maggie Rogers") == 5)

            elif test_name == "Reverse List test":
                result = True
                for original, expected in (
                    ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
                    ([16, 5, 1, 10, 0, 7], [7, 0, 10, 1, 5, 16]),
                    ([3], [3]),
                    ([], []),
                ):
                    working = list(original)
                    returned = candidate(working)
                    # An in-place reversal mutates its argument and usually
                    # returns None, so accept either the mutated list or a
                    # returned reversed list.
                    if working != expected and returned != expected:
                        result = False
                        break

            elif test_name == "Remove Duplicates test":
                result = (candidate([1, 1, 2, 2, 3, 3, 4, 5, 5]) == [1, 2, 3, 4, 5]) and (candidate([1, 1, 1, 1, 1, 1]) == [1])

            elif test_name == "Even/Odd test":
                result = (candidate(10) == True) and (candidate(11) == False)

            print(f"{test_name}: {result}")

        # Store the result in the DataFrame
        benchmarkCodeVi7.at["lmsys/vicuna-7b-v1.3", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions (including GPU crashes and errors raised by the
        # generated function) so the remaining prompts still run
        benchmarkCodeVi7.at["lmsys/vicuna-7b-v1.3", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkCodeVi7)

In [None]:
# Display the code benchmark table as the cell output (cleaner than print)
benchmarkCodeVi7.head()

In [None]:
# Drop the reference to the code model so it can be garbage collected
model = None
# Run the garbage collector, then release PyTorch's cached GPU memory
gc.collect()
torch.cuda.empty_cache()

In [None]:
! nvidia-smi

In [None]:
# Load Vicuna 7B for the math evaluation: a transformers pipeline plus the
# matching tokenizer.
model_pipeline = pipeline(
    model="lmsys/vicuna-7b-v1.3",
    device_map="auto",  # automatic placement of the weights on the available GPU/CPU
    torch_dtype=torch.bfloat16,  # load the weights as bfloat16
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    "lmsys/vicuna-7b-v1.3",
    trust_remote_code=True,
)

In [None]:
# MATH PROMPTING


import torch
import pandas as pd
import ast
import time
import re




#To prevent this error: Error: cutlassF: no kernel found to launch!
# Both calls switch off one optimized scaled-dot-product-attention backend
# (memory-efficient and flash) through PyTorch's global backend flags.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)




# Define the prompts for Math testing: one prompt per arithmetic question,
# every question wrapped in the same instruction text.
prompts = [
    "You are an expert Mathematician. You are good at performing and explaining "
    "Mathematics concepts in simple words. Help as much as you can. "
    "Here is your problem: What is the solution of " + question +
    " ? Print just the result without any further informations."
    for question in (
        "(3+8)-2",
        "(6*8)/2",
        "4 to the power of 3",
        "squareroot 144",
        "the fractions 5/7 + 10/11",
    )
]






# define function that extracts float number after "is ", so the answer to the prompt


def extract_math_answer(text):
    """Return the first number that follows the standalone word "is" in *text*.

    Args:
        text: Generated text to scan for an answer such as
            "The answer is 9" or "The answer is 125/77".

    Returns:
        The number as a float. A simple fraction "a/b" is evaluated to its
        quotient. ``None`` is returned when no "is <number>" is present.
    """
    # \b stops the "is" inside words such as "This" or "analysis" from being
    # taken as the keyword. The optional second group is a fraction
    # denominator, so "is 125/77" is not cut down to 125.
    pattern = r'\bis\s+(-?\d+(?:\.\d+)?)(?:\s*/\s*(\d+(?:\.\d+)?))?'
    match = re.search(pattern, text)
    if match is None:
        return None

    value = float(match.group(1))
    denominator = match.group(2)
    # A zero denominator cannot be evaluated; fall back to the numerator.
    if denominator is not None and float(denominator) != 0:
        value /= float(denominator)
    return value




# define function prompting the LLM, takes the prompt as parameter


def get_completion_model(text):
   """Send a math question to ``model_pipeline`` and return the parsed answer.

   The question is embedded in a system/user/response template, up to 500
   new tokens are generated, and the generated text is passed to
   ``extract_math_answer``.

   Args:
      text: The question to put in the user part of the template.

   Returns:
      Whatever ``extract_math_answer`` extracts from the generated text.
   """
   instructions = """
   You are an expert Mathematician.
   You are good at performing and explaining Mathematics concepts in simple words.
   Always end your response with 'The answer is X' where X is the final numerical result.
   Help as much as you can.
   """
   # NOTE(review): the response marker names "Yi" although this cell loads
   # Vicuna; presumably carried over from another model section. Confirm.
   full_prompt = f"#### System: {instructions}\n#### User : \n {text} \n\n#### Response from Yi:"
   generations = model_pipeline(full_prompt, max_new_tokens=500)
   return extract_math_answer(generations[0]["generated_text"])




# Define the test names (for columns)


test_names = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions",
]

# One-row results table: the row is the model, the columns are the tests.
benchmarkMathVi7 = pd.DataFrame(index=["lmsys/vicuna-7b-v1.3"], columns=test_names)

# Grading table. Each entry is (question text as it appears in the prompt,
# label used in the progress message, expected value, accepted absolute error).
math_prompt_prefix = (
    "You are an expert Mathematician. You are good at performing and explaining "
    "Mathematics concepts in simple words. Help as much as you can. "
    "Here is your problem: What is the solution of "
)
math_checks = [
    ("(3+8)-2 ?", "Add/Sub", 9, 0.0),
    ("(6*8)/2 ?", "Mult/Div", 24, 0.0),
    ("4 to the power of 3 ?", "Power", 64, 0.0),
    ("squareroot 144 ?", "SQRT", 12, 0.0),
    # 125/77 = 1.6233..., so a rounded answer such as 1.62 has to be accepted;
    # exact float equality would reject it.
    ("the fractions 5/7 + 10/11 ?", "Fractions", 125 / 77, 1e-2),
]

# Prompt and test to evaluate the LLM
for i, prompt in enumerate(prompts):
    try:
        # Reseed before every prompt, then time the generation of the answer.
        torch.manual_seed(0)
        start_time = time.time()
        math = get_completion_model(prompt)
        end_time = time.time()
        generation_time = end_time - start_time

        # Start from a failing grade so that a missing answer is recorded as
        # "No" instead of reusing the verdict left over from the previous prompt.
        result = False
        if math is not None:
            for question, label, expected, tolerance in math_checks:
                if prompt.startswith(math_prompt_prefix + question):
                    # Compare the value itself: int() would truncate 9.7 to 9.
                    result = abs(math - expected) <= tolerance
                    print(f"{label} test: {result}")
                    break

        # Store the result in the DataFrame using the test name
        benchmarkMathVi7.at["lmsys/vicuna-7b-v1.3", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions, including GPU crashes, and show them in the table
        benchmarkMathVi7.at["lmsys/vicuna-7b-v1.3", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkMathVi7)

In [None]:
# Show the Vicuna 7B math benchmark table as the cell output.
benchmarkMathVi7.head()

In [None]:
# Drop the notebook's reference to the math pipeline, then run the garbage
# collector and ask PyTorch to release its cached CUDA memory.
model_pipeline = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Shell out to nvidia-smi to check how much GPU memory is still in use.
! nvidia-smi

In [None]:
from tabulate import tabulate


class color:
    """ANSI escape sequences used to style text printed by the notebook."""

    # Text attributes and the sequence that ends a styled span.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'



# Column headers for the code benchmark table
test_names_Code = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]

# Column headers for the math benchmark table
test_names_Math = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions",
]

# Vicuna 7B: bold title, then the code table followed by the math table
print(f"{color.BOLD}\n \n EVALUATION for Vicuna 7B: \n \n{color.END}")

for heading, frame, column_titles in (
    ("Code Evaluations for Vicuna 7B:", benchmarkCodeVi7, test_names_Code),
    ("\n \n Math Evaluations for Vicuna 7B:", benchmarkMathVi7, test_names_Math),
):
    print(heading)
    print(tabulate(frame, headers=column_titles, tablefmt='fancy_grid'))

# 🔆 Mistral, by Mistral AI (7B)


mistralai/Mistral-7B-v0.1

In [None]:
# Load Mistral 7B with bfloat16 weights, then move it to the GPU
# (model bigger than 4B).
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.cuda()

# Matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    trust_remote_code=True,
)

In [None]:
# CODE PROMPTING


import pandas as pd
import ast
import time


#To prevent this error: Error: cutlassF: no kernel found to launch!
# Both calls switch off one optimized scaled-dot-product-attention backend
# (memory-efficient and flash) through PyTorch's global backend flags.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)




# Prompts for Code/Python testing: one prompt per programming task, every
# task wrapped in the same instruction text. The leading "#" is part of the
# prompt.
prompts = [
    "#You are an expert Python programmer, and here is your task: "
    "Write a Python function to " + task +
    ". Print just the function without any further informations."
    for task in (
        "print 'Hello, World!'",
        "count the number of vowels in a given string",
        "reverse a given list in-place",
        "remove duplicates from a given list",
        "check if a given number is even or odd",
    )
]

# Test names, used as the column labels of the results table
test_names = [
    f"{label} test"
    for label in (
        "Print Hello World",
        "Vowel Count",
        "Reverse List",
        "Remove Duplicates",
        "Even/Odd",
    )
]

# One-row results table: the row is the model, the columns are the tests
benchmarkCodeMi7 = pd.DataFrame(columns=test_names, index=["mistralai/Mistral-7B-v0.1"])






import contextlib
import io


def _reverses_list(func, items):
    """Return True when *func* reverses *items*, in place or via its return value."""
    expected = items[::-1]
    returned = func(items)
    # An in-place reversal usually returns None and mutates its argument, so
    # fall back to the argument when nothing is returned.
    return (items if returned is None else returned) == expected


# Text shared by every code prompt; the task description follows it.
code_prompt_prefix = (
    "#You are an expert Python programmer, and here is your task: "
    "Write a Python function to "
)

# Generate a function for every prompt, run it against the matching checks and
# record "Yes"/"No" plus the generation time (or the error) in the table.
for i, prompt in enumerate(prompts):
    try:
        # Measure the time it took to generate the code
        #torch.manual_seed(0)
        start_time = time.time()
        # Use the pre-loaded model
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_length=128)
        code = tokenizer.decode(outputs[0], skip_special_tokens=True)
        #print(code) #IF NEED GENERATED CODE PRINTED
        end_time = time.time()
        generation_time = end_time - start_time

        # Keep only the first function definition found in the generated text.
        # The prompt starts with "#", so if it is echoed in the output it
        # parses as a comment.
        try:
            tree = ast.parse(code)
            function_def = next(
                (node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)),
                None,
            )
        except SyntaxError:
            function_def = None

        # Start from a failing grade: output without a usable function is
        # recorded as "No" instead of leaving the table cell empty.
        result = False
        if function_def is not None:
            function_name = function_def.name
            code = ast.unparse(function_def)
            #print(code) #IF NEED GENERATED WORKING (True) CODE PRINTED

            # SECURITY: this executes model-generated code. It runs in a copy
            # of the notebook globals, so it still sees imported modules but
            # cannot overwrite notebook names such as print, prompt or model.
            namespace = dict(globals())
            exec(code, namespace)
            candidate = namespace[function_name]

            # Test the code
            if prompt.startswith(code_prompt_prefix + "print 'Hello, World!'."):
                # Capture what the function prints. Comparing its return value
                # with print(...) would only compare None with None.
                captured = io.StringIO()
                with contextlib.redirect_stdout(captured):
                    candidate()
                result = captured.getvalue().strip() == "Hello, World!"
                print(f"Print Hello World test: {result}")

            elif prompt.startswith(code_prompt_prefix + "count the number of vowels in a given string."):
                result = (candidate("hello") == 2) and (candidate("Maggie Rogers") == 5)
                print(f"Vowel Count test: {result}")

            elif prompt.startswith(code_prompt_prefix + "reverse a given list in-place."):
                result = all(
                    _reverses_list(candidate, items)
                    for items in ([1, 2, 3, 4, 5], [16, 5, 1, 10, 0, 7], [3], [])
                )
                print(f"Reverse List test: {result}")

            elif prompt.startswith(code_prompt_prefix + "remove duplicates from a given list."):
                result = (candidate([1, 1, 2, 2, 3, 3, 4, 5, 5]) == [1, 2, 3, 4, 5]) and (candidate([1, 1, 1, 1, 1, 1]) == [1])
                print(f"Remove Duplicates test: {result}")

            elif prompt.startswith(code_prompt_prefix + "check if a given number is even or odd."):
                result = (candidate(10) == True) and (candidate(11) == False)
                print(f"Even/Odd test: {result}")
        #else:
            #print("invalid",code) #IF NEED GENERATED NOT WORKING (False) CODE PRINTED

        # Store the result in the DataFrame
        benchmarkCodeMi7.at["mistralai/Mistral-7B-v0.1", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions, including GPU crashes and errors raised by the
        # generated function, and show them in the table
        benchmarkCodeMi7.at["mistralai/Mistral-7B-v0.1", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkCodeMi7)

In [None]:
# Show the Mistral 7B code benchmark table as the cell output.
benchmarkCodeMi7.head()

In [None]:
# Drop the notebook's reference to the code-evaluation model, then run the
# garbage collector and ask PyTorch to release its cached CUDA memory.
model = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Shell out to nvidia-smi to check how much GPU memory is still in use.
! nvidia-smi

In [None]:
# Load Mistral 7B for the math evaluation: a transformers pipeline plus the
# matching tokenizer.
model_pipeline = pipeline(
    model="mistralai/Mistral-7B-v0.1",
    device_map="auto",  # automatic placement of the weights on the available GPU/CPU
    torch_dtype=torch.bfloat16,  # load the weights as bfloat16
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    trust_remote_code=True,
)

In [None]:
# MATH PROMPTING


import torch
import pandas as pd
import ast
import time
import re




#To prevent this error: Error: cutlassF: no kernel found to launch!
# Both calls switch off one optimized scaled-dot-product-attention backend
# (memory-efficient and flash) through PyTorch's global backend flags.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)




# Define the prompts for Math testing: one prompt per arithmetic question,
# every question wrapped in the same instruction text.
prompts = [
    "You are an expert Mathematician. You are good at performing and explaining "
    "Mathematics concepts in simple words. Help as much as you can. "
    "Here is your problem: What is the solution of " + question +
    " ? Print just the result without any further informations."
    for question in (
        "(3+8)-2",
        "(6*8)/2",
        "4 to the power of 3",
        "squareroot 144",
        "the fractions 5/7 + 10/11",
    )
]






# define function that extracts float number after "is ", so the answer to the prompt


def extract_math_answer(text):
    """Return the first number that follows the standalone word "is" in *text*.

    Args:
        text: Generated text to scan for an answer such as
            "The answer is 9" or "The answer is 125/77".

    Returns:
        The number as a float. A simple fraction "a/b" is evaluated to its
        quotient. ``None`` is returned when no "is <number>" is present.
    """
    # \b stops the "is" inside words such as "This" or "analysis" from being
    # taken as the keyword. The optional second group is a fraction
    # denominator, so "is 125/77" is not cut down to 125.
    pattern = r'\bis\s+(-?\d+(?:\.\d+)?)(?:\s*/\s*(\d+(?:\.\d+)?))?'
    match = re.search(pattern, text)
    if match is None:
        return None

    value = float(match.group(1))
    denominator = match.group(2)
    # A zero denominator cannot be evaluated; fall back to the numerator.
    if denominator is not None and float(denominator) != 0:
        value /= float(denominator)
    return value




# define function prompting the LLM, takes the prompt as parameter


def get_completion_model(text):
   """Send a math question to ``model_pipeline`` and return the parsed answer.

   The question is embedded in a system/user/response template, up to 500
   new tokens are generated, and the generated text is passed to
   ``extract_math_answer``.

   Args:
      text: The question to put in the user part of the template.

   Returns:
      Whatever ``extract_math_answer`` extracts from the generated text.
   """
   instructions = """
   You are an expert Mathematician.
   You are good at performing and explaining Mathematics concepts in simple words.
   Always end your response with 'The answer is X' where X is the final numerical result.
   Help as much as you can.
   """
   # NOTE(review): the response marker names "Yi" although this cell loads
   # Mistral; presumably carried over from another model section. Confirm.
   full_prompt = f"#### System: {instructions}\n#### User : \n {text} \n\n#### Response from Yi:"
   generations = model_pipeline(full_prompt, max_new_tokens=500)
   return extract_math_answer(generations[0]["generated_text"])




# Define the test names (for columns)


test_names = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions",
]

# One-row results table: the row is the model, the columns are the tests.
benchmarkMathMi7 = pd.DataFrame(index=["mistralai/Mistral-7B-v0.1"], columns=test_names)

# Grading table. Each entry is (question text as it appears in the prompt,
# label used in the progress message, expected value, accepted absolute error).
math_prompt_prefix = (
    "You are an expert Mathematician. You are good at performing and explaining "
    "Mathematics concepts in simple words. Help as much as you can. "
    "Here is your problem: What is the solution of "
)
math_checks = [
    ("(3+8)-2 ?", "Add/Sub", 9, 0.0),
    ("(6*8)/2 ?", "Mult/Div", 24, 0.0),
    ("4 to the power of 3 ?", "Power", 64, 0.0),
    ("squareroot 144 ?", "SQRT", 12, 0.0),
    # 125/77 = 1.6233..., so a rounded answer such as 1.62 has to be accepted;
    # exact float equality would reject it.
    ("the fractions 5/7 + 10/11 ?", "Fractions", 125 / 77, 1e-2),
]

# Prompt and test to evaluate the LLM
for i, prompt in enumerate(prompts):
    try:
        # Reseed before every prompt, then time the generation of the answer.
        torch.manual_seed(0)
        start_time = time.time()
        math = get_completion_model(prompt)
        end_time = time.time()
        generation_time = end_time - start_time

        # Start from a failing grade so that a missing answer is recorded as
        # "No" instead of reusing the verdict left over from the previous prompt.
        result = False
        if math is not None:
            for question, label, expected, tolerance in math_checks:
                if prompt.startswith(math_prompt_prefix + question):
                    # Compare the value itself: int() would truncate 9.7 to 9.
                    result = abs(math - expected) <= tolerance
                    print(f"{label} test: {result}")
                    break

        # Store the result in the DataFrame using the test name
        benchmarkMathMi7.at["mistralai/Mistral-7B-v0.1", test_names[i]] = f"{('Yes' if result else 'No')} ({generation_time:.2f}s)"
    except Exception as e:
        # Catch any exceptions, including GPU crashes, and show them in the table
        benchmarkMathMi7.at["mistralai/Mistral-7B-v0.1", test_names[i]] = f"Error: {e}"


# Print the benchmark table
print(benchmarkMathMi7)

In [None]:
# Show the Mistral 7B math benchmark table as the cell output.
benchmarkMathMi7.head()

In [None]:
# Drop the notebook's reference to the math pipeline, then run the garbage
# collector and ask PyTorch to release its cached CUDA memory.
model_pipeline = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Shell out to nvidia-smi to check how much GPU memory is still in use.
! nvidia-smi

In [None]:
from tabulate import tabulate


class color:
    """ANSI escape sequences used to style text printed by the notebook."""

    # Text attributes and the sequence that ends a styled span.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'



# Column headers for the code benchmark table
test_names_Code = [
    "Print Hello World test",
    "Vowel Count test",
    "Reverse List test",
    "Remove Duplicates test",
    "Even/Odd test",
]

# Column headers for the math benchmark table
test_names_Math = [
    "Addition/Subtraction",
    "Multiplication/Division",
    "Power",
    "Square Root",
    "Fractions",
]

# Mistral 7B: bold title, then the code table followed by the math table
print(f"{color.BOLD}\n \n EVALUATION for Mistral 7B: \n \n{color.END}")

for heading, frame, column_titles in (
    ("Code Evaluations for Mistral 7B:", benchmarkCodeMi7, test_names_Code),
    ("\n \n Math Evaluations for Mistral 7B:", benchmarkMathMi7, test_names_Math),
):
    print(heading)
    print(tabulate(frame, headers=column_titles, tablefmt='fancy_grid'))