# Evaluation of Models on GROQ

## Selecting Models present on GROQ

### Installing Dependencies

In [None]:
!pip install groq
!pip install pandas
!pip install numpy
!pip install evaluate
!pip install langchain-groq

Collecting groq
  Downloading groq-0.18.0-py3-none-any.whl.metadata (14 kB)
Downloading groq-0.18.0-py3-none-any.whl (121 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/121.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.18.0
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  

### Testing available Models

#### API Key of GROQ

In [None]:
# Read the Groq API key from the Colab secret named 'GROQ_API_KEY_4'
# instead of hard-coding it in the notebook.
from google.colab import userdata
GROQ_KEY = userdata.get('GROQ_API_KEY_4')

#### Groq API for Chat Models

In [None]:
from groq import Groq

# Shared Groq client, reused by the cells below.
client = Groq(api_key=GROQ_KEY)

# Ask the API which models exist and keep only their ids.
models = client.models.list()
available_models = []
for entry in models.data:
    available_models.append(entry.to_dict()["id"])

print(available_models)
print(len(available_models))

['llama-3.2-1b-preview', 'llama-3.2-11b-vision-preview', 'deepseek-r1-distill-llama-70b', 'distil-whisper-large-v3-en', 'llama-3.3-70b-versatile', 'llama-3.3-70b-specdec', 'llama-guard-3-8b', 'llama-3.2-90b-vision-preview', 'whisper-large-v3-turbo', 'llama-3.2-3b-preview', 'deepseek-r1-distill-qwen-32b', 'gemma2-9b-it', 'whisper-large-v3', 'qwen-2.5-32b', 'llama3-8b-8192', 'mistral-saba-24b', 'llama-3.1-8b-instant', 'qwen-2.5-coder-32b', 'llama3-70b-8192', 'mixtral-8x7b-32768']
20


In [None]:
# Probe every listed model with one short chat request.
#   models     -> models whose reply was longer than 40 characters
#   chain_test -> models whose request raised (re-tested through ChatGroq below)
models = []
chain_test = []
probe_messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant. When ever user ask you something first give your introduction in more than 20 words",
    },
    {
        "role": "user",
        "content": "Who won the world series in 2020?",
    },
]

for model in available_models:
    try:
        response = client.chat.completions.create(messages=probe_messages, model=model)
        reply = response.choices[0].message.content
        if len(reply) <= 40:
            print(f"Model {model} is not suitable")
            continue
        models.append(model)
    except Exception as e:
        print(f"Model Name: {model} , Error : {e}")
        chain_test.append(model)


print(models)

Model Name: distil-whisper-large-v3-en , Error : Error code: 400 - {'error': {'message': 'The model `distil-whisper-large-v3-en` does not support chat completions', 'type': 'invalid_request_error'}}
Model llama-guard-3-8b is not suitable
Model Name: whisper-large-v3-turbo , Error : Error code: 400 - {'error': {'message': 'The model `whisper-large-v3-turbo` does not support chat completions', 'type': 'invalid_request_error'}}
Model Name: whisper-large-v3 , Error : Error code: 400 - {'error': {'message': 'The model `whisper-large-v3` does not support chat completions', 'type': 'invalid_request_error'}}
['llama-3.2-1b-preview', 'llama-3.2-11b-vision-preview', 'deepseek-r1-distill-llama-70b', 'llama-3.3-70b-versatile', 'llama-3.3-70b-specdec', 'llama-3.2-90b-vision-preview', 'llama-3.2-3b-preview', 'deepseek-r1-distill-qwen-32b', 'gemma2-9b-it', 'qwen-2.5-32b', 'llama3-8b-8192', 'mistral-saba-24b', 'llama-3.1-8b-instant', 'qwen-2.5-coder-32b', 'llama3-70b-8192', 'mixtral-8x7b-32768']


#### ChatGroq API for Chat Models

In [None]:
from langchain_groq import ChatGroq
from langchain.schema import SystemMessage, HumanMessage

# Second chance for the models that failed through the raw Groq client:
# send the same probe through LangChain's ChatGroq wrapper.
models_on_api2 = []

for model in chain_test:
    try:
        llm = ChatGroq(groq_api_key=GROQ_KEY, model_name=model)
        messages = [
            SystemMessage(content="You are a helpful assistant. When ever user ask you something first give your introduction in more than 20 words"),
            HumanMessage(content="Who won the world series in 2020?"),
        ]
        response = llm.invoke(messages)

        # Same suitability rule as the direct-API probe: a reply of 40
        # characters or fewer is rejected.
        if len(response.content) <= 40:
            print(f"Model {model} is not suitable")
        else:
            models_on_api2.append(model)
    except Exception as e:
        print(e)

print(models_on_api2)

Error code: 400 - {'error': {'message': 'The model `distil-whisper-large-v3-en` does not support chat completions', 'type': 'invalid_request_error'}}
Error code: 400 - {'error': {'message': 'The model `whisper-large-v3-turbo` does not support chat completions', 'type': 'invalid_request_error'}}
Error code: 400 - {'error': {'message': 'The model `whisper-large-v3` does not support chat completions', 'type': 'invalid_request_error'}}
[]


### List of selected Models

In [None]:
# Show the models that passed the chat probe, one per line.
for model in models:
    print(model)

llama-3.2-1b-preview
llama-3.2-11b-vision-preview
deepseek-r1-distill-llama-70b
llama-3.3-70b-versatile
llama-3.3-70b-specdec
llama-3.2-90b-vision-preview
llama-3.2-3b-preview
deepseek-r1-distill-qwen-32b
gemma2-9b-it
qwen-2.5-32b
llama3-8b-8192
mistral-saba-24b
llama-3.1-8b-instant
qwen-2.5-coder-32b
llama3-70b-8192
mixtral-8x7b-32768


## Collecting Questions and Solutions

### Mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive so the files under MyDrive/ML are readable.
from google.colab import drive
drive.mount('/content/drive')

# List the files inside each sub-folder of the ML directory.
!ls /content/drive/MyDrive/ML/*/*

Mounted at /content/drive
/content/drive/MyDrive/ML/area_based/305z6cgt.json
/content/drive/MyDrive/ML/area_based/305z6cgt.py
/content/drive/MyDrive/ML/area_based/6sc_glob.json
/content/drive/MyDrive/ML/area_based/6sc_glob.py
/content/drive/MyDrive/ML/area_based/fpta7la6.json
/content/drive/MyDrive/ML/area_based/fpta7la6.py
/content/drive/MyDrive/ML/area_based/_ig0_d4x.json
/content/drive/MyDrive/ML/area_based/_ig0_d4x.py
/content/drive/MyDrive/ML/area_based/irurej39.json
/content/drive/MyDrive/ML/area_based/irurej39.py
/content/drive/MyDrive/ML/area_based/stt89cth.json
/content/drive/MyDrive/ML/area_based/stt89cth.py
/content/drive/MyDrive/ML/area_based/t3u_w6hy.json
/content/drive/MyDrive/ML/area_based/t3u_w6hy.py
/content/drive/MyDrive/ML/area_based/ud7do2gk.json
/content/drive/MyDrive/ML/area_based/ud7do2gk.py
/content/drive/MyDrive/ML/area_based/vb58q98h.json
/content/drive/MyDrive/ML/area_based/vb58q98h.py
/content/drive/MyDrive/ML/area_based/wcp7ntk2.json
/content/drive/MyDrive/

### Collecting Information into a DataFrame

In [None]:
import json
import subprocess

def question_details(f, id):
    """Build the record for one question file and its reference solution.

    The question text is read from the JSON file ``f``; the reference solution
    is the file with the same name and a ``.py`` suffix. The solution is run in
    a child Python process and whatever it prints becomes the expected answer.

    Args:
        f: Path to the question ``.json`` file (must contain a ``"question"`` key).
        id: Identifier stored in the record.

    Returns:
        dict with keys ``id``, ``category``, ``question``, ``code`` and ``answer``.

    Raises:
        subprocess.CalledProcessError: if the reference solution exits with a
            non-zero status.
    """
    import sys
    from pathlib import Path

    question_path = Path(f)

    # The category is the directory directly below the "ML" folder. Matching
    # whole path components avoids false hits on names that merely contain
    # "ML"; fall back to the parent directory when there is no such folder.
    parts = question_path.parts
    if "ML" in parts[:-2]:
        category = parts[parts.index("ML") + 1]
    else:
        category = question_path.parent.name

    with question_path.open("r", encoding="utf-8") as handle:
        question = json.load(handle)["question"]

    # Swap only the suffix: a plain str.replace("json", "py") would also
    # rewrite any other "json" occurring in the directory or file name.
    code = question_path.with_suffix(".py").read_text(encoding="utf-8")

    # Run the solution with the interpreter of the current kernel so it sees
    # the same installed packages.
    interpreter = sys.executable or "python3"
    answer = subprocess.check_output([interpreter, "-c", code]).decode("utf-8").strip()

    return {
        "id": id,
        "category": category,
        "question": question,
        "code": code,
        "answer": answer
    }

In [None]:
import sys
from glob import glob
import concurrent.futures as confu
import pandas as pd
from tqdm.notebook import tqdm

# Build one record per question file, running the reference solutions in parallel.
data = []

# glob() returns paths in arbitrary order; sorting keeps each question's id
# stable from one run to the next, so saved responses stay aligned.
question_files = sorted(glob('/content/drive/MyDrive/ML/area_based/*.json'))

with confu.ThreadPoolExecutor(max_workers=16) as executor:
    # Comprehension variables stay local, so the builtin `id` is not rebound
    # at notebook level.
    futs = [
        executor.submit(question_details, path, idx)
        for idx, path in enumerate(question_files)
    ]
    for fut in tqdm(confu.as_completed(futs), total=len(futs), file=sys.stdout):
        data.append(fut.result())

# Futures finish in arbitrary order; restore id order before building the frame.
data = sorted(data, key=lambda x: int(x['id']))
df = pd.DataFrame(data)
df.head()

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,id,category,question,code,answer
0,0,area_based,Which state has the highest PM2.5 concentratio...,def true_code():\n import pandas as pd\n ...,Chandigarh
1,1,area_based,Which union territory has the lowest PM2.5 con...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir
2,2,area_based,Identify the state with the highest density of...,def true_code():\n import pandas as pd\n ...,Delhi
3,3,area_based,Which state has the third highest density of a...,def true_code():\n import numpy as np\n ...,Puducherry
4,4,area_based,Which state has the highest land area among th...,def true_code():\n import numpy as np\n ...,Uttar Pradesh


## Response Generation Pipeline

### Dependencies

In [None]:
import os
# Presumably quiets TensorFlow's native (C++) log output so later imports do
# not flood the cell output — TODO confirm it is set before TensorFlow loads.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [None]:
import concurrent.futures as confu
from tqdm.notebook import tqdm
import time
import sys
import pandas as pd

### System Prompt

In [None]:
# System message sent with every request made by querying_api. It describes the
# three DataFrames and asks the model to return a `get_response(...)` function
# wrapped in <code>...</code> tags.
system_prompt = """You are an air quality expert Python code generator.
You need to act on 3 dataframes based on the query to answer questions about air quality.

1. `data`, a pandas DataFrame with air quality data from India.
   Data frequency is daily. The data has the following columns and data types:
   {
       'Timestamp': dtype('<M8[ns]'),
       'station': dtype('O'),
       'PM2.5': dtype('float64'),
       'PM10': dtype('float64'),
       'address': dtype('O'),
       'city': dtype('O'),
       'latitude': dtype('float64'),
       'longitude': dtype('float64'),
       'state': dtype('O')
   }

2. `states_data`, a pandas DataFrame of state-wise population and area of India.
   The states_data has the following columns and data types:
   {
       'state': dtype('O'),
       'population': dtype('int64'),
       'area (km2)': dtype('int64')
   }

3. `ncap_funding_data`, a pandas DataFrame of funding given to the cities of India from 2019-2022, under The National Clean Air Program (NCAP).
   {
       'S. No.': dtype('int64'),
       'state': dtype('O'),
       'city': dtype('O'),
       'Amount released during FY 2019-20': dtype('float64'),
       'Amount released during FY 2020-21': dtype('float64'),
       'Amount released during FY 2021-22': dtype('float64'),
       'Total fund released': dtype('float64'),
       'Utilisation as on June 2022': dtype('float64')
   }

You just have to generate the code, don't give any explanation. Always import necessary libraries. Write the code in <code>...</code>
Function signature:
def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    # Your code here

Example:
>>> What's the maximum PM2.5 recorded ever
<code>
import pandas as pd
def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    max_pm25 = data['PM2.5'].max()
    return max_pm25
</code>
"""

In [None]:
models

['llama-3.2-1b-preview',
 'llama-3.2-11b-vision-preview',
 'deepseek-r1-distill-llama-70b',
 'llama-3.3-70b-versatile',
 'llama-3.3-70b-specdec',
 'llama-3.2-90b-vision-preview',
 'llama-3.2-3b-preview',
 'deepseek-r1-distill-qwen-32b',
 'gemma2-9b-it',
 'qwen-2.5-32b',
 'llama3-8b-8192',
 'mistral-saba-24b',
 'llama-3.1-8b-instant',
 'qwen-2.5-coder-32b',
 'llama3-70b-8192',
 'mixtral-8x7b-32768']

### API Request with Retries

In [None]:
def querying_api(model, question, id, i):
    """Ask ``model`` to answer ``question`` and return the reply text.

    The request is sent through the notebook-level ``client`` with
    ``system_prompt`` as the system message. Failures that a later attempt can
    fix (rate limits, timeouts, server errors, errors without an HTTP status
    such as connection problems) are retried after a 60 second pause for as
    long as it takes. Client errors that a retry cannot fix (any other HTTP
    4xx, e.g. an unknown model or a rejected request) are re-raised instead of
    being retried forever.

    Args:
        model: Groq model id to query.
        question: Text sent as the user message.
        id: Question id; used only in the failure log line.
        i: Sample index; used only in the failure log line.

    Returns:
        The ``content`` of the first choice of the completion.

    Raises:
        Exception: the original API error when its ``status_code`` marks it as
            non-retryable.
    """
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": question,
        },
    ]

    while True:
        try:
            response = client.chat.completions.create(messages=messages, model=model)
            break
        except Exception as e:
            print(f"Model {model} failed on question {id}, sample {i}: {e}")
            # SDK status errors expose the HTTP code as `status_code`;
            # anything without one is treated as transient.
            status = getattr(e, "status_code", None)
            retryable = (
                not isinstance(status, int)
                or status in (408, 409, 429)
                or status >= 500
            )
            if not retryable:
                raise
            time.sleep(60)

    return response.choices[0].message.content

### n Sample Generation for Question

In [None]:
def sample_responses(model, question, id, sample):
    """Request ``sample`` completions of one question from ``model`` concurrently.

    Each completion is fetched with ``querying_api`` on a thread pool.

    Returns:
        dict with the question ``id`` and ``generated_samples``, the reply
        texts in the order the requests finished.
    """
    with confu.ThreadPoolExecutor(max_workers=16) as pool:
        pending = [
            pool.submit(querying_api, model, question, id, attempt)
            for attempt in range(sample)
        ]
        # The progress bar is created disabled, so it only passes the
        # finished futures through.
        finished = tqdm(
            confu.as_completed(pending),
            total=len(pending),
            file=sys.stdout,
            disable=True,
        )
        generated_samples = [done.result() for done in finished]

    return {"id": id, "generated_samples": generated_samples}

### Questions for Model

In [None]:
def querying_model(model, sample):
    """Collect ``sample`` completions from ``model`` for every question.

    The questions come from the notebook-level ``data`` list; each one is
    handed to ``sample_responses`` on a thread pool.

    Returns:
        dict with the ``model`` name and ``response``, the per-question
        results sorted by question id.
    """
    with confu.ThreadPoolExecutor(max_workers=16) as pool:
        pending = [
            pool.submit(sample_responses, model, entry["question"], entry["id"], sample)
            for entry in data
        ]
        # Disabled progress bar: it only passes the finished futures through.
        finished = tqdm(
            confu.as_completed(pending),
            total=len(pending),
            file=sys.stdout,
            disable=True,
        )
        question_responses = [done.result() for done in finished]

    # Futures complete in arbitrary order; put the questions back in id order.
    ordered = sorted(question_responses, key=lambda item: int(item['id']))
    return {"model": model, "response": ordered}

### Response for Models

In [None]:
def responses_from_models(sample):
    """Query every model in the notebook-level ``models`` list in parallel.

    Each model is handled by ``querying_model`` on a thread pool, with a
    progress bar written to stdout.

    Returns:
        list of the per-model result dicts, in the order the models finished.
    """
    with confu.ThreadPoolExecutor(max_workers=16) as pool:
        pending = [pool.submit(querying_model, name, sample) for name in models]
        finished = tqdm(confu.as_completed(pending), total=len(pending), file=sys.stdout)
        model_responses = [done.result() for done in finished]
    return model_responses

In [None]:
# Generate 2 samples per question from every selected model. Failed requests
# (e.g. rate limits) are retried by querying_api after a 60 s pause, so this can be slow.
responses = responses_from_models(2)
print(responses)

  0%|          | 0/16 [00:00<?, ?it/s]

Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01jj4gbt00e9p9esyc4dfac5ts` service tier `on_demand` on tokens per minute (TPM): Limit 5000, Used 4947, Requested 638. Please try again in 7.018s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01jj4gbt00e9p9esyc4dfac5ts` service tier `on_demand` on tokens per minute (TPM): Limit 5000, Used 4947, Requested 638. Please try again in 7.016s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
Error code: 429 - {'error': {'message': 'Rate limit reached for model `mixtral-8x7b-32768` in organization `org_01jj4gbt00e9p9esyc4dfac5ts` service tier `on_demand` on tokens per minute (TPM): Limit 5

### Cleaning the Model Responses

In [None]:
def post_process(code):
    """Extract the bare Python source from each raw model reply.

    For every reply, in order:
      1. Reasoning is dropped: everything up to the first ``</think>`` is
         removed. A reply with an opening ``<think>`` that is never closed was
         cut off mid-reasoning and yields an empty string.
      2. If a ``<code>`` tag is present, only the text between it and
         ``</code>`` (or the end of the reply) is kept.
      3. If a Markdown fence is present, only the text between the opening
         fence (with an optional ``python``/``py`` label) and the closing
         fence (or the end of the reply) is kept.
    Replies with neither tag nor fence are returned unchanged, and any prose
    before or after the code is discarded.

    Args:
        code: Iterable of raw reply strings (non-string entries become ``''``).

    Returns:
        list of cleaned source strings, one per input reply, in input order.
    """
    import re

    tagged_block = re.compile(r'<code>(.*?)(?:</code>|\Z)', re.DOTALL)
    fenced_block = re.compile(r'```(?:python|py)?(.*?)(?:```|\Z)', re.DOTALL | re.IGNORECASE)

    ans = []
    for ins in code:
        if not isinstance(ins, str):
            ans.append('')
            continue

        if '</think>' in ins:
            ins = ins.split('</think>', 1)[1]
        elif '<think>' in ins:
            # Unterminated reasoning: the reply never reached an answer.
            ins = ''

        tagged = tagged_block.search(ins)
        if tagged:
            ins = tagged.group(1)

        # Handled after the tag so a fence nested inside <code> (or wrapped
        # around it) is stripped as well.
        fenced = fenced_block.search(ins)
        if fenced:
            ins = fenced.group(1)

        ans.append(ins)
    return ans

### Adding the response to the DataFrame

In [None]:
df

Unnamed: 0,id,category,question,code,answer
0,0,area_based,Which state has the highest PM2.5 concentratio...,def true_code():\n import pandas as pd\n ...,Chandigarh
1,1,area_based,Which union territory has the lowest PM2.5 con...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir
2,2,area_based,Identify the state with the highest density of...,def true_code():\n import pandas as pd\n ...,Delhi
3,3,area_based,Which state has the third highest density of a...,def true_code():\n import numpy as np\n ...,Puducherry
4,4,area_based,Which state has the highest land area among th...,def true_code():\n import numpy as np\n ...,Uttar Pradesh
5,5,area_based,Identify the state that ranks fourth in having...,def true_code():\n import numpy as np\n ...,Mizoram
6,6,area_based,Identify the state with the highest PM10 level...,def true_code():\n import numpy as np\n ...,Arunachal Pradesh
7,7,area_based,Report the total land area of the state with t...,def true_code():\n import numpy as np\n ...,1484
8,8,area_based,Which state has the most uniform PM2.5 levels ...,def true_code():\n import numpy as np\n ...,Maharashtra
9,9,area_based,"Which state with a land area greater than 50,0...",def true_code():\n import numpy as np\n ...,Arunachal Pradesh


In [None]:
# Dump the raw per-model responses for a quick visual check.
for record in responses:
  print(record)

{'model': 'llama-3.1-8b-instant', 'response': [{'id': 0, 'generated_samples': ["<code>\nimport pandas as pd\n\ndef get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):\n    # Group the data by state and calculate the average PM2.5\n    avg_pm25_per_state = data.groupby('state')['PM2.5'].mean()\n    \n    # Merge the average PM2.5 with the states data\n    merged_data = pd.merge(avg_pm25_per_state, states_data, on='state')\n    \n    # Calculate the PM2.5 concentration per square kilometer\n    merged_data['pm25_per_km2'] = merged_data['PM2.5'] / merged_data['area (km2)']\n    \n    # Get the state with the highest PM2.5 concentration per square kilometer\n    max_pm25_per_km2_state = merged_data.loc[merged_data['pm25_per_km2'].idxmax()]\n    \n    return max_pm25_per_km2_state\n</code>", "<code>\nimport pandas as pd\ndef get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):\n    # Merge data and states_dat

In [None]:
len(responses[0]['response'])

10

In [None]:
df

Unnamed: 0,id,category,question,code,answer
0,0,population_based,Report the fifth most polluted states in terms...,def true_code():\n import pandas as pd\n ...,Nagaland
1,1,population_based,Which state has the highest average PM2.5 conc...,def true_code():\n import pandas as pd\n ...,Arunachal Pradesh
2,2,population_based,Which state in India has the lowest number of ...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir
3,3,population_based,Report the state that has the largest populati...,def true_code():\n import numpy as np\n ...,Bihar
4,4,population_based,Which low-population state received the most N...,def true_code():\n import numpy as np\n ...,Chandigarh
5,5,population_based,What percentage of the population lives in are...,def true_code():\n import numpy as np\n ...,29.21960675284611


In [None]:
# NOTE(review): never filled in this cell; kept in case a later cell refers to it.
responses_df = {}

# Add one column per model, holding the cleaned samples for each question.
for record in responses:
    model = record["model"]
    model_df = pd.DataFrame(record["response"])
    model_df[model] = model_df["generated_samples"].apply(post_process)
    model_df = model_df.drop(columns="generated_samples")

    # Drop a column left by an earlier run of this cell first, so re-running
    # the cell refreshes it instead of failing on overlapping column names.
    df = df.drop(columns=model, errors="ignore").merge(model_df, on="id", how="left")
df.head()

Unnamed: 0,id,category,question,code,answer,llama-3.1-8b-instant,llama-3.2-11b-vision-preview,gemma2-9b-it,llama3-70b-8192,qwen-2.5-32b,...,llama-3.3-70b-versatile,deepseek-r1-distill-qwen-32b,llama-3.2-90b-vision-preview,llama-3.3-70b-specdec,qwen-2.5-coder-32b,llama3-8b-8192,mistral-saba-24b,mixtral-8x7b-32768,llama-3.2-3b-preview,llama-3.2-1b-preview
0,0,area_based,Which state has the highest PM2.5 concentratio...,def true_code():\n import pandas as pd\n ...,Chandigarh,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_max_pm25_per...
1,1,area_based,Which union territory has the lowest PM2.5 con...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_lowest_pm25_...
2,2,area_based,Identify the state with the highest density of...,def true_code():\n import pandas as pd\n ...,Delhi,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\ndef get_response(data:...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nimport pandas as pd\n\ndef get_response(d...,[\nimport pandas as pd\n\ndef get_response(dat...,[\ndef identify_highest_density_state():\n ...
3,3,area_based,Which state has the third highest density of a...,def true_code():\n import numpy as np\n ...,Puducherry,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nTo determine which state has the third hi...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nimport pandas as pd\n\ndef get\_response(...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n# Import necessary libraries\nimport pandas...
4,4,area_based,Which state has the highest land area among th...,def true_code():\n import numpy as np\n ...,Uttar Pradesh,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[, \nimport pandas as pd\n\ndef get_response(d...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_top_poluted_...


In [None]:
print(df['llama-3.1-8b-instant'][1][2])


def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    # Group data by state and calculate PM2.5 concentration per km2
    data['pm25_per_km2'] = (data['PM2.5'] / data['area (km2)']).fillna(0)
    
    # Filter union territories (states with population < 1 million)
    union_territories = data.loc[data['population'] < 1e6, ['area (km2)', 'pm25_per_km2', 'state']]
    
    # Group union territories by state and calculate minimum PM2.5 concentration per km2
    min_pm25_per_km2 = union_territories.groupby('state')['pm25_per_km2'].min().reset_index()
    
    # Find the union territory with the lowest PM2.5 concentration per km2
    if not min_pm25_per_km2.empty:
        lowest_pm25_territory = min_pm25_per_km2.loc[min_pm25_per_km2['pm25_per_km2'].idxmin()]
        return lowest_pm25_territory['state']
    else:
        return 'No union territories found'



### Saving Responses DataFrame in case of Loss

In [None]:
# Persist the question/response table to Drive as an indented JSON list of records.
df.to_json("/content/drive/MyDrive/ML/area_based.json", orient="records", indent=4)

### Retrieving Information from the Saved DataFrame

In [None]:
import pandas as pd
# NOTE(review): this reads data.json, while the save cell writes
# area_based.json — confirm which file is intended.
df = pd.read_json("/content/drive/MyDrive/ML/data.json", orient="records")
df

Unnamed: 0,id,category,question,code,answer,gemma2-9b-it,llama-3.1-8b-instant,llama3-8b-8192,llama-3.2-90b-vision-preview,llama-3.2-3b-preview,llama-3.2-11b-vision-preview,qwen-2.5-32b,qwen-2.5-coder-32b,llama-3.2-1b-preview,llama3-70b-8192,llama-3.3-70b-specdec,llama-3.3-70b-versatile,deepseek-r1-distill-llama-70b,deepseek-r1-distill-qwen-32b,mixtral-8x7b-32768
0,0,area_based,Which state has the highest PM2.5 concentratio...,def true_code():\n import pandas as pd\n ...,Chandigarh,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\ndef get_highest_pm25_state(data):\n # Gr...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."
1,1,area_based,Which union territory has the lowest PM2.5 con...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."
2,2,area_based,Identify the state with the highest density of...,def true_code():\n import pandas as pd\n ...,Delhi,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_highest_density_state(data, states_...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."
3,3,area_based,Which state has the third highest density of a...,def true_code():\n import numpy as np\n ...,Puducherry,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_three_highest_density_states(data, ...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...",[\n\nTo determine which state has the third hi...,"[\ndef get\_response(data: pd.DataFrame, state..."
4,4,area_based,Which state has the highest land area among th...,def true_code():\n import numpy as np\n ...,Uttar Pradesh,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[, \ndef get_response(data: pd.DataFrame, stat...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_highest_land...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."
5,5,area_based,Identify the state that ranks fourth in having...,def true_code():\n import numpy as np\n ...,Mizoram,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef nth_lowest_densi...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\nTo solve the problem, we need to determin...","[\n\ndef get_response(data: pd.DataFrame, stat..."
6,6,area_based,Identify the state with the highest PM10 level...,def true_code():\n import numpy as np\n ...,Arunachal Pradesh,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\n# Import necessary libraries\nimport pandas...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\ndef get_response(data: pd.DataFrame, states..."
7,7,area_based,Report the total land area of the state with t...,def true_code():\n import numpy as np\n ...,1484,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_max_pm_area(...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."
8,8,area_based,Which state has the most uniform PM2.5 levels ...,def true_code():\n import numpy as np\n ...,Maharashtra,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\n# Assuming data is t...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...",[\n\nTo determine which state has the most uni...,"[\ndef get\_response(data: pd.DataFrame, state..."
9,9,area_based,"Which state with a land area greater than 50,0...",def true_code():\n import numpy as np\n ...,Arunachal Pradesh,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,"[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...","[\ndef get_response(data: pd.DataFrame, states...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\n\ndef get_response(data: pd.DataFrame, st...","[\n\ndef get_response(data: pd.DataFrame, stat..."


## Evaluation Pipeline

### Dependencies

In [None]:
from evaluate import load
import textwrap
import os
import concurrent.futures as confu

# Load the Hugging Face `code_eval` metric; it is used for every pass@k computation below.
code_eval = load("code_eval")
# Opt-in switch for code_eval, which executes the generated code it is given
# (per the metric's documentation it will not run without this flag).
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/9.18k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/6.10k [00:00<?, ?B/s]

In [None]:
# Reload previously saved per-sample evaluation results.
# NOTE(review): "result.csv" is written by save_category_results() further down, so on a
# fresh kernel this cell presumably only works after that cell has run — confirm.
results_df = pd.read_csv("result.csv")

In [None]:
import pandas as pd
# Load the three raw CSVs that are handed to get_response() in the next cell
# (main data, NCAP funding, state data).
df = pd.read_csv("./raw_data/main_data.csv")
ncap_funding_df = pd.read_csv("./raw_data/NCAP_Funding.csv")
state_df = pd.read_csv("./raw_data/State_data.csv")

In [None]:
import pandas as pd
import numpy as np
def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    """Return the largest-area state among the five states with the highest mean PM2.5.

    Args:
        data: Frame with at least the columns 'state' and 'PM2.5'.
        states_data: Frame with at least the columns 'state' and 'area (km2)'.
        ncap_funding_data: Not used by this function; accepted so the signature
            matches the three-argument call made by the evaluation harness.

    Returns:
        The 'state' value of the row with the largest 'area (km2)' among the
        rows of `states_data` whose state is in the top five by mean PM2.5.
    """
    # Find the top 5 most polluted states based on the average PM2.5
    avg_pm25 = data.groupby('state')['PM2.5'].mean().reset_index()
    top_5_most_polluted = avg_pm25.nlargest(5, 'PM2.5')['state'].tolist()

    # Filter the states data to keep only the top 5 most polluted states
    top_5_states_data = states_data[states_data['state'].isin(top_5_most_polluted)]

    # .loc is label-based, so look the row up by the index *label* of the maximum
    # (idxmax). np.argmax returns a *position* within the filtered frame, which
    # only matches the label by coincidence and otherwise selects the wrong row
    # or raises KeyError.
    max_area_label = top_5_states_data['area (km2)'].idxmax()
    max_area_state = top_5_states_data.loc[max_area_label, 'state']

    return max_area_state
get_response(df,state_df,ncap_funding_df)

'Uttar Pradesh'

In [None]:
# Print every generated sample whose recorded `result` is exactly this syntax-error message,
# to inspect what the failing code looks like.
for i in results_df[results_df['result']=='failed: invalid syntax. Perhaps you forgot a comma? (<string>, line 30)']['sample'].tolist():
  print(i)


import pandas as pd
import numpy as np

def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    # Find the top 5 most polluted states based on the average PM2.5
    avg_pm25 = data.groupby('state')['PM2.5'].mean().reset_index()
    top_5_most_polluted = avg_pm25.nlargest(5, 'PM2.5')['state'].tolist()
    
    # Filter the states data to keep only the top 5 most polluted states
    top_5_states_data = states_data[states_data['state'].isin(top_5_most_polluted)]
    
    # Find the state with the highest land area among the top 5 most polluted states
    max_area_state = top_5_states_data.loc[np.argmax(top_5_states_data['area (km2)']), 'state']
    
    return max_area_state


import pandas as pd

def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
    # Get top 5 most polluted states
    pollution_data = data.groupby('station')['PM2.5'].mean().reset_index()
    polluted_stations = pollution_data.

In [None]:
# Smoke test of code_eval on a toy task with two candidates: the first is not valid
# Python (`def add(,b)`), the second is a correct `add`. k=[1, 2] requests pass@1 and pass@2.
test_cases = ["assert add(2,3)==5"]
candidates = [["import pandas as pd\ndef add(,b): return a*b", "def add(a, b): return a+b"]]
pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2])

In [None]:
results_df.groupby('category')["pass@1"].mean()

Unnamed: 0_level_0,pass@1
category,Unnamed: 1_level_1
area_based,0.0


In [None]:
answer="Uttar Pradesh"

In [None]:
print("""import pandas as pd\nimport numpy as np\ndef get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
  # Find the top 5 most polluted states based on the average PM2.5
  avg_pm25 = data.groupby('state')['PM2.5'].mean().reset_index()
  top_5_most_polluted = avg_pm25.nlargest(5, 'PM2.5')['state'].tolist()
  # Filter the states data to keep only the top 5 most polluted states
  top_5_states_data = states_data[states_data['state'].isin(top_5_most_polluted)]
  # Find the state with the highest land area among the top 5 most polluted states
  max_area_state = top_5_states_data.loc[np.argmax(top_5_states_data['area (km2)']), 'state']
  return max_area_state""")

import pandas as pd
import numpy as np
def get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
  # Find the top 5 most polluted states based on the average PM2.5
  avg_pm25 = data.groupby('state')['PM2.5'].mean().reset_index()
  top_5_most_polluted = avg_pm25.nlargest(5, 'PM2.5')['state'].tolist()
  # Filter the states data to keep only the top 5 most polluted states
  top_5_states_data = states_data[states_data['state'].isin(top_5_most_polluted)]
  # Find the state with the highest land area among the top 5 most polluted states
  max_area_state = top_5_states_data.loc[np.argmax(top_5_states_data['area (km2)']), 'state']
  return max_area_state


In [None]:
# Preview of the reference (test harness) text built around `answer`.
# Fixed: the escape before the "FY 2021-22" line was `\ncap_funding_df`, i.e. a newline
# followed by the misspelt name `cap_funding_df`; it is now `\nncap_funding_df`.
# NOTE: `{answer}` is interpolated verbatim, so a string answer appears unquoted in the
# preview (e.g. `str(Uttar Pradesh)`); the following cell uses repr(answer) instead.
print(f"""import pandas as pd\nimport numpy as np\ndf = pd.read_csv('./raw_data/main_data.csv')\nncap_funding_df = pd.read_csv('./raw_data/NCAP_Funding.csv')
        states_df = pd.read_csv('./raw_data/State_data.csv')\ndf['Timestamp'] = pd.to_datetime(df['Timestamp'])\nncap_funding_df.replace('-', np.nan, inplace=True)\nncap_funding_df['Amount released during FY 2019-20'] = ncap_funding_df['Amount released during FY 2019-20'].astype('float64')\nncap_funding_df['Amount released during FY 2020-21'] = ncap_funding_df['Amount released during FY 2020-21'].astype('float64')\nncap_funding_df['Amount released during FY 2021-22'] = ncap_funding_df['Amount released during FY 2021-22'].astype('float64')
        ncap_funding_df['Utilisation as on June 2022'] = ncap_funding_df['Utilisation as on June 2022'].astype('float64')
        assert str(get_response(df,states_df,ncap_funding_df)) == str({answer})
        """)


        import pandas as pd
        import numpy as np
        df = pd.read_csv('./raw_data/main_data.csv')
        ncap_funding_df = pd.read_csv('./raw_data/NCAP_Funding.csv')
        states_df = pd.read_csv('./raw_data/State_data.csv')
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        ncap_funding_df.replace('-', np.nan, inplace=True)
        ncap_funding_df['Amount released during FY 2019-20'] = ncap_funding_df['Amount released during FY 2019-20'].astype('float64')
        ncap_funding_df['Amount released during FY 2020-21'] = ncap_funding_df['Amount released during FY 2020-21'].astype('float64')
        ncap_funding_df['Amount released during FY 2021-22'] = ncap_funding_df['Amount released during FY 2021-22'].astype('float64')
        ncap_funding_df['Utilisation as on June 2022'] = ncap_funding_df['Utilisation as on June 2022'].astype('float64')
        assert str(get_response(df,states_df,ncap_funding_df)) == str(Uttar Pradesh)
        


In [None]:
# Reference harness for one question: it reads the three CSVs, parses 'Timestamp',
# replaces '-' with NaN and casts the funding columns to float64, then asserts on
# get_response(). repr(answer) wraps a string answer in quotes so the generated
# assert line is valid Python.
refer = textwrap.dedent(f"""
        import pandas as pd
        import numpy as np
        df = pd.read_csv("/content/raw_data/main_data.csv")
        ncap_funding_df = pd.read_csv("/content/raw_data/NCAP_Funding.csv")
        states_df = pd.read_csv("/content/raw_data/State_data.csv")
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        ncap_funding_df.replace('-', np.nan, inplace=True)
        ncap_funding_df['Amount released during FY 2019-20'] = ncap_funding_df['Amount released during FY 2019-20'].astype('float64')
        ncap_funding_df['Amount released during FY 2020-21'] = ncap_funding_df['Amount released during FY 2020-21'].astype('float64')
        ncap_funding_df['Amount released during FY 2021-22'] = ncap_funding_df['Amount released during FY 2021-22'].astype('float64')
        ncap_funding_df['Utilisation as on June 2022'] = ncap_funding_df['Utilisation as on June 2022'].astype('float64')
        assert str(get_response(df,states_df,ncap_funding_df)) == str({repr(answer)})
    """)
# Evaluate a single hand-copied candidate against that harness; k=[1] requests pass@1 only.
pass_at_k, result = code_eval.compute(
                references=[refer],
                predictions=[["""import pandas as pd\nimport numpy as np\ndef get_response(data: pd.DataFrame, states_data: pd.DataFrame, ncap_funding_data: pd.DataFrame):
  # Find the top 5 most polluted states based on the average PM2.5
  avg_pm25 = data.groupby('state')['PM2.5'].mean().reset_index()
  top_5_most_polluted = avg_pm25.nlargest(5, 'PM2.5')['state'].tolist()
  # Filter the states data to keep only the top 5 most polluted states
  top_5_states_data = states_data[states_data['state'].isin(top_5_most_polluted)]
  # Find the state with the highest land area among the top 5 most polluted states
  max_area_state = top_5_states_data.loc[np.argmax(top_5_states_data['area (km2)']), 'state']
  return max_area_state"""]],
                timeout = 600,
                num_workers = 16,
                k=[1],
            )

In [None]:
result[0]

[(0, {'task_id': 0, 'passed': True, 'result': 'passed', 'completion_id': 0})]

In [None]:
# Load the saved responses for the area_based questions: one row per question and
# one column per model, next to the id/category/question/code/answer columns.
# NOTE(review): the path lives under /content/drive/MyDrive — presumably Google Drive
# has to be mounted before this cell runs; confirm.
response_df = pd.read_json("/content/drive/MyDrive/ML/area_based.json", orient="records")
response_df.head()

Unnamed: 0,id,category,question,code,answer,llama-3.1-8b-instant,llama-3.2-11b-vision-preview,gemma2-9b-it,llama3-70b-8192,qwen-2.5-32b,...,llama-3.3-70b-versatile,deepseek-r1-distill-qwen-32b,llama-3.2-90b-vision-preview,llama-3.3-70b-specdec,qwen-2.5-coder-32b,llama3-8b-8192,mistral-saba-24b,mixtral-8x7b-32768,llama-3.2-3b-preview,llama-3.2-1b-preview
0,0,area_based,Which state has the highest PM2.5 concentratio...,def true_code():\n import pandas as pd\n ...,Chandigarh,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_max_pm25_per...
1,1,area_based,Which union territory has the lowest PM2.5 con...,def true_code():\n import pandas as pd\n ...,Jammu and Kashmir,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_lowest_pm25_...
2,2,area_based,Identify the state with the highest density of...,def true_code():\n import pandas as pd\n ...,Delhi,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\ndef get_response(data:...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nimport pandas as pd\n\ndef get_response(d...,[\nimport pandas as pd\n\ndef get_response(dat...,[\ndef identify_highest_density_state():\n ...
3,3,area_based,Which state has the third highest density of a...,def true_code():\n import numpy as np\n ...,Puducherry,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nTo determine which state has the third hi...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\nimport pandas as pd\n\ndef get\_response(...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n# Import necessary libraries\nimport pandas...
4,4,area_based,Which state has the highest land area among th...,def true_code():\n import numpy as np\n ...,Uttar Pradesh,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\nimport numpy as np\n\n...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,...,[\nimport pandas as pd\n\ndef get_response(dat...,[\n\n\nimport pandas as pd\n\ndef get_response...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,"[, \nimport pandas as pd\n\ndef get_response(d...",[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\n\ndef get_response(dat...,[\nimport pandas as pd\ndef get_response(data:...,[\nimport pandas as pd\n\ndef get_top_poluted_...


In [None]:
def pass_at_K_on_df(n, df):
    """Run `code_eval` for every (question, model) pair in `df` and flatten the outcome.

    For each row a reference harness is built that loads the raw CSVs, prepares
    the funding columns and asserts that `str(get_response(...))` equals the
    row's expected answer. Each model's samples are then scored against it.

    Args:
        n: List of k values forwarded to `code_eval.compute(k=...)`. It must
            include 1, because every output record stores `pass@1`.
        df: Frame with the columns 'id', 'answer', 'question', 'category',
            'code' and one column per entry of the notebook-level `models`;
            each model cell is presumably a list of code strings — confirm.

    Returns:
        A list with one dict per evaluated sample, holding the keys 'id',
        'question', 'answer', 'category', 'model', 'true_code', 'pass@1',
        'result', 'status' and 'sample'.
    """
    # NOTE(review): `models` is read from the notebook's global scope and must be an
    # iterable of column names of `df`. Near the top of the notebook the same name is
    # bound to the Groq model listing — confirm it has been rebound to a list of names.
    results = []
    for _, row in df.iterrows():

        question_id = row["id"]
        answer = row["answer"]
        question = row["question"]
        category = row["category"]
        code = row["code"]

        # Embed the expected answer as a quoted string literal. Splicing the raw value
        # into the source turned a string answer such as Uttar Pradesh into
        # `str(Uttar Pradesh)`, a syntax error that failed every sample.
        expected = repr(str(answer))

        refer = textwrap.dedent(f"""
        import pandas as pd
        import numpy as np
        df = pd.read_csv('./raw_data/main_data.csv')
        ncap_funding_df = pd.read_csv('./raw_data/NCAP_Funding.csv')
        states_df = pd.read_csv('./raw_data/State_data.csv')
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        ncap_funding_df.replace('-', np.nan, inplace=True)
        ncap_funding_df['Amount released during FY 2019-20'] = ncap_funding_df['Amount released during FY 2019-20'].astype('float64')
        ncap_funding_df['Amount released during FY 2020-21'] = ncap_funding_df['Amount released during FY 2020-21'].astype('float64')
        ncap_funding_df['Amount released during FY 2021-22'] = ncap_funding_df['Amount released during FY 2021-22'].astype('float64')
        ncap_funding_df['Utilisation as on June 2022'] = ncap_funding_df['Utilisation as on June 2022'].astype('float64')
        assert str(get_response(df,states_df,ncap_funding_df)) == {expected}
        """)

        for model in models:
            sample = row[model]
            pass_at_k, result = code_eval.compute(
                references=[refer],
                predictions=[sample],
                timeout = 600,
                num_workers = 16,
                k=n,
            )

            # result[0] holds one (completion_id, outcome) pair per sample of the single
            # task; iterate over all of them instead of assuming exactly two samples.
            for completion_id, outcome in result[0]:
                results.append({
                    'id': question_id,
                    'question': question,
                    'answer': answer,
                    'category': category,
                    'model': model,
                    'true_code': code,
                    'pass@1': pass_at_k['pass@1'],
                    'result': outcome['result'],
                    'status': outcome['passed'],
                    'sample': sample[completion_id],
                })

    return results

In [None]:
def save_category_results(df):
    """Score `df` at k=1 and persist the per-sample outcomes.

    Calls `pass_at_K_on_df([1], df)`, wraps the returned records in a
    DataFrame, writes it to "result.json" (records orientation, indent 4)
    and to "result.csv" in the working directory, and returns the DataFrame.
    """
    scored = pd.DataFrame(pass_at_K_on_df([1], df))
    scored.to_json("result.json", orient="records", indent=4)
    scored.to_csv("result.csv")
    return scored

In [None]:
result = save_category_results(response_df)

In [None]:
# def retry_code_eval(model, refer, response_code, n):
#     model_results = {}
#     while True:
#         try:
#             pass_at_k, result = code_eval.compute(
#                 references=[refer],
#                 predictions=[response_code],
#                 timeout = 30,
#                 num_workers = 16,
#                 k=n,
#             )
#             model_results[model] = pass_at_k
#             break
#         except Exception as e:
#             print(e)
#             time.sleep(1)
#     return model_results

In [None]:
# def models_pass_at_k(id, question, category, refer, n):
#     results = {}
#     results['id'] = id
#     results['question'] = question
#     results['category'] = category
#     with confu.ThreadPoolExecutor(
#         max_workers=16
#     ) as executor:
#         futs = []
#         for model in models:
#             futs.append(executor.submit(retry_code_eval, model, refer, df[f'{model}'][id], n))
#         for f in tqdm(confu.as_completed(futs), total=len(futs), file=sys.stdout):
#             results.update(f.result())
#         print(results)
#     return results

### pass@k generating Function

In [None]:
# def pass_at(n):
#     results = []
#     with confu.ThreadPoolExecutor(
#         max_workers=16
#     ) as executor:
#         futs =[]
#         for _, row in df.iterrows():
#             answer = row["answer"]
#             refer = textwrap.dedent(f"""
#             import pandas as pd
#             import numpy as np
#             df = pd.read_csv('/content/raw_data/main_data.csv')
#             df['Timestamp'] = pd.to_datetime(df['Timestamp'])
#             assert get_response(df) == {answer}
#             """)
#             id = row["id"]
#             question = row["question"]
#             category = row["category"]
#             futs.append(executor.submit(models_pass_at_k, id, question, category, refer, n))
#         for f in tqdm(confu.as_completed(futs), total=len(futs), file=sys.stdout):
#             results.append(f.result())
#     return sorted(results, key = lambda x: int(x["id"]))

In [None]:
def pass_at(n):
    """Compute pass@k per model for every row of the notebook-level `df`.

    Args:
        n: List of k values forwarded to `code_eval.compute(k=...)`.

    Returns:
        A list with one dict per row containing 'id', 'question', 'category'
        and, for each entry of the global `models`, the pass@k dict returned
        by `code_eval.compute`.
    """
    # NOTE(review): this iterates the global `df`, which an earlier cell binds to
    # main_data.csv; it presumably has to be the responses frame (with 'answer',
    # 'id', 'question', 'category' and per-model columns) — confirm before running.
    # NOTE(review): the harness calls get_response(df) with one argument, while the
    # samples shown elsewhere in this notebook define three parameters — confirm
    # which sample format this function is meant for.
    results = []
    for _, row in df.iterrows():
        # Embed the expected answer as a quoted string literal; interpolating the raw
        # value produced invalid Python for string answers (e.g. `str(Uttar Pradesh)`).
        expected = repr(str(row["answer"]))
        refer = textwrap.dedent(f"""
        import pandas as pd
        import numpy as np
        df = pd.read_csv('/content/raw_data/main_data.csv')
        df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        assert str(get_response(df)) == {expected}
        """)
        model_results = {
            "id": row["id"],
            "question": row["question"],
            "category": row["category"],
        }
        for model in models:
            sample = row[model]
            pass_at_k, result = code_eval.compute(
                references=[refer],
                predictions=[sample],
                timeout = 600,
                num_workers = 16,
                k=n,
            )
            model_results[model] = pass_at_k
            print(result)
            print(pass_at_k)
        print(model_results)
        results.append(model_results)
    return results

In [None]:
result_data = pass_at([1, 2, 5])

In [None]:
result_df = pd.DataFrame(result_data)
result_df.head()