# installing needed packages and libraries

In [1]:
# %pip install --upgrade optimum
# %pip install onnxruntime onnxruntime-tools


In [2]:
# %%capture/
# %pip install accelerate bitsandbytes transformers trl

In [2]:
import os
import torch
import pandas as pd
import gc
import re
import json
import numpy as np 
import matplotlib.pyplot as plt
import subprocess as sp

# import sys
# import bitsandbytes as bnb
# from bitsandbytes import quantize_model
# from optimum.intel.neural_compressor import INCModel

from accelerate import Accelerator
from onnxruntime_tools import optimizer
from torch.cuda.amp import autocast
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from transformers.convert_graph_to_onnx import convert
from jiwer import wer
from IPython.display import clear_output






# Dataset files


In [4]:
# we insisted to write down everyfile name to inform all datasets we are using inclusively.
datasets = ["atis",         # 809 instances, 
            "chime4",       # 1320
            "coraal",       # 170
            "cv",           # 2000
            "lrs2",         # 2259
            "ls_clean",     # 2620
            "ls_other",     # 2939
            "swbd",         # 2000
            "td3",          # 1155
            "wsj_score",    # 836
            ]

punctuation_to_remove = ',.\"!?:;$'
punctuation_to_replace = '-'

In [5]:
device = "cuda:0"

# Loading LLaMa 3.1 Base_model

In [6]:
# loading Model from Hugging Face hub
base_model = "meta-llama/Meta-Llama-3.1-8B"

#Quanitization Configuration 
compute_dtype = getattr(torch, "float16")

# In our case, we create 4-bit quantization with NF4 type configuration using BitsAndBytes.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
# loading Model from Hugging Face hub
base_model = "meta-llama/Llama-3.1-8B-Instruct"

#Quanitization Configuration 
compute_dtype = getattr(torch, "float16")

# In our case, we create 4-bit quantization with NF4 type configuration using BitsAndBytes.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

In [7]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# import torch

# # Loading the model from Hugging Face hub with 4-bit quantization
# base_model = "meta-llama/Meta-Llama-3.1-8B"

# # Quantization Configuration
# compute_dtype = torch.float16  # Use FP16 precision for compute operations

# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # Enable 4-bit quantization
#     bnb_4bit_quant_type="nf4",  # Type of 4-bit quantization
#     bnb_4bit_compute_dtype=compute_dtype,  # Use FP16 for computations
#     bnb_4bit_use_double_quant=False  # Avoid double quantization
# )

# # Load the model with device mapping
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     quantization_config=quant_config,
#     device_map="auto"  # Automatically map layers to CPU/GPU as needed
# )

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(base_model)

# # Ensure caching is disabled if needed
# model.config.use_cache = False

# # Avoid manually moving the model to GPU, let the device_map handle it


### LLama prompt

In [8]:
# def llama_gensec(line):
#     few_shot_prompt = """<s>[INST] You need to do language model rescoring in ASR. Given the 5-best hypotheses, you need to report the true transcription from the 5-best hypotheses. DO NOT WRITE ANYTHING BESIDES THE true hypothesis absolutely nothing else. Just simply say the true hypothesis. Don't say you can help me with this or anything like that. Just say the true hypothesis. your output should be a single sentence and that sentence should be the prediction it is important that you do not write anything else besides the true hypothesis as it will all be added to a csv file and we need to make sure that the csv file is correct. DONT EVEN CONFIRM JUST PUT THE RESPONSE. If none of the hypotheses make sense just generate one that is logical. YOU MUST SAY "The true hypothesis is:" followed by the hypothesis.

#     Here are some examples of what you might take as an input: (REMEMBER THESE ARE JUST EXAMPLES, SO THEY ARE NOT THE SAME AS THE INPUT YOU WILL RECEIVE.)
    
#     Speech recognition: "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july onest"
#     Truth: The true hypothesis is: list the flights from dallas to baltimore arriving july first


#     Speech recognition: "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june ten", "i would like to fly from san diego to houston on june ten"
#     Truth: The true hypothesis is: i would like to fly from san diego to houston on june tenth

#     Speech recognition: "list flights from houston to memphis june twenty-nineth", "list flights from houston to memphis june twenty-nineth", "list flights from houston to memphis june twenty-nine", "list flights from houston to memphis june twenty-nineth", "list flights from houston to memphis june twenty-nineth" 
#     Truth: list flights from houston to memphis june twenty ninth"

#     Speech recognition: "about half these managers are in the u s", "about half these managers are in the us", "about half these managers are in the us", "about half these managers are in the us", "about half of these managers are in the us"
#     Truth: about half these managers are in the us


#     Don't say you can help me with this or please provide the true hypothesis. Just say the true hypothesis AND DO NOT ADD ANY IRRELEVANT INFORMATION. Your output should be a single sentence and that sentence should be the prediction it is important that you do not write anything else besides the true hypothesis as it will all be added to a csv file and we need to make sure that the csv file is correct. DONT EVEN CONFIRM JUST PUT THE RESPONSE. If none of the hypotheses make sense just generate one that is logical. YOU MUST SAY "Truth: The true hypothesis is:" followed by the hypothesis.
#  """

#     return few_shot_prompt + line + "[/INST] Truth: The true hypothesis is: "




In [9]:
# def llama_gensec(line):
#     prompt = """<s>[INST] You will rescore ASR 5-best hypotheses and report the correct transcription. 
#     Output the result in the format <prediction>correct transcription</prediction> without adding any extra information. 
#     If none of the given hypotheses are correct, generate a logical one.

# Examples for consideration:
# Speech recognition: "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july onest"
# <prediction>list the flights from dallas to baltimore arriving july first</prediction>

# Speech recognition: "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june ten", "i would like to fly from san diego to houston on june ten"
# <prediction>i would like to fly from san diego to houston on june tenth</prediction>

# Respond only with the correct transcription inside the <prediction> tags.

# Now, here's the input:
# """

#     return prompt + line + '[/INST]<prediction>'


In [8]:
def llama_gensec(line):
    prompt = """<s>[INST]As an ASR Corrector, your task is to enhance ASR transcriptions. You will receive 5 hypotheses from the ASR system. Your goal is to:

    1. Select the most accurate and logical transcription, ensuring it is grammatically correct and has no spelling mistakes.
    2. If none of the hypotheses are correct, generate a new transcription that is logical and grammatically correct with no spelling mistakes.
    3. Synthesize a unified hypothesis by identifying and integrating the most frequently recurring sequences of sentences across all hypotheses, prioritizing sequences that align cohesively with the broader contextual meaning of the sentences. Additionally, assign slightly higher importance to sequences appearing at the beginning of the hypotheses. In cases where no common sequences are found among the hypotheses, select the hypothesis that exhibits logical coherence.
    4. The correct transcription should have the same structure as the hypotheses.
    5. Do not output any explanation or extra information, and do not repeat hypotheses in the response.
    6. Ensure the output is in the exact format: <prediction>[the correct transcription]</prediction>.
    7. Do not include "No", "correct transcription", or anything else outside the specified format.

    Here are examples of what you should do:
        
    Example 1:
    Speech recognition: "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july onest"
    <prediction>list the flights from dallas to baltimore arriving july first</prediction>

    Example 2:
    Speech recognition: "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june ten", "i would like to fly from san diego to houston on june ten"
    <prediction>i would like to fly from san diego to houston on june tenth</prediction>

    Now, here are the ASR hypotheses:
"""
    return prompt + line + '[/INST]<prediction>'


Next, we will load the tokenizer

In [9]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.generation_config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Running LLaMa 

In [16]:
# import torch
# import pandas as pd
# import json
# import re
# from transformers import pipeline

# # Ensure GPU memory is freed before running
# torch.cuda.empty_cache()

# # Initialize the text-generation pipeline
# pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

# batch_size = 32  # Adjust based on available GPU memory
# max_length = 1024  # Max number of tokens in a single input sequence the model can handle

# # Function to process a batch of inputs and return the results
# # def process_batch(batch, batch_outputs):
# #     results = []
# #     for question, res in zip(batch, batch_outputs):
# #         try:
# #             # Extracting all occurrences of text between <prediction> and </prediction>
# #             matches = re.findall(r"<prediction>(.*?)</prediction>", res['generated_text'])
# #             if matches:
# #                 prediction = matches[-1]  # Get the last prediction
# #             else:
# #                 prediction = "No prediction found"  # Fallback in case no tags are found

# #         except IndexError:
# #             prediction = "No valid hypothesis found"
        
# #         truth = "Yes" if prediction == question['output'].strip().lower() else "No"
# #         results.append({
# #             'input': '\n'.join(question['input']),
# #             'prediction': prediction,
# #             'output': question['output'],
# #             'match': truth
# #         })
# #     return results


# def process_batch(batch, batch_outputs):
#     results = []
#     for question, res_list in zip(batch, batch_outputs):
#         try:
#             # Since batch_outputs is a list of lists (one per batch), get the first result in each list
#             generated_text = res_list[0]['generated_text'] if isinstance(res_list, list) else res_list['generated_text']
            
#             # Extracting all occurrences of text between <prediction> and </prediction>
#             matches = re.findall(r"<prediction>(.*?)</prediction>", generated_text)
#             if matches:
#                 prediction = matches[-1]  # Get the last prediction
#             else:
#                 prediction = "No prediction found"  # Fallback in case no tags are found

#         except (IndexError, KeyError):
#             prediction = "No valid hypothesis found"
        
#         truth = "Yes" if prediction == question['output'].strip().lower() else "No"
#         results.append({
#             'input': '\n'.join(question['input']),
#             'prediction': prediction,
#             'output': question['output'],
#             'match': truth
#         })
#     return results                                                                                                                                                                                                                                                                                                                                                                                  


# # file_number = 1
# for file in datasets:

#     llama_df = pd.DataFrame(columns=['input', 'output', 'prediction', 'match'])
#     i = 1
#     with open(f"./Test/test_test_{file}.json") as jsonFile:
#         test_data = json.load(jsonFile)

#         # Process data in batches
#         batch = []
#         for question in test_data:
#             print(f'\r{i}', end='', flush=True)
#             i += 1
#             # print(i)
#             hypotheses = question['input']
#             test_txt = "\n".join(hypotheses)

#             batch.append(question)

#             # When batch size is reached, process the batch
#             if len(batch) == batch_size:
#                 inputs = [llama_gensec("\n".join(q['input'])) for q in batch]
#                 batch_outputs = pipe(inputs, max_length=max_length, num_return_sequences=1)

#                 # Process and append results to llama_df
#                 batch_results = process_batch(batch, batch_outputs)
#                 llama_df = pd.concat([llama_df, pd.DataFrame(batch_results)], ignore_index=True)

#                 batch = []  # Clear the batch for the next round

#         # Process the remaining batch (if any)
#         if batch:
#             inputs = [llama_gensec("\n".join(q['input'])) for q in batch]
#             batch_outputs = pipe(inputs, max_length=max_length, num_return_sequences=1)
#             batch_results = process_batch(batch, batch_outputs)
#             llama_df = pd.concat([llama_df, pd.DataFrame(batch_results)], ignore_index=True)

#         # Save output after processing each dataset
#         llama_df.to_csv(f'llama_Gen_output/{file}.csv', index=False)
#         print(f'file: {file} done.')


In [10]:
import torch
import pandas as pd
import json
import re
from transformers import pipeline

# Ensure GPU memory is freed before running
torch.cuda.empty_cache()

# Initialize the text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

batch_size = 32  # Adjust based on available GPU memory
max_length = 1024  # Max number of tokens in a single input sequence the model can handle

def process_batch(batch, batch_outputs):
    results = []
    for question, res_list in zip(batch, batch_outputs):
        try:
            # Since batch_outputs is a list of lists (one per batch), get the first result in each list
            generated_text = res_list[0]['generated_text'] if isinstance(res_list, list) else res_list['generated_text']
            # print(f"Original generated text:\n{generated_text}")

            # First, extract all occurrences of text between <prediction> and </prediction>
            matches = re.findall(r"<prediction>(.*?)</prediction>", generated_text)

            if matches:
                prediction = matches[-1]  # Get the last prediction

                # Now apply punctuation removal, lowercasing, and other transformations
                prediction = (
                    prediction.translate(str.maketrans('', '', punctuation_to_remove))  # Remove specified punctuation
                                  .translate(str.maketrans(punctuation_to_replace, ' '))  # Replace specified punctuation with spaces
                                  .strip()  # Remove leading and trailing spaces
                                  .lower()  # Convert to lowercase
                                  .split('\n')[0]  # Get the first line only
                )
            else:
                prediction = "No prediction found"  # Fallback in case no tags are found

        except (IndexError, KeyError):
            prediction = "No valid hypothesis found"
        
        truth = "Yes" if prediction == question['output'].strip().lower() else "No"
        # print(f"Prediction: {prediction}")
        # print(f"truth: {truth}")
        # print("===============================================================================================")
        results.append({
            'input': '\n'.join(question['input']),
            'prediction': prediction,
            'output': question['output'],
            'match': truth
        })
    return results
                                                                                                                                                                                                                                                                                                                                             

# Loop through datasets
for file in datasets:
    llama_df = pd.DataFrame(columns=['input', 'output', 'prediction', 'match'])
    i = 1
    with open(f"./Test/test_test_{file}.json") as jsonFile:
        test_data = json.load(jsonFile)

        # Process data in batches
        batch = []
        for question in test_data:
            print(f'\r{i}', end='', flush=True)
            i += 1
            hypotheses = question['input']
            test_txt = "\n".join(hypotheses)

            batch.append(question)

            # When batch size is reached, process the batch
            if len(batch) == batch_size:
                inputs = [llama_gensec("\n".join(q['input'])) for q in batch]

                # Instead of using max_length, use max_new_tokens
                batch_outputs = pipe(inputs, max_new_tokens=100, pad_token_id=tokenizer.pad_token_id)

                # Process and append results to llama_df
                batch_results = process_batch(batch, batch_outputs)
                llama_df = pd.concat([llama_df, pd.DataFrame(batch_results)], ignore_index=True)

                batch = []  # Clear the batch for the next round

        # Process the remaining batch (if any)
        if batch:
            inputs = [llama_gensec("\n".join(q['input'])) for q in batch]

            # Use max_new_tokens instead of max_length here as well
            batch_outputs = pipe(inputs, max_new_tokens=100, pad_token_id=tokenizer.pad_token_id)
            batch_results = process_batch(batch, batch_outputs)
            llama_df = pd.concat([llama_df, pd.DataFrame(batch_results)], ignore_index=True)

        # Save output after processing each dataset
        llama_df.to_csv(f'llama_Gen_output/{file}.csv', index=False)
        print(f'file: {file} done.')


352

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


809file: atis done.
1320file: chime4 done.
170file: coraal done.
2000file: cv done.
2259file: lrs2 done.
2620file: ls_clean done.
2939file: ls_other done.
2000file: swbd done.
1155file: td3 done.
836file: wsj_score done.


In [1]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [10]:

for file in datasets:
    llama_df = pd.DataFrame(columns=['input', 'output', 'prediction', 'match'])
    i = 1
    with open("./Test/test_test_"+file + ".json") as jsonFile:
        test_data = json.load(jsonFile)
        # print(len(test_data))
        for question in test_data:
            #print(i)
            i+=1
            hypotheses = question['input']
            test_txt = ""
            for line in hypotheses:
                test_txt += line + '\n'
            
            inputs = tokenizer(llama_gensec(test_txt), return_tensors="pt").to(device)
            outputs = model.generate(**inputs, max_new_tokens=200)
            res = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # print(res)
            # print(res)
            try:
                # Extracting all occurrences of text between <prediction> and </prediction>
                matches = re.findall(r"<prediction>(.*?)</prediction>", res)
                if matches:
                    res = matches[-1]  # Get the last prediction
                else:
                    res = "No prediction found"  # Fallback in case no tags are found

                # print(res)
            except IndexError:
                res = "No valid hypothesis found"
            
            truth = "Yes" if res == question['output'].strip().lower() else "No"
            llama_df = llama_df._append({'input': test_txt, 'prediction': res, 'output': question['output'], 'match': truth}, ignore_index=True)

        # downloading output after running each dataset
        
        llama_df.to_csv('llama_Gen_output/'+f'{file}.csv', index=False)  
        print(f'file: {file} done.')


NameError: name 'tokenizer' is not defined

In [13]:
print(llama_df.head(200))
# llama_df.to_csv('llama_Gen_output/'+f'{file}.csv', index=False)  

Empty DataFrame
Columns: [input, output, prediction, match]
Index: []


In [34]:
# Calculate WER 
def calculate_wer(pre, ref):
    wer_score = editdistance.eval(pre, ref) / len(ref)
    return wer_score
def calculate_wer_df(df):
    length  = len(df.index) 
    ignore = 0
    before = 0


    for index, row in df.iterrows():
        hyp = row['prediction']
        ground_truth = row['output']
        best_hypo = re.sub(r'[^\w\s]|[\d]', '', hyp)
        ground_truth = re.sub(r'[^\w\s]|[\d]', '', ground_truth)

        try:
            wer_best_hypo = calculate_wer(best_hypo.split(), ground_truth.split())
        except Exception as e:
            print("CHECK HERE")
            print (ground_truth)
            print (best_hypo)
            ignore += 1
            continue

        before = before + wer_best_hypo
    # Return length and calculated error
    error = before / (length - ignore) if (length - ignore) > 0 else 0
    return length, error
def load_data(name):
    dfs = []
    for file in os.listdir(name):
        dir = os.path.join(name, file)
        df = pd.read_csv(dir)
        dfs.append(df)
    return dfs
files = ['atis.csv',
        'chime4.csv',
        'coraal.csv',
        'cv.csv',
        'lrs2.csv',
        'ls_clean.csv',
        'ls_other.csv',
        'swbd.csv',
        'td3.csv',
        'wsj_score.csv']
llama_dfs = load_data('llama_Gen_output')
output_data = []

i = 0
for df in llama_dfs:
    file_name = files[i]
    length, error = calculate_wer_df(df)
    output_data.append([file_name, length, error])
    i = i + 1

# Write to a CSV file
with open('llama_Gen_output/llama_wer.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['File Name', 'Length', 'Error'])  # Write header
    writer.writerows(output_data)  # Write data rows

print("Data successfully written to llama_wer.csv")

CHECK HERE
list all us air flights from miami to cleveland leaving on sunday afternoon
list all us air flights from miami to cleveland leaving on sunday afternoon
CHECK HERE
list the flights from dallas to baltimore arriving july first
list flights from houston to memphis june twenty ninth
CHECK HERE
i would like to fly from san diego to houston on june tenth
 
CHECK HERE
is there an american airlines flight from houston to newark on june tenth after six pm
is there an american airlines flight from houston to newark on june tenth after six pm
CHECK HERE
what is the lowest fare from bwi to salt lake city
what is the lowest fare from bwi to salt lake city
CHECK HERE
list a flight on delta airlines from toronto to san diego
list a flight on delta airlines from toronto to san diego
CHECK HERE
list the alaska airlines flights departing from burbank
list the alaska airlines flights departing from burbank
CHECK HERE
list last wednesday flight from oakland to salt lake city
list last wednesday

NameError: name 'csv' is not defined

# Gemma

In [55]:
# freeing the cache for a new model to be used. 
gc.collect()
torch.cuda.empty_cache()

Loading model

In [92]:
model_id = "google/gemma-7b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
hf_token = "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"
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, use_auth_token=hf_token)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [90]:
def gemma_gensec(line):
    few_shot_prompt = """<start_of_turn>user You need to do language model rescoring in ASR. Given the 5-best hypotheses, you need to report the true transcription from the 5-best hypotheses. DO NOT WRITE ANYTHING BESIDES THE true hypothesis absolutely nothing else. Just simply say the true hypothesis. Don't say you can help me with this or anything like that. Just say the true hypothesis. your output should be a single sentence and that sentence should be the prediction it is important that you do not write anything else besides the true hypothesis as it will all be added to a csv file and we need to make sure that the csv file is correct. DONT EVEN CONFIRM JUST PUT THE RESPONSE. If none of the hypotheses make sense just generate one that is logical. YOU MUST SAY "The true hypothesis is:" followed by the hypothesis.


    Here are some examples of what you might take as an input:

    <start_of_turn>user "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july onest", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july one", "list the flights from dallas to baltimore arriving july onest"
    <start_of_turn>model The true hypothesis is: list the flights from dallas to baltimore arriving july first<end_of_turn>


    <start_of_turn>user "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june tenth", "i would like to fly from san diego to houston on june ten", "i would like to fly from san diego to houston on june ten"
    <start_of_turn>model The true hypothesis is: i would like to fly from san diego to houston on june tenth<end_of_turn>

    Don't say you can help me with this or please provide the true hypothesis. Just say the true hypothesis. your output should be a single sentence and that sentence should be the prediction it is important that you do not write anything else besides the true hypothesis as it will all be added to a csv file and we need to make sure that the csv file is correct. DONT EVEN CONFIRM JUST PUT THE RESPONSE. If none of the hypotheses make sense just generate one that is logical. YOU MUST SAY "The true hypothesis is:" followed by the hypothesis.

 """

    return "<start_of_turn>user" + few_shot_prompt + line + "[/INST]<start_of_turn>model The true hypothesis is: "

Loading Tokenizer

In [93]:
# tokenizer initialization 
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)



In [None]:
for file in datasets:
    gemma_df = pd.DataFrame(columns=['input','output','prediction','match'])

    with open("../Test/test_test_"+file + ".json") as jsonFile:
        test_data = json.load(jsonFile)

        for question in test_data:
            hypotheses = question['input']
            test_txt = ""
            for line in hypotheses:
                test_txt += line + '\n'
            inputs = tokenizer(gemma_gensec(test_txt), return_tensors="pt").to(device)
            outputs = model.generate(**inputs, max_new_tokens=100)
            res = tokenizer.decode(outputs[0], skip_special_tokens=True)
            res = res.split('[/INST]model The true hypothesis is: ')[1].translate(str.maketrans('', '', punctuation_to_remove)).translate(str.maketrans(punctuation_to_replace, ' ')).strip().lower().split('\n')[0]
            
            truth = "Yes" if res == question['output'].strip() else "No"
            gemma_df = gemma_df._append({'input':test_txt,'prediction':res,'output':question['output'],'match':truth},ignore_index=True)

    # gemma_df.to_csv('../gemma_Gen_output/'+'{file}.csv', index=False)  
    gemma_df.to_csv(f'../gemma_Gen_output/{file}.csv', index=False)
    print('file: {file} done.') 



## History of all prompts 


In [None]:
# this is the history of almost all prompts and how we analyzed that errors and managed to find the best approach 
# above each prompt there is the edits done more than the previous to it
# below each prompt there is the calculated WER. 
# all of them is done using Mistral and on Chime4 dataset. 

"""
        Given 5 hypotheses, generate a single hypothesis by identifying and combining the  most repetitive sequences of sentences across all hypotheses.
        Focus on maximizing the repetition of phrases and words to create the most common hypothesis.
      if there is no commonality between the hypotheses, choose the hypothesis that is logical.
      """
# 0.1458921652688201



# chosing the sequence that makes more sense among them
"""
    Given 5 hypotheses, generate a single hypothesis by identifying and combining the  most repetitive sequences of sentences across all hypotheses considering the sequence making more sense with the rest of the sentence context.
    if there is no commonality between the hypotheses, choose the hypothesis that is logical.

"""
# 0.16049382716049382



# separate the characters of any abbreviation with a space.
"""
    Given 5 hypotheses, generate a single hypothesis by identifying and combining the  most repetitive sequences of sentences across all hypotheses considering the sequence making more sense with the rest of the sentence context.
    if there is no commonality between the hypotheses, choose the hypothesis that is logical.
    SEPARATE THE CHARACTERS OF ANY ABBREVIATION WITH A SPACE.
"""
# 0.10266056013206416




# a more weight for the sentences at the begenning of the hypotheses
# grammatical correction of the sentences ..... 
# Do not convert numbers written in characters into actual numbers.
"""
    Given 5 hypotheses, generate a single hypothesis by identifying and combining the  most repetitive sequences of sentences across all hypotheses considering the sequence making more sense with the rest of the sentence context.
    if there is no commonality between the hypotheses, choose the hypothesis that is logical.
    IF YOU NOTICED ANY ABBREVIATION, SEPARATE THE CHARACTERS OF THE ABBREVIATION WITH A SPACE. LIKE "U S" INSTEAD OF "US" OR "R L I COMPANY" INSTEAD OF "RLI COMPANY". DO NOT CONVERT NUMBERS WRITTEN IN CHARACTERS INTO ACTUAL NUMBERS. LIKE "FORTY TWO" INSTEAD OF "42" OR "five hundred and twenty five" INSTEAD OF "525". 
"""
# 0.09696953241433806
# 0.09633439338367253


# Mistral

In [None]:
# Given 5 hypotheses, generate a single hypothesis by identifying and combining the  most repetitive sequences of sentences across all hypotheses considering the sequence making more sense with the rest of the sentence context. 
#         Add a slight more weight for sentences at the beginning of the hypotheses. If there is no commonality between the hypotheses, choose the hypothesis that is logical.

In [4]:
def mistral_gensec(line):
    prompt = """
    For this task, only output the true hypothesis in the format <prediction>correct transcription</prediction> without anything extra. 
    
        Generate a single hypothesis from five given hypotheses by combining the most common sequences of tokenized words across all hypotheses, ensuring that the combined sequence fits logically within the sentence context. Give more weight to sequences from the beginning of the hypotheses. If there's no overlap between the hypotheses, choose the most logical one.

    Examples:
    Speech recognition: 
                this opposition did not pay in her
                the supposition did not pay in her
                this opposition did not pay her 
                this opposition did not pain her
                the supposition did not pain her
    <prediction>the opposition did not pay in her</prediction>


    Output must be within <prediction> tags without a space.

    Now, here's the input:
    """

    return prompt + "Speech recognition: " + line + "\n"


Loading Tokenizer

In [5]:
# this loading consumes a lot from the gpu ... drops from 23 thousand to 8 thousand. but much faster around 3/4 faster in time than the other loading method

# Clearing cache for new model to be loaded
gc.collect()
torch.cuda.empty_cache()

# Load the model with FP16 precision
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map={"": 0},
    use_cache=False, 
    torch_dtype=torch.float16  # Using FP16 precision to speed up generation
).to("cuda")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Adjust padding side to 'left'





Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# # Load the Mistral tokenizer
# tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# # Input sentence
# sentence = "I am good"

# # Tokenize the sentence
# tokens = tokenizer.tokenize(sentence)

# # Output the tokenized values
# print(tokens)

['▁I', '▁am', '▁good']


In [35]:
def get_gpu_memory():
    
    torch.cuda.empty_cache()
    
    command = "nvidia-smi --query-gpu=memory.free --format=csv"
    memory_free_info = sp.check_output(command.split()).decode('ascii').split('\n')[:-1][1:]
    memory_free_values = [int(x.split()[0]) for i, x in enumerate(memory_free_info)]
    return memory_free_values

get_gpu_memory()

[15074]

In [8]:
torch.cuda.empty_cache()

# Initialize the text-generation pipeline
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

batch_size = 32  # Adjust based on available GPU memory
max_length = 512  # max number of tokens in a single input sequence that the model can handle 


# Loop through datasets
for file in datasets:
    clear_output(wait=True)
    print(f'{file}')
    
    with open(f"./Test/test_test_{file}.json") as jsonFile:
        test_data = json.load(jsonFile)

    # Create a DataFrame to store the results
    mistral_df = pd.DataFrame(columns=['input', 'output', 'prediction', 'match'])
    
    # Collect inputs for batch processing
    batch_inputs = []
    batch_outputs = []
    batch_hypotheses = []
    i = 1

    for question in test_data:
        print(f'\r{i}', end='', flush=True)
        # clear_lines_above(3)
        i += 1

        hypotheses = question['input']
        test_txt = "\n".join(hypotheses)  # Simplified hypothesis aggregation

        # Store the current hypothesis and output for later matching
        batch_hypotheses.append(test_txt)
        batch_outputs.append(question['output'])

        # Prepare the prompt for this hypothesis
        batch_inputs.append(f"<s>[INST] {mistral_gensec(test_txt)} [/INST]")

        # If we've reached batch size, process the batch
        if len(batch_inputs) == batch_size:
            # Tokenize and generate for the whole batch
            tokenized_inputs = tokenizer(
                batch_inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
            ).to(model.device)
            
            generated_responses = model.generate(
                **tokenized_inputs,
                max_new_tokens=100,  # Control the number of new tokens generated
                pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
                max_length=max_length  # Set max_length to control total length
            )

            # Decode and process each generated response
            for idx, response in enumerate(generated_responses):
                res = tokenizer.decode(response, skip_special_tokens=True)
                matches = re.findall(r"<prediction>(.*?)</prediction>", res)
                prediction = matches[-1] if matches else "No prediction found"

                truth = "Yes" if prediction == batch_outputs[idx].strip() else "No"

                # Add the results to the DataFrame
                mistral_df = pd.concat([mistral_df, pd.DataFrame([{
                    'input': batch_hypotheses[idx],
                    'prediction': prediction,
                    'output': batch_outputs[idx],
                    'match': truth
                }])], ignore_index=True)

            # Clear the cache after processing the batch
            torch.cuda.empty_cache()

            # Clear the batch
            batch_inputs.clear()
            batch_outputs.clear()
            batch_hypotheses.clear()

    # Process any remaining items in the batch
    if batch_inputs:
        tokenized_inputs = tokenizer(
            batch_inputs, return_tensors="pt", padding=True, truncation=True, max_length=max_length
        ).to(model.device)
        
        generated_responses = model.generate(
            **tokenized_inputs,
            max_new_tokens=100,
            pad_token_id=tokenizer.pad_token_id,
            max_length=max_length
        )

        for idx, response in enumerate(generated_responses):
            res = tokenizer.decode(response, skip_special_tokens=True)
            matches = re.findall(r"<prediction>(.*?)</prediction>", res)
            prediction = matches[-1] if matches else "No prediction found"

            truth = "Yes" if prediction == batch_outputs[idx].strip() else "No"

            mistral_df = pd.concat([mistral_df, pd.DataFrame([{
                'input': batch_hypotheses[idx],
                'prediction': prediction,
                'output': batch_outputs[idx],
                'match': truth
            }])], ignore_index=True)

    # Save the results to a CSV file
    mistral_df.to_csv(f'./mistral_Gen_output/{file}.csv', index=False)
    # break
    print(f'file: {file} done.')

atis
32

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


64

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


96

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


KeyboardInterrupt: 

# Calculate WER 

### loading data function

In [19]:
def load_data(name):
    df_dict = dict()
    for file in os.listdir(name):
        dir = os.path.join(name, file)
        df = pd.read_csv(dir)
        df_dict[file] = df
    return df_dict

# def load_data_json(name):
#     df_dict = dict()
#     for file in os.listdir(name):
#         dir = os.path.join(name, file)
#         df = pd.read_json(dir)
#         df_dict[file] = df
#     return df_dict

# def load_data_json(name):
#     df_dict = dict()
#     for file in os.listdir(name):
#         dir = os.path.join(name, file)
#         if file.endswith('.json'):  # Check if the file is a JSON file
#             df = pd.read_json(dir)
#             df_dict[file] = df
#     return df_dict

In [20]:
# llama_dfs = load_data('../llama_Gen_output')
# test_dfs = load_data_json('../../Test_abb')
# gemma_dfs = load_data('../gemma_Gen_output')
mistral_dfs = load_data('./mistral_Gen_output')
mistral_means = {}
gemma_means = {}
llama_means = {}
# test_means = {}

# for key, gemma_df in gemma_dfs.items():
#     gemma_df["wer"] = gemma_df.apply(calculate_wer, axis=1)
#     gemma_WER = round(gemma_df["wer"].mean(),1)
#     gemma_means[key] = gemma_WER

for key, mistral_df in mistral_dfs.items():  
    mistral_df["wer"] = mistral_df.apply(calculate_wer, axis=1)
    mistral_WER = round(mistral_df["wer"].mean(),1)
    mistral_means[key] = mistral_WER

# for key, llama_df in llama_dfs.items():  
#     llama_df["wer"] = llama_df.apply(calculate_wer, axis=1)
#     llama_WER = round(llama_df["wer"].mean(),1)
#     llama_means[key] = llama_WER

# for key, test_df in test_dfs.items():  
#     test_df["wer"] = test_df.apply(calculate_wer_test, axis=1)
#     test_WER = round(test_df["wer"].mean(),1)
#     test_means[key] = test_WER

# print(llama_dfs['td3.csv'])
# [1155 rows x 5 columns]
# print(test_dfs)
# 2620 rows x 3 columns]


## Final WERs for each dataset

In [21]:
# print("Test Output: \n", test_means)
print("Mistral Output: \n", mistral_means)
# print("Gemma Output: \n", gemma_means)
# print("LLaMa Output: \n", llama_means)

# a better visualization for the generated output would be after the ensemble approach


Mistral Output: 
 {'atis.csv': 6.1}


## Old Ensemble Approach

In [122]:
final_dfs = dict()

for key, mistral_df in mistral_dfs.items(): 
    final_df = pd.DataFrame(columns=['input','output','prediction','match'])
    
    for index, row in mistral_df.iterrows():
        if pd.isna(llama_dfs[key].iloc[index]['prediction']):
            continue
        if row['prediction'] == gemma_dfs[key].iloc[index]['prediction'] or row['prediction'] == llama_dfs[key].iloc[index]['prediction']:
            final_df = final_df._append({'input':row['input'],'output':row['output'],'prediction':row['prediction'],'match':row['output']},ignore_index=True)
        elif gemma_dfs[key].iloc[index]['prediction'] == llama_dfs[key].iloc[index]['prediction']:
            final_df = final_df._append({'input':gemma_dfs[key].iloc[index]['input'],'output':gemma_dfs[key].iloc[index]['output'],'prediction':gemma_dfs[key].iloc[index]['prediction'],'match':gemma_dfs[key].iloc[index]['match']},ignore_index=True)
        else:
            final_df = final_df._append({'input':row['input'],'output':row['output'],'prediction':row['prediction'],'match':row['match']},ignore_index=True)
    final_dfs[key] = final_df

In [125]:
ensemble_means = {}
for key, final_df in final_dfs.items():  
    final_df["wer"] = final_df.apply(calculate_wer, axis=1)
    final_WER = round(final_df["wer"].mean(),1)
    ensemble_means[key] = final_WER
print("Ensemble Output: \n",ensemble_means)

Ensemble Output: 
 {'td3.csv': 11.3, 'atis.csv': 5.4, 'swbd.csv': 24.7, 'coraal.csv': 29.0, 'cv.csv': 15.6, 'wsj_score.csv': 4.7, 'ls_clean.csv': 3.4, 'lrs2.csv': 14.9, 'chime4.csv': 9.1, 'ls_other.csv': 6.1}


# New Ensemble Approach

In [None]:
def llm_judge_prompt(input_hypotheses, llama_out, llama_closest, gemma_out, gemma_closest, mistral_out, mistral_closest):
    prompt = f"""<s>[INST]As a LLM judge, your task is to choose one transcription from the six options you receive to be the true ASR transcription. You will receive:
- 5 ASR hypotheses,
- 3 LLM outputs, and
- 3 closest mapped hypotheses (1 for each model output).

Please follow these rules:

1. Select the ASR hypothesis that best matches the 3 LLM outputs and the 3 closest mapped hypotheses. Prioritize the hypothesis that is grammatically correct and contextually accurate.
2. **Hallucination Handling**: If you notice that an LLM output significantly deviates in structure or meaning (hallucination) from all ASR hypotheses, choose the closest hypothesis instead. You can identify hallucination by comparing the length and semantics of the LLM outputs to the average length and meaning of the ASR hypotheses.
3. If the LLM outputs differ slightly from all ASR hypotheses but are not hallucinations, select the hypothesis that is semantically closest to the majority of the LLM outputs.
4. If two or more hypotheses seem equally logical, prefer the one that aligns most closely with the closest mapped hypotheses.
5. Do not change any of the inputs or generate a new transcription.
6. If no logical transcription can be determined, choose the most frequent ASR hypothesis or the one with the highest similarity to the closest mapped hypotheses.
7. Do not output any explanation or extra information.
8. Output your final selection in the format: <prediction>the correct transcription</prediction>.

Examples:

Example 1:
Speech recognition hypotheses:
"list the flights from dallas to baltimore arriving july onest",
"list the flights from dallas to baltimore arriving july onest",
"list the flights from dallas to baltimore arriving july one",
"list the flights from dallas to baltimore arriving july one",
"list the flights from dallas to baltimore arriving july onest",


<model 1 output>list the flights from dallas to baltimore arriving july onest
<model 1 closest hypothesis>list the flights from dallas to baltimore arriving july onest
<model 2 output>list the flights from dallas to baltimore arriving july first
<model 2 closest hypothesis>list the flights from dallas to baltimore arriving july onest
<model 3 output>list the flights from dallas to baltimore arriving july first
<model 3 closest hypothesis>list the flights from dallas to baltimore arriving july first
<prediction>list the flights from dallas to baltimore arriving july first</prediction>


Example 2:
Speech recognition hypotheses:
"it was formed by floyd soil liu",
"it was formed by floyd soil lu",
"it was formed by floyd soilu",
"it was formed by floyd soil liu",
"it was formed by floyd soylu",

<model 1 output>it was formed by floyd soil liu
<model 1 closest hypothesis>it was formed by floyd soil liu
<model 2 output>it was formed by floyd soil liu
<model 2 closest hypothesis>it was formed by floyd soil liu
<model 3 output>it was formed by floyd soil lu
<model 3 closest hypothesis>it was formed by floyd soil lu
<prediction> it was formed by floyd soileau </prediction>

Now, here are the ASR hypotheses:
{input_hypotheses}

<model 1 output>{llama_out}
<model 1 closest hypothesis>{llama_closest}
<model 2 output>{gemma_out}
<model 2 closest hypothesis>{gemma_closest}
<model 3 output>{mistral_out}
<model 3 closest hypothesis>{mistral_closest}
[/INST]<prediction>"""
    return prompt


## Loading Mistral -7B V 0.2

In [None]:
# from huggingface_hub import login
# login()

In [None]:
# loading Model from Hugging Face hub
base_model = "mistralai/Mistral-7B-Instruct-v0.2"

#Quanitization Configuration 
compute_dtype = getattr(torch, "float16")

# In our case, we create 4-bit quantization with NF4 type configuration using BitsAndBytes.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Clearing cache for new model to be loaded
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Clearing cache for new model to be loaded
gc.collect()
torch.cuda.empty_cache()


# Load the model with FP16 precision
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map={"": 0},
    use_cache=False, 
    torch_dtype=torch.float16  # Using FP16 precision to speed up generation
).to("cuda")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # Adjust padding side to 'left'


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def load_data_csv(name):
    df_dict = dict()
    for file in os.listdir(name):
        dir = os.path.join(name, file)
        print(dir)
        df = pd.read_csv(dir)
        df_dict[file] = df
    return df_dict

def load_data_json(name):
    df_dict = dict()
    for file in os.listdir(name):
        dir = os.path.join(name, file)
        if file.endswith('.json'):  # Check if the file is a JSON file
            df = pd.read_json(dir)
            df_dict[file] = df
    return df_dict

In [None]:
def calculate_levenshtein_distance_with_prediction(inputs, prediction):
    """
    Calculate the Levenshtein distance between each input and the prediction.
    Return the index of the input that best matches the prediction (smallest Levenshtein distance).
    """
    if not inputs or prediction is None:  # Check for None or empty inputs
        return None, None

    # Convert prediction to string and clean it
    prediction = str(prediction).strip()

    # Remove duplicates from inputs and ensure they're all strings
    unique_inputs = list(dict.fromkeys([input_str.strip() for input_str in inputs if input_str]))  # Clean and remove empty entries

    if not unique_inputs:
        return None, None

    # Calculate Levenshtein distances for each cleaned, unique input
    distances = [distance(str(input_str), prediction) for input_str in unique_inputs]
    
    if not distances:
        return None, None
    
    # Find the index of the input with the smallest distance
    min_distance_index = min(range(len(distances)), key=lambda i: distances[i])
    
    # Print debugging information for each distance calculation
    print(f"Distances for prediction '{prediction}': {[(input_str, dist) for input_str, dist in zip(unique_inputs, distances)]}")
    
    return min_distance_index, distances[min_distance_index]



def calculate_average_levenshtein_distance(inputs):
    """
    Calculate the average Levenshtein distance for a list of inputs.
    """
    num_inputs = len(inputs)
    avg_distances = []

    for i in range(num_inputs):
        total_distance = 0
        for j in range(num_inputs):
            if i != j:
                total_distance += distance(inputs[i], inputs[j])
        avg_distance = total_distance / (num_inputs - 1)  # Average with respect to other inputs
        avg_distances.append(avg_distance)

    return avg_distances

In [None]:
def ensemble_parameters(llama_df, gemma_df, mistral_df, idx):
    # Extract the predictions and closest hypotheses from each LLM dataframe for the current index
    return (
        llama_df['prediction'].iloc[idx], llama_df['closest_map'].iloc[idx], 
        gemma_df['prediction'].iloc[idx], gemma_df['closest_map'].iloc[idx], 
        mistral_df['prediction'].iloc[idx], mistral_df['closest_map'].iloc[idx]
    )


In [None]:
for file in datasets:
    llama_df = pd.read_csv(f'llama_Gen_output/{file}.csv')
    gemma_df = pd.read_csv(f'gemma_Gen_output/{file}.csv')
    mistral_df = pd.read_csv(f'mistral_Gen_output/{file}.csv')
    ensemble_df = pd.DataFrame(columns=['input', 'output', 'prediction', 'match'])

    for idx, row in llama_df.iterrows():
        inputs = row['input'].split('\n')  # Assuming 'input' is a list of input strings
        output = row['output']  # Assuming 'output' is a single string
        prediction = row['prediction']
        if not inputs or prediction is None:
            continue     
        # best matches a prediction

        min_distance_index, min_distance = calculate_levenshtein_distance_with_prediction(inputs, prediction)
        # Select the best input based on the lowest average distance
        selected_input = inputs[min_distance_index]
        #print(f"Selected input for closest_map at index {idx}: {selected_input}")
    
        llama_df.at[idx, 'closest_map'] = selected_input

        # After assignment, check the stored value in the DataFrame
        #print(f"Stored value in llama_df['closest_map'][{idx}]: {llama_df.at[idx, 'closest_map']}")

    for idx, row in gemma_df.iterrows():
        inputs = row['input'].split('\n')  # Assuming 'input' is a list of input strings
        output = row['output']  # Assuming 'output' is a single string
        prediction = row['prediction']
        if not inputs or prediction is None:
            continue     
        # best matches a prediction
        min_distance_index, min_distance = calculate_levenshtein_distance_with_prediction(inputs, prediction)
        # Select the best input based on the lowest average distance
        selected_input = inputs[min_distance_index]
        # row['selected'] = inputs[min_distance_index]
        gemma_df.at[idx, 'closest_map'] = selected_input

    for idx, row in mistral_df.iterrows():
        inputs = row['input'].split('\n') # Assuming 'input' is a list of input strings
        output = row['output']  # Assuming 'output' is a single string
        prediction = row['prediction']
        if not inputs or prediction is None:
            continue     
        # best matches a prediction
        min_distance_index, min_distance = calculate_levenshtein_distance_with_prediction(inputs, prediction)
        # Select the best input based on the lowest average distance
        selected_input = inputs[min_distance_index]
        # row['selected'] = inputs[min_distance_index]
        mistral_df.at[idx, 'closest_map'] = selected_input
    
    
    for idx, row in llama_df.iterrows():
        input_hypotheses =row['input'].split('\n')
        
        # Get ensemble parameters for the current row
        llama_out, llama_closest, gemma_out, gemma_closest, mistral_out, mistral_closest = ensemble_parameters(llama_df, gemma_df, mistral_df, idx)

        # Generate the prompt for the LLM judge
        prompt = llm_judge_prompt(
            input_hypotheses, 
            llama_out, llama_closest, 
            gemma_out, gemma_closest, 
            mistral_out, mistral_closest
        )
        
        # Tokenize and pass to the model (example: assuming a model is loaded)
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        outputs = model.generate(**inputs, max_new_tokens=100,  pad_token_id=tokenizer.eos_token_id)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Extract the final prediction from the result
        try:
            prediction = result.split('[/INST]<prediction>')[1].split('</prediction>')[0].strip()
        except IndexError:
            prediction = "No valid hypothesis found"
        # print(result)
        # print("=====================")
        # Compare with ground truth (assuming you have it in row['output'])
        truth = "Yes" if prediction == row['output'].strip().lower() else "No"
        
        # Append to the ensemble dataframe
        ensemble_df = pd.concat([ensemble_df, pd.DataFrame([{
        'input': input_hypotheses, 
        'output': row['output'], 
        'prediction': prediction, 
        'match': truth
        }])], ignore_index=True)



    # Save the ensemble results
    ensemble_df.to_csv(f'ensemble_Gen_output/{file}.csv', index=False)
    print(f'file: {file} done.')


file: wsj_score done.


: 