In [1]:
import os
import re

import pandas as pd
from tqdm import tqdm

In [2]:
def check_file_type(path):
    """Load the file at ``path`` into a DataFrame based on its extension.

    Args:
        path: File path as a string. The suffix is compared case-sensitively.

    Returns:
        The DataFrame read with ``pd.read_parquet`` for ``.parquet`` files or
        ``pd.read_csv`` for ``.csv`` files; ``None`` for any other suffix.
    """
    # Suffix -> reader table, checked in order.
    readers = (
        (".parquet", pd.read_parquet),
        (".csv", pd.read_csv),
    )
    for suffix, reader in readers:
        if path.endswith(suffix):
            return reader(path)
    # Unsupported extension: nothing is loaded.
    return None

In [3]:
def extract_string(
    df,
    left: str,
    right: str,
    column: str = "generated",
    out_column: str = "generated_val",
):
    """Extract the text between two literal delimiters from a text column.

    For every value in ``df[column]`` the first (shortest) substring found
    between ``left`` and ``right`` is written to ``out_column`` and the
    source column is dropped. The input DataFrame is not modified.

    Args:
        df: DataFrame containing ``column``.
        left: Literal opening delimiter (regex metacharacters are escaped).
        right: Literal closing delimiter (regex metacharacters are escaped).
        column: Name of the column to read from.
        out_column: Name of the column that receives the extracted text.

    Returns:
        A new DataFrame with ``out_column`` appended and ``column`` removed
        (``column`` is kept only if it is also the output column).
        Non-string cells and cells without a match yield a missing value.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    # Escape the delimiters so they are matched literally, and compile once
    # instead of rebuilding the pattern for every row.
    pattern = re.compile(rf"{re.escape(left)}(.*?){re.escape(right)}", re.DOTALL)

    def extract_match(text):
        if not isinstance(text, str):  # e.g. NaN from empty CSV cells
            return None
        # Line breaks are stripped before matching, so the extracted value is
        # always a single line.
        flattened = text.replace("\n", "").replace("\r", "")
        match = pattern.search(flattened)
        return match.group(1) if match else None

    extracted = df[column].apply(extract_match)

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    # (the previous in-place column assignment leaked a column into the input).
    result = df.assign(**{out_column: extracted})
    if out_column != column:
        result = result.drop(columns=[column])
    return result

In [4]:
paths = [
    # Each entry is (CSV file name, is_llama flag).
    # The file name is read from the "generated/" directory and reused as the
    # output file name under "output/". The flag selects which delimiter pair
    # is passed to extract_string: llama_template when True, phi_template
    # when False.
    ("PRGen-llama-3-8b-Instruct-bnb-4bit-4bit-LoRA_out.csv", True),
    ("Phi-3-mini-4k-instruct_val.csv", False),
    ("llama-3-8b-Instruct-bnb-4bit_val.csv", True),
    ("loraphi_val.csv", False),
]

In [5]:
# (left, right) delimiter pairs that are unpacked into
# extract_string(df, left, right); the text found between the two markers is
# what gets extracted. llama_template is used for entries flagged as llama,
# phi_template for all others.
llama_template = ("<|start_header_id|>assistant<|end_header_id|>", "<|eot_id|>")
phi_template = ("<|end|><|assistant|>", "<|end|>")

In [6]:
# Read every generated CSV, keep only the text between the model-specific
# delimiters, and write the cleaned frame under the same file name in the
# output directory. Cleaned frames are also kept in `results`, keyed by name.
directory = "generated/"
output_dir = "output"
results = {}

# Create the output directory once, up front; exist_ok avoids the
# check-then-create race and the repeated existence check per file.
os.makedirs(output_dir, exist_ok=True)

for path, is_llama in tqdm(paths):
    df = pd.read_csv(os.path.join(directory, path))
    # Pick the delimiter pair that matches the model that produced the file.
    left, right = llama_template if is_llama else phi_template
    df = extract_string(df, left, right)
    results[path] = df
    df.to_csv(os.path.join(output_dir, path), index=False)

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:01<00:00,  2.06it/s]
