In [17]:
# Read csv shards
import os
import pandas as pd

# specify the directory you want to use
path = './chagpt/unfiltered'

# Collect the shard names in sorted order: os.listdir() returns entries in
# arbitrary order, and the de-duplication step keeps the *first* occurrence of
# a question, so the load order must be stable for reproducible results.
csv_files = sorted(
    filename for filename in os.listdir(path) if filename.endswith('.csv'))
if not csv_files:
    # pd.concat([]) would otherwise fail with "No objects to concatenate".
    raise FileNotFoundError(f"No .csv shards found in {path!r}")

# read every shard into its own DataFrame
data_frames = [
    pd.read_csv(os.path.join(path, filename)) for filename in csv_files]

# concatenate all the dataframes in the list
all_data = pd.concat(data_frames, ignore_index=True)
# NOTE: DataFrame.size is the number of cells (rows * columns), not rows.
print(all_data.size)
all_data.head()

18708


Unnamed: 0,question,expected_fields
0,What are the recent updates regarding the fina...,['news_data']
1,What are the five most recent updates on Tesla...,['news_data']
2,What are the most recent updates regarding App...,['news_data']
3,What's the latest update on Apple's financial ...,['news_data']
4,What is the latest news update regarding the r...,['news_data']


In [18]:
# filter out duplicates in the QA sets
import re
import pandas as pd
from typing import List, Callable
from itertools import combinations


def _rouge_l_axis_0(
        df: pd.DataFrame, columns: List[str],
        threshold: float) -> pd.DataFrame:
    """
    Drop rows whose text is too similar to the text of an earlier row.

    Each column in `columns` is scanned top to bottom. A value is kept only
    when its Rouge-L F-measure against every previously kept value of that
    column is at most `threshold`; otherwise the whole row is discarded.

    Args:
        df: DataFrame containing the rows to filter. Any index is accepted
            (non-default, non-unique, already filtered, ...).
        columns: Names of the text columns to compare row against row.
        threshold: Highest Rouge-L F-measure two kept values may share.
            When it is below 1, exact duplicates within a column are removed
            as well (first occurrence kept).

    Returns:
        A new DataFrame holding the surviving rows in their original order
        and with their original index labels.
    """
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

    # Rows to discard are tracked by *position*. DataFrame.drop() works on
    # index labels, so positions must never be handed to it directly unless
    # the index is guaranteed to be 0..n-1 (see `positional` below).
    drop_positions = set()

    for column in columns:
        kept_values = []
        for position, value in enumerate(df[column].tolist()):
            too_similar = any(
                scorer.score(kept, value)['rougeL'].fmeasure > threshold
                for kept in kept_values)
            if too_similar:
                drop_positions.add(position)
            else:
                kept_values.append(value)

    # Positional view of the compared columns: its labels ARE row positions,
    # which makes the label-based drop calls below safe.
    positional = df[list(columns)].reset_index(drop=True)

    if threshold < 1:  # Edge case: threshold == 1 allows identical questions
        for column in columns:
            positional = positional.drop_duplicates(
                subset=[column], keep='first')

    # errors='ignore': a position may already be gone after drop_duplicates.
    positional = positional.drop(sorted(drop_positions), errors='ignore')
    return df.iloc[positional.index.tolist()]


def _rouge_l_axis_1(
        df: pd.DataFrame, columns: List[str],
        threshold: float) -> pd.DataFrame:
    """
    Keep only the rows whose listed columns are dissimilar to one another.

    For every row, each pair of columns from `columns` is scored with
    Rouge-L; the row is discarded as soon as one pair has an F-measure
    strictly above `threshold`.

    Args:
        df: DataFrame containing the rows to filter.
        columns: Names of at least two columns to compare within each row.
        threshold: Highest Rouge-L F-measure a pair of columns may share.

    Returns:
        The rows of `df` for which no column pair exceeded `threshold`.

    Raises:
        AssertionError: If fewer than two columns are given.
    """
    from rouge_score import rouge_scorer

    assert len(columns) >= 2, "Must provide at least two columns"

    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
    # Every unordered pair of columns, computed once up front.
    column_pairs = list(combinations(columns, 2))

    def _row_passes(row) -> bool:
        # any() short-circuits on the first pair that is too similar.
        return not any(
            rouge.score(row[left], row[right])['rougeL'].fmeasure > threshold
            for left, right in column_pairs)

    keep_rows = df.apply(_row_passes, axis=1)
    return df[keep_rows]


def rouge_l(
        df: pd.DataFrame, columns: List[str],
        axis: int = 0, threshold: float = 0.7) -> pd.DataFrame:
    """
    Remove rows whose text overlaps too much, word for word, with other text.

    Args:
        df: DataFrame containing the questions to filter.
        columns: Names of the columns whose text is compared.
        axis: Direction of the comparison. 0 compares each row against the
            other rows of the same column (vertical); 1 compares the given
            columns against each other within a row (horizontal).
        threshold: Rouge-L threshold to use for filtering. 0.7 is the
            threshold used by the self-instruct paper.
            *Note: threshold=1 means questions can be identical.

    Returns:
        The filtered DataFrame produced by the selected axis filter.

    Raises:
        ValueError: If `axis` is neither 0 nor 1.
    """
    # Guard clause: reject unsupported axes before dispatching.
    if axis not in (0, 1):
        raise ValueError("Axis must be 0 or 1")

    selected_filter = _rouge_l_axis_0 if axis == 0 else _rouge_l_axis_1
    return selected_filter(df, columns, threshold)

# Pass 1 (axis 0): compare every question against the questions kept so far,
# using the default threshold of rouge_l.
filtered_0 = rouge_l(df=all_data, columns=['question'])
print("axis 0 done")

# Pass 2 (axis 1): within each remaining row, compare the question text
# against the expected_fields text.
filtered_1 = rouge_l(
    df=filtered_0,
    columns=['question', 'expected_fields'],
    axis=1,
    threshold=0.3,
)
print(filtered_1.size)
filtered_1.head()

In [None]:
from datasets import load_dataset, Dataset

# Fixed seed so the shuffle and the train/test split are identical on every
# "Restart & Run All"; unseeded, each run would write different CSV files.
SPLIT_SEED = 42

dataset = Dataset.from_pandas(filtered_1)
dataset = dataset.shuffle(seed=SPLIT_SEED)
# TODO: change to a separate test data set
dataset = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

In [None]:
# Common prefix of the two output files.
# NOTE(review): all_data.size is the cell count (rows * columns) of the
# *unfiltered* frame, so the "n" in the name is not the number of rows
# written below -- confirm this is intended.
dataset_path = f"./chagpt/ConFIRM_QAset_{all_data.size}n_gpt"
train_csv = dataset_path + "_train.csv"
test_csv = dataset_path + "_test.csv"

# Write each split to its own CSV file.
dataset["train"].to_csv(train_csv)
dataset["test"].to_csv(test_csv)

Creating CSV from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 172.70ba/s]
Creating CSV from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 226.58ba/s]


99841