In [13]:
import pandas as pd
import os

# Directory containing the parquet files
parquet_dir = "crag_dataset/parquet/"

# Upper bound on rows drawn from each domain, and the seed that makes the draw repeatable
SAMPLES_PER_DOMAIN = 50
RANDOM_SEED = 42

# List to hold all DataFrames
dfs = []

# Load every Parquet file in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted: the row order of the combined frame
# feeds the seeded sampler, and an unstable order would change the sample.
for file_name in sorted(os.listdir(parquet_dir)):
    if file_name.endswith(".parquet"):
        file_path = os.path.join(parquet_dir, file_name)
        df = pd.read_parquet(file_path)
        dfs.append(df)

# pd.concat([]) fails with a generic "No objects to concatenate"; report the real cause instead
if not dfs:
    raise FileNotFoundError(f"No .parquet files found in {parquet_dir!r}")

# Concatenate all DataFrames
full_df = pd.concat(dfs, ignore_index=True)

# Draw up to SAMPLES_PER_DOMAIN random rows per domain (every row when a domain has fewer).
# The groups are iterated explicitly rather than passed through groupby(...).apply(...):
# each group frame keeps its "domain" column, and newer pandas deprecates apply()
# operating on the grouping column.
sampled_df = pd.concat(
    [
        group.sample(min(len(group), SAMPLES_PER_DOMAIN), random_state=RANDOM_SEED)
        for _, group in full_df.groupby("domain")
    ]
)

# Reset index for clean output (optional)
sampled_df = sampled_df.reset_index(drop=True)

# Preview result
print(sampled_df["domain"].value_counts())
print(sampled_df.head())


finance    50
movie      50
music      50
open       50
sports     50
Name: domain, dtype: int64
                         interaction_id           query_time   domain  \
0  6707548c-f697-48f9-8e8b-c1bc642ea02f  2024-03-13 08:46:46  finance   
1  3a4206fb-5cd5-4db1-a330-9b4e5f3b073c  2024-03-13 09:04:37  finance   
2  040b8c87-7da9-4a93-92d4-54702d3fe327  2024-03-05 23:13:16  finance   
3  0f648bed-e047-4420-bfe4-2946a86e9ca4  2024-02-28 08:26:59  finance   
4  860f1d51-66b5-411a-93ec-370f361fe406  2024-03-17 16:48:50  finance   

        question_type static_or_dynamic  \
0  simple_w_condition            static   
1           multi-hop     fast-changing   
2         aggregation     slow-changing   
3              simple         real-time   
4         aggregation            static   

                                               query  \
0  what was the total value of all exchange-trade...   
1  which company in the s&p 500 index has the hig...   
2  what is the market share of micros

In [14]:
# Remove the columns that are not wanted in the exported sample.
# drop(..., errors="ignore") returns a new frame and skips names that are already
# gone, so re-running this cell is safe (repeating `del sampled_df[...]` raises KeyError).
sampled_df = sampled_df.drop(columns=["query_time", "split", "alt_ans"], errors="ignore")

In [16]:
# Write the sampled frame to a Parquet file in the working directory;
# index=False keeps the DataFrame index out of the file.
output_path = "sampled_50_per_domain.parquet"
sampled_df.to_parquet(output_path, index=False)