In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [24]:
df = pd.read_csv('SPARQL_questions.csv')

In [25]:
df = df.drop_duplicates()
df = df[df['Task Type'].isin(['Membership', 'Property Assertion'])]
df.loc[:, 'Task ID temp'] = df['Task ID'].str.replace(r'-(BIN|MC)$', '', regex=True)

In [26]:
len(df)

28961

**Step 1: Create bins for each variable**

In [27]:
bin_edges = np.histogram_bin_edges(df['Size of ontology ABox'], bins='auto')
df['Bin_Size of ontology ABox'] = pd.cut(df['Size of ontology ABox'], bins=bin_edges, labels=False)

bin_edges = np.histogram_bin_edges(df['Avg Min Explanation Size'], bins='auto')
df['Bin_Avg Min Explanation Size'] = pd.cut(df['Avg Min Explanation Size'], bins=bin_edges, labels=False)

**Step 2:  Combine bins into a single stratification key**

In [28]:
df['strata'] = df['Bin_Size of ontology ABox'].astype(str) + '_' + df['Bin_Avg Min Explanation Size'].astype(str) 

**Step 3: Do stratified sampling**

In [29]:
# Step 1: Get unique groups 
df_groups = df.groupby('Task ID temp').first().reset_index()

In [30]:
# Step 2: Filter groups with at least 2 samples in the same strata
strata_counts = df_groups['strata'].value_counts()
valid_strata = strata_counts[strata_counts >= 2].index
df_filtered = df_groups[df_groups['strata'].isin(valid_strata)]

In [31]:
# Step 3: Stratified split at group level
train_groups, test_groups = train_test_split(
    df_filtered['Task ID temp'],
    test_size=0.95,
    stratify=df_filtered['strata'],
    random_state=42
)

In [32]:
# Step 4: Assign split back to original df rows based on group membership
df['split'] = 'test'  
df.loc[df['Task ID temp'].isin(train_groups), 'split'] = 'train'

In [33]:
len(df[df['split'] == 'train'])

1473

In [34]:
df[df['split'] == 'train'].to_csv("SPARQL_questions_sampling.csv", index=False)