In [None]:
import pandas as pd

# Load the fold table; the file is read as tab-separated despite its .csv name.
folds = pd.read_csv(
    'folds.csv',
    sep='\t',
)

In [None]:
# Keep only the rows whose `fold` column holds the value "examples".
examples = folds.loc[folds['fold'] == "examples"]

In [None]:
# List the column names available in the "examples" subset.
examples.columns

# Select examples for binary task

In [None]:
import pandas as pd
import numpy as np

# The `examples` subset of `folds` supplies the sentences and their labels.

# Extract sentences and labels as NumPy arrays
sentences = examples['sentence'].values
labels = examples['relevance_manual'].values

# Convert labels to integer codes, numbered in order of first appearance.
# A plain dict lookup cannot encode missing labels: NaN never compares equal
# to itself, so looking up a NaN taken from the array raises KeyError.
# pd.factorize encodes by value instead and marks missing entries with -1.
numeric_labels, unique_labels = pd.factorize(labels)
unique_labels = np.asarray(unique_labels)
is_missing = numeric_labels == -1
if is_missing.any():
    # Give missing labels their own code (one past the last real label) so
    # that unique_labels[code] maps every code back to its label, NaN included.
    numeric_labels = np.where(is_missing, len(unique_labels), numeric_labels)
    unique_labels = np.append(unique_labels.astype(object), np.nan)

# Label -> code lookup, kept for inspection; the encoding above does not use it.
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Function for random selection
def random_selection(X, y, k, rng=None):
    """Randomly pick up to ``k`` sample positions for every distinct label in ``y``.

    Parameters
    ----------
    X : any
        Unused; accepted only so that calls of the form
        ``random_selection(X, y, k)`` keep working.
    y : array-like
        One label per sample.
    k : int
        Maximum number of samples drawn per label. A label that occurs fewer
        than ``k`` times contributes all of its samples.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed(...)`` controls the draws.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``y``, grouped by label in ``np.unique`` order.
        An empty *integer* array when ``y`` is empty, so the result can always
        be used directly as an index.
    """
    y = np.asarray(y)
    sampler = np.random if rng is None else rng

    picked = []
    for label in np.unique(y):
        # Every label comes from np.unique(y), so it has at least one position.
        class_indices = np.where(y == label)[0]
        n_take = min(k, len(class_indices))
        picked.append(sampler.choice(class_indices, size=n_take, replace=False))

    if not picked:
        # A bare np.array([]) would be float64 and cannot index another array.
        return np.array([], dtype=np.intp)
    return np.concatenate(picked)

# Seed NumPy's global random state so that Restart & Run All reproduces the
# same draws; the sampling below draws from that global state.
SEED = 42
np.random.seed(SEED)

# Draw 5 independent random samples ("folds") for each k = 1..5, where k is
# the maximum number of examples taken per label.
sampled_frames = []
for k in range(1, 6):  # For each k
    for fold in range(1, 6):  # Create 5 different folds
        # Force an integer index array so an empty selection still indexes cleanly.
        selected_samples_idx = np.asarray(
            random_selection(sentences, numeric_labels, k), dtype=int
        )
        selected_labels = numeric_labels[selected_samples_idx]

        sampled_frames.append(pd.DataFrame({
            "example": [sentences[i] for i in selected_samples_idx],
            # Map the integer codes back to the original label values
            "label": [unique_labels[label] for label in selected_labels],
            "k": k,
            "fold": fold,
        }))

# Concatenate once instead of growing the frame inside the loop: this avoids
# repeated copying and keeps the integer dtype of `k` and `fold`.
random_bin_examples = pd.concat(sampled_frames, ignore_index=True)

# Display the final DataFrame
random_bin_examples


In [None]:
# Restrict the examples to the rows where `relevance_manual` equals 1.
examples_class = examples.loc[examples['relevance_manual'] == 1]

In [None]:
# Preview the first rows of the relevance_manual == 1 subset; a bare
# expression uses the notebook's table display instead of a plain-text dump.
examples_class.head()

# Select examples for regression task

In [None]:
import pandas as pd
import numpy as np

# The `examples_class` subset (relevance_manual == 1) supplies the sentences
# and their labels.

# Extract sentences and labels as NumPy arrays
sentences = examples_class['sentence'].values
labels = examples_class['manual_sentence_labels'].values

# Convert labels to integer codes, numbered in order of first appearance.
# A plain dict lookup cannot encode missing labels: NaN never compares equal
# to itself, so looking up a NaN taken from the array raises KeyError.
# pd.factorize encodes by value instead and marks missing entries with -1.
numeric_labels, unique_labels = pd.factorize(labels)
unique_labels = np.asarray(unique_labels)
is_missing = numeric_labels == -1
if is_missing.any():
    # Give missing labels their own code (one past the last real label) so
    # that unique_labels[code] maps every code back to its label, NaN included.
    numeric_labels = np.where(is_missing, len(unique_labels), numeric_labels)
    unique_labels = np.append(unique_labels.astype(object), np.nan)

# Label -> code lookup, kept for inspection; the encoding above does not use it.
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Function for random selection
# NOTE: a helper with this same name is also defined in an earlier cell of
# this notebook; running this cell replaces that definition.
def random_selection(X, y, k, rng=None):
    """Randomly pick up to ``k`` sample positions for every distinct label in ``y``.

    Parameters
    ----------
    X : any
        Unused; accepted only so that calls of the form
        ``random_selection(X, y, k)`` keep working.
    y : array-like
        One label per sample.
    k : int
        Maximum number of samples drawn per label. A label that occurs fewer
        than ``k`` times contributes all of its samples.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used, so ``np.random.seed(...)`` controls the draws.

    Returns
    -------
    numpy.ndarray
        Integer positions into ``y``, grouped by label in ``np.unique`` order.
        An empty *integer* array when ``y`` is empty, so the result can always
        be used directly as an index.
    """
    y = np.asarray(y)
    sampler = np.random if rng is None else rng

    picked = []
    for label in np.unique(y):
        # Every label comes from np.unique(y), so it has at least one position.
        class_indices = np.where(y == label)[0]
        n_take = min(k, len(class_indices))
        picked.append(sampler.choice(class_indices, size=n_take, replace=False))

    if not picked:
        # A bare np.array([]) would be float64 and cannot index another array.
        return np.array([], dtype=np.intp)
    return np.concatenate(picked)

# Seed NumPy's global random state so that Restart & Run All reproduces the
# same draws; the sampling below draws from that global state.
SEED = 42
np.random.seed(SEED)

# Draw 5 independent random samples ("folds") for each k = 1..5, where k is
# the maximum number of examples taken per label.
sampled_frames = []
for k in range(1, 6):  # For each k
    for fold in range(1, 6):  # Create 5 different folds
        # Force an integer index array so an empty selection still indexes cleanly.
        selected_samples_idx = np.asarray(
            random_selection(sentences, numeric_labels, k), dtype=int
        )
        selected_labels = numeric_labels[selected_samples_idx]

        sampled_frames.append(pd.DataFrame({
            "example": [sentences[i] for i in selected_samples_idx],
            # Map the integer codes back to the original label values
            "label": [unique_labels[label] for label in selected_labels],
            "k": k,
            "fold": fold,
        }))

# Concatenate once instead of growing the frame inside the loop: this avoids
# repeated copying and keeps the integer dtype of `k` and `fold`.
random_class_examples = pd.concat(sampled_frames, ignore_index=True)

# Display the final DataFrame
random_class_examples


In [None]:
# Write both sampled tables to tab-separated files (the index is written too).
for sampled_frame, out_path in (
    (random_bin_examples, 'df_bin_examples_random.csv'),
    (random_class_examples, 'df_class_examples_random.csv'),
):
    sampled_frame.to_csv(out_path, sep='\t')

In [None]:
# Show the relevance_manual == 1 subset of the examples.
examples_class