In [1]:
!pip install scikit-learn datasets huggingface_hub

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from datasets import Dataset, DatasetDict
import huggingface_hub

In [3]:
# --- 1. Load and Prepare the Dataset ---
print("Loading the dataset...")
# Load the tab-separated file.
# quoting=3 is csv.QUOTE_NONE: quote characters are kept as ordinary text.
# on_bad_lines='skip' drops malformed rows instead of raising an error.
try:
    df = pd.read_csv("definitions_clean.tsv", sep="\t", on_bad_lines='skip', quoting=3)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: definitions_clean.tsv not found. Please ensure the file is in the correct directory.")
    # Re-raise so execution stops right here with a clear traceback.
    # exit() would instead end the interpreter/kernel session (and, with no
    # argument, report a successful exit status when run as a script).
    raise


# Define the columns that are essential for the task.

required_columns = ['Lemma', 'DefinitionShort', 'DefinitionFull', 'LemmaID', 'MeaningID', 'POS']

# Drop rows where any of the essential columns have missing values.
# This ensures data quality for training.
# This is extra as the data has already been cleaned.
print(f"Initial number of entries: {len(df)}")
# Rebind instead of mutating in place so that re-running this cell is safe.
df = df.dropna(subset=required_columns)
print(f"Entries after dropping rows with missing essential values: {len(df)}")

# Ensure ID columns are of integer type for reliable grouping and referencing.
# Only the two ID columns are cast; casting the whole dataframe would fail on the text columns.
df = df.astype({'LemmaID': int, 'MeaningID': int})


# --- 2. First Split: Training (80%) and Temporary (20%) ---
print("\nPerforming initial split (80% train, 20% temp)...")
# Rows are grouped by lemma: all rows sharing a LemmaID are kept on the same side of the split.
groups = df['LemmaID']

# One shuffle split (n_splits=1) with test_size=0.20; the held-out part is
# divided into validation and test later. The fixed random_state makes the
# partition reproducible.
train_test_splitter = GroupShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
first_split = train_test_splitter.split(df, groups=groups)
train_idx, temp_idx = next(first_split)

# The splitter yields positional indices, hence .iloc.
train_df, temp_df = df.iloc[train_idx], df.iloc[temp_idx]

print(f"Training set size: {len(train_df)} entries")
print(f"Temporary set size: {len(temp_df)} entries")


# --- 3. Second Split: Validation (10%) and Test (10%) from Temporary Set ---
print("\nSplitting temporary set into validation and test sets (50/50 split)...")
# Group the held-out rows by their own LemmaID values.
temp_groups = temp_df['LemmaID']

# Halving the 20% temporary set gives roughly 10% validation and 10% test
# relative to the original data. Same seed as the first split for reproducibility.
val_test_splitter = GroupShuffleSplit(n_splits=1, test_size=0.50, random_state=42)
second_split = val_test_splitter.split(temp_df, groups=temp_groups)
val_idx, test_idx = next(second_split)

# Indices are positions within temp_df, so select with .iloc on temp_df.
validation_df, test_df = temp_df.iloc[val_idx], temp_df.iloc[test_idx]

print(f"Validation set size: {len(validation_df)} entries")
print(f"Test set size: {len(test_df)} entries")


# --- 4. Verification of Data Integrity ---
print("\nVerifying that no LemmaID leaks between splits...")
# Compare the sets of LemmaID *values* found in each split (not the dataframes themselves).
train_lemmas, val_lemmas, test_lemmas = (
    set(split_frame['LemmaID'])
    for split_frame in (train_df, validation_df, test_df)
)

# Every pair of splits must have no LemmaID in common.
assert train_lemmas.isdisjoint(val_lemmas), "Leakage detected between train and validation sets!"
assert train_lemmas.isdisjoint(test_lemmas), "Leakage detected between train and test sets!"
assert val_lemmas.isdisjoint(test_lemmas), "Leakage detected between validation and test sets!"
print("Verification successful: No lemma leakage detected between the sets.")

# --- 5. Save Splits to Files ---
print("\nSaving the splits to TSV files...")

# Write each split as a tab-separated file in the working directory.
# index=False keeps the dataframe index out of the written file.
for split_path, split_frame in (
    ("train_split.tsv", train_df),
    ("validation_split.tsv", validation_df),
    ("test_split.tsv", test_df),
):
    split_frame.to_csv(split_path, sep='\t', index=False)

print("Successfully saved splits to train_split.tsv, validation_split.tsv, and test_split.tsv")

# --- 6. Final Output ---
# Summarize the row counts of the three splits.
print("\nPartitioning complete.")
print(f"Final sizes: Train={len(train_df)}, Validation={len(validation_df)}, Test={len(test_df)}")

Loading the dataset...
Dataset loaded successfully.
Initial number of entries: 34824
Entries after dropping rows with missing essential values: 34824

Performing initial split (80% train, 20% temp)...
Training set size: 27880 entries
Temporary set size: 6944 entries

Splitting temporary set into validation and test sets (50/50 split)...
Validation set size: 3494 entries
Test set size: 3450 entries

Verifying that no LemmaID leaks between splits...
Verification successful: No lemma leakage detected between the sets.

Saving the splits to TSV files...
Successfully saved splits to train_split.tsv, validation_split.tsv, and test_split.tsv

Partitioning complete.
Final sizes: Train=27880, Validation=3494, Test=3450


In [14]:
# --- 7. Convert to Hugging Face Datasets format ---
print("\nConverting DataFrames to Hugging Face DatasetDict...")
# Map each split name to its dataframe; insertion order fixes the split order.
split_frames = {
    "train": train_df,
    "validation": validation_df,
    "test": test_df,
}
# Reset each index first so the converted dataset starts from a fresh 0..n-1 index.
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame.reset_index(drop=True))
    for split_name, split_frame in split_frames.items()
})
print("\nCreated Hugging Face DatasetDict:")
print(raw_datasets)


Converting DataFrames to Hugging Face DatasetDict...

Created Hugging Face DatasetDict:
DatasetDict({
    train: Dataset({
        features: ['Lemma', 'POS', 'MeaningNumber', 'LemmaID', 'MeaningID', 'DefinitionFull', 'DefinitionShort'],
        num_rows: 27880
    })
    validation: Dataset({
        features: ['Lemma', 'POS', 'MeaningNumber', 'LemmaID', 'MeaningID', 'DefinitionFull', 'DefinitionShort'],
        num_rows: 3494
    })
    test: Dataset({
        features: ['Lemma', 'POS', 'MeaningNumber', 'LemmaID', 'MeaningID', 'DefinitionFull', 'DefinitionShort'],
        num_rows: 3450
    })
})


In [15]:
# --- 8. Upload to Hugging Face Hub ---
# Target repository, in the form "my-username/my-awesome-dataset".
# Authentication is required beforehand: run `huggingface-cli login` in a terminal,
# or `python -m huggingface_hub.commands.huggingface_cli login`.
hf_repo_name = "RobbedoesHF/dutch-definitions"

# The placeholder repository name means "not configured": skip the upload in that case.
if hf_repo_name == "my-username/my-awesome-dataset":
    print("\nSkipping upload. To upload your dataset, edit the `hf_repo_name` variable in the script.")
else:
    print(f"\nUploading dataset to Hugging Face Hub repository: {hf_repo_name}")
    try:
        raw_datasets.push_to_hub(hf_repo_name)
        print("Upload complete!")
    except Exception as upload_error:
        # Best-effort upload: report the failure instead of aborting.
        print(f"An error occurred during upload: {upload_error}")
        print("Make sure I'm logged in (`huggingface-cli login`) and the repository name is correct.")


Uploading dataset to Hugging Face Hub repository: RobbedoesHF/dutch-definitions


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Upload complete!
