In [None]:
!pip install datasets

In [None]:
import pandas as pd
from datasets import ClassLabel, Dataset, DatasetDict

# --- Configuration -----------------------------------------------------------
CSV_PATH = "legal_compliance_dataset.csv"
SEED = 42             # one seed shared by per-class sampling and the final shuffle
MAX_PER_CLASS = 1500  # upper bound on the number of rows kept per class

# Split ratios (the test split receives whatever remains after train + validation)
train_frac = 0.7
valid_frac = 0.15
test_frac = 0.15
if abs(train_frac + valid_frac + test_frac - 1.0) > 1e-9:
    raise ValueError("train_frac + valid_frac + test_frac must sum to 1")

# Loading dataset; rows containing any missing value are discarded
df = pd.read_csv(CSV_PATH)
df = df.dropna()
if df.empty:
    raise ValueError(f"No usable rows in {CSV_PATH} after dropping missing values")

# Checking class distribution
print("Original distribution:\n", df['label'].value_counts())

# Number of samples per class: smallest class size, capped at MAX_PER_CLASS
num_samples = min(int(df['label'].value_counts().min()), MAX_PER_CLASS)

# Sampling equally from each class (larger classes are downsampled)
df_samples = [
    df[df['label'] == label].sample(num_samples, random_state=SEED)
    for label in df['label'].unique()
]

# Concatenating balanced dataset
balanced_df = pd.concat(df_samples)

# Renaming columns to match HuggingFace format
balanced_df = balanced_df.rename(columns={'label': 'labels'})

# Encoding labels as integers while remembering which class name each code
# stands for; label_names[i] is the original label of integer code i.
label_categories = balanced_df['labels'].astype('category')
label_names = [str(name) for name in label_categories.cat.categories]
balanced_df['labels'] = label_categories.cat.codes

# Shuffling dataset
balanced_df = balanced_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split boundaries (sizes are truncated to whole rows)
# NOTE(review): this is a plain random split, not a stratified one, so the
# per-class balance is only guaranteed for the dataset as a whole.
train_size = int(train_frac * len(balanced_df))
valid_size = int(valid_frac * len(balanced_df))

train_df = balanced_df.iloc[:train_size]
valid_df = balanced_df.iloc[train_size:train_size + valid_size]
test_df = balanced_df.iloc[train_size + valid_size:]

# Converting to Hugging Face datasets. The labels column is cast to ClassLabel
# so the class names are stored with the dataset instead of being lost.
label_feature = ClassLabel(names=label_names)


def _to_hf_dataset(frame):
    """Build a Dataset from `frame` with 'labels' typed as `label_feature`.

    The pandas index is dropped explicitly so it can never appear as an
    extra column in the resulting dataset.
    """
    dataset = Dataset.from_pandas(frame, preserve_index=False)
    return dataset.cast_column('labels', label_feature)


train_ds = _to_hf_dataset(train_df)
valid_ds = _to_hf_dataset(valid_df)
test_ds = _to_hf_dataset(test_df)

dataset_dict = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})

print(dataset_dict)


Original distribution:
 label
CAN-SPAM Violation    54
CCPA Disclosure       50
CAN-SPAM Compliant    40
Non-Disclosure        35
Name: count, dtype: int64
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 98
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 21
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 21
    })
})


In [2]:
# Start the interactive Hugging Face login for this notebook session.
# NOTE(review): presumably needed so the push_to_hub call later in this
# notebook is authorised to write to the target repo — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Upload every split held in dataset_dict to the Hub dataset repository
# "RohitWani17/legalcompliance".
dataset_dict.push_to_hub("RohitWani17/legalcompliance")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/RohitWani17/legalcompliance/commit/d6c3a6103eb21ac1bd8c653259248bb3f2c5b7c7', commit_message='Upload dataset', commit_description='', oid='d6c3a6103eb21ac1bd8c653259248bb3f2c5b7c7', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/RohitWani17/legalcompliance', endpoint='https://huggingface.co', repo_type='dataset', repo_id='RohitWani17/legalcompliance'), pr_revision=None, pr_num=None)