In [1]:
import pandas as pd
from src.factory import dataset_factory
from src.augment import generate_dataset


class MockTokenizer:
    """Minimal stand-in tokenizer that works on whitespace-separated words.

    It exposes the two methods this script's tokenizer argument is given
    here: ``tokenize`` and ``convert_tokens_to_string``. No state is kept,
    so the default constructor is sufficient.
    """

    def tokenize(self, text: str) -> list[str]:
        """Split *text* on runs of whitespace and return the pieces."""
        pieces = text.split()
        return pieces

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Join *tokens* back into one string using single spaces."""
        separator = " "
        return separator.join(tokens)
    

# Positive examples, one row per class: a marker-wrapped snippet of text
# paired with the label it belongs to. Each text keeps its trailing space.
positive_df = pd.DataFrame(
    [
        ("<CLASS_1> TEXT IS HERE <CLASS_1>. ", "class 1"),
        ("<CLASS_2> TEXT IS HERE <CLASS_2>. ", "class 2"),
        ("<CLASS_3> TEXT IS HERE <CLASS_3>. ", "class 3"),
        ("<CLASS_4> TEXT IS HERE <CLASS_4>. ", "class 4"),
        ("<CLASS_5> TEXT IS HERE <CLASS_5>. ", "class 5"),
    ],
    columns=["text", "label"],
)
# Negative examples: free-form filler text with no class markers, held in a
# single "text" column.
negative_df = pd.DataFrame(
    [
        "This is some rather interesting text. It really makes me go wow! Who would have thought it?",
        "Once upon a time, there was a rather interesting story. It was about a cat and a dog. The cat was a rather interesting character.",
        "This is a rather interesting story. It is about a cat and a dog. The cat is a rather interesting character.",
        "Once upon a time, in a galaxy far far away. The year was 2020 and the world was in chaos. The cat was a rather interesting character.",
        "Hi, I'm making up some random text. The story begins in Paris, where a mouse and a squirrel are having a conversation. The mouse is a rather interesting character.",
        "Hey GPT, can you come up with some random story you know? I'm bored and I need some entertainment. The cat was a rather interesting character.",
    ],
    columns=["text"],
)

# Whitespace-based stand-in so the demo does not need a real tokenizer.
tokenizer = MockTokenizer()

# Bundle the positive and negative example frames into one dataset object.
dataset = dataset_factory(positive_df=positive_df, negative_df=negative_df)

# Label names handed to the generator; these are the same strings used in
# the positive frame's "label" column.
class_names = ["class 1", "class 2", "class 3", "class 4", "class 5"]

# NOTE(review): the numeric settings below are passed through unchanged; their
# exact meaning is defined by src.augment.generate_dataset — confirm there.
dataset_generator = generate_dataset(
    dataset=dataset,
    tokenizer=tokenizer,
    n_positive=1000,
    classes=class_names,
    max_length=50,
    bleed_allowance=0.2,
    max_labels=5,
)

# Last expression of the cell, so the notebook displays whatever this returns.
dataset_generator.multi_label_classification_dataset()

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


Unnamed: 0,text,clipped,class 1,class 5,class 2,class 3,class 4
0,<CLASS_1> TEXT IS HERE <CLASS_1>.,0,1,0,0,0,0
1,<CLASS_2> TEXT IS HERE <CLASS_2>.,0,0,0,1,0,0
2,<CLASS_3> TEXT IS HERE <CLASS_3>.,0,0,0,0,1,0
3,<CLASS_4> TEXT IS HERE <CLASS_4>.,0,0,0,0,0,1
4,<CLASS_5> TEXT IS HERE <CLASS_5>.,0,0,1,0,0,0
...,...,...,...,...,...,...,...
1000,"Once upon a time, there was a rather interesti...",0,0,1,0,1,1
1001,<CLASS_4> TEXT IS HERE <CLASS_4>. Once upon a ...,0,0,1,0,1,1
1002,<CLASS_4> TEXT IS HERE <CLASS_4>. The year was...,0,0,0,0,1,1
1003,<CLASS_5> TEXT IS HERE <CLASS_5>. It is about ...,0,1,1,1,1,0
