From de0df1adc9bb525fd00cba0fdc9ed33891054954 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Thu, 20 Jul 2023 14:36:41 +0530 Subject: [PATCH 01/21] Task: support hf dataset for augmentation --- langtest/augmentation/__init__.py | 58 +++++++++++++++++++++++-------- langtest/langtest.py | 6 ++-- 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/langtest/augmentation/__init__.py b/langtest/augmentation/__init__.py index 0e16c84e8..5ea0084aa 100644 --- a/langtest/augmentation/__init__.py +++ b/langtest/augmentation/__init__.py @@ -11,7 +11,7 @@ from langtest.transform import TestFactory from langtest.utils.custom_types import Sample -from langtest.datahandler.datasource import DataFactory +from langtest.datahandler.datasource import DataFactory, HuggingFaceDataset from langtest.transform.utils import create_terminology from langtest.utils.custom_types.output import NEROutput from langtest.utils.custom_types.predictions import NERPrediction, SequenceLabel @@ -93,7 +93,12 @@ def __init__( with open(self.config) as fread: self.config = yaml.safe_load(fread) - def fix(self, input_path: str, output_path, export_mode: str = "add"): + def fix( + self, + input_path: Optional[Union[str, dict]], + output_path, + export_mode: str = "add", + ): """Applies perturbations to the input data based on the recommendations from harness reports. Args: @@ -108,8 +113,17 @@ def fix(self, input_path: str, output_path, export_mode: str = "add"): Returns: List[Dict[str, Any]]: A list of augmented data samples. 
""" - self.df = DataFactory(input_path, self.task) - data = self.df.load() + if type(input_path) == dict: + self.df = HuggingFaceDataset(input_path["name"], self.task) + data = self.df.load_data( + feature_column=input_path.get("feature_column", "text"), + target_column=input_path.get("target_column", "label"), + split=input_path.get("split", "test"), + subset=input_path.get("subset", None), + ) + else: + self.df = DataFactory(input_path, self.task) + data = self.df.load() TestFactory.is_augment = True supported_tests = TestFactory.test_scenarios() suggest: pd.DataFrame = self.suggestions(self.h_report) @@ -162,19 +176,33 @@ def fix(self, input_path: str, output_path, export_mode: str = "add"): sample_data = random.choices(data, k=int(sample_length)) aug_data, _ = TestFactory.transform(self.task, sample_data, test_type) final_aug_data.extend(aug_data) + if type(input_path) == dict: + if export_mode == "inplace": + final_aug_data = list(hash_map.values()) + self.df.export_data(final_aug_data, output_path) + elif export_mode == "transformed": + final_aug_data = [hash_map[i] for i in hash_map if i in sample_indices] + self.df.export_data(final_aug_data, output_path) + else: + data.extend(final_aug_data) + self.df.export_data(data, output_path) + + TestFactory.is_augment = False + return final_aug_data - if export_mode == "inplace": - final_aug_data = list(hash_map.values()) - self.df.export(final_aug_data, output_path) - elif export_mode == "transformed": - final_aug_data = [hash_map[i] for i in hash_map if i in sample_indices] - self.df.export(final_aug_data, output_path) else: - data.extend(final_aug_data) - self.df.export(data, output_path) - - TestFactory.is_augment = False - return final_aug_data + if export_mode == "inplace": + final_aug_data = list(hash_map.values()) + self.df.export(final_aug_data, output_path) + elif export_mode == "transformed": + final_aug_data = [hash_map[i] for i in hash_map if i in sample_indices] + self.df.export(final_aug_data, 
output_path) + else: + data.extend(final_aug_data) + self.df.export(data, output_path) + + TestFactory.is_augment = False + return final_aug_data def suggestions(self, report: "pd.DataFrame") -> "pd.DataFrame": """Calculates suggestions for improving test performance based on a given report. diff --git a/langtest/langtest.py b/langtest/langtest.py index cf57e482c..075666afd 100644 --- a/langtest/langtest.py +++ b/langtest/langtest.py @@ -717,7 +717,7 @@ def generated_results(self) -> Optional[pd.DataFrame]: def augment( self, - input_path: str, + input_path: Optional[Union[str, dict]], output_path: str, custom_proportions: Union[Dict, List] = None, export_mode: str = "add", @@ -726,7 +726,9 @@ def augment( """Augments the data in the input file located at `input_path` and saves the result to `output_path`. Args: - input_path (str): Path to the input file. + input_path (Union[str, dict]): The path to the input data file or a dictionary containing the huggingface dataset directly. + If a dictionary is provided, the keys 'name', 'feature_column', 'target_column', + 'split', and 'subset' can be used to specify the dataset details. output_path (str): Path to save the augmented data. custom_proportions (Union[Dict, List]): export_mode (str, optional): Determines how the samples are modified or exported. From 7dcd8cb52252b8d449154c9b31afc663547b003e Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Thu, 20 Jul 2023 14:49:33 +0530 Subject: [PATCH 02/21] fix(augmentation/__init__.py): Bug fix in export_mode = transformed --- langtest/augmentation/__init__.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/langtest/augmentation/__init__.py b/langtest/augmentation/__init__.py index 5ea0084aa..1323e3d55 100644 --- a/langtest/augmentation/__init__.py +++ b/langtest/augmentation/__init__.py @@ -102,16 +102,17 @@ def fix( """Applies perturbations to the input data based on the recommendations from harness reports. 
Args: - input_path (str): The path to the input data file. + input_path (Union[str, dict]): The path to the input data file or a dictionary containing the huggingface dataset directly. + If a dictionary is provided, the keys 'name', 'feature_column', 'target_column', + 'split', and 'subset' can be used to specify the dataset details. output_path (str): The path to save the augmented data file. export_mode (str, optional): Determines how the samples are modified or exported. - 'inplace': Modifies the list of samples in place. - 'add': Adds new samples to the input data. - 'transformed': Exports only the transformed data, excluding untransformed samples. Defaults to 'add'. - - Returns: - List[Dict[str, Any]]: A list of augmented data samples. + Returns: + List[Dict[str, Any]]: A list of augmented data samples. """ if type(input_path) == dict: self.df = HuggingFaceDataset(input_path["name"], self.task) @@ -136,7 +137,7 @@ def fix( final_aug_data = [] hash_map = {k: v for k, v in enumerate(data)} - + transformed_data = [] for proportion in suggest.iterrows(): cat = proportion[-1]["category"].lower() if cat not in ["robustness", "bias"]: @@ -149,7 +150,7 @@ def fix( * self.max_prop * (proportion[-1]["proportion_increase"] / sum_propotion) ) - if export_mode in ("inplace", "transformed"): + if export_mode in ("inplace"): sample_indices = random.sample( range(0, len(data)), int(sample_length) ) @@ -176,13 +177,16 @@ def fix( sample_data = random.choices(data, k=int(sample_length)) aug_data, _ = TestFactory.transform(self.task, sample_data, test_type) final_aug_data.extend(aug_data) + + if export_mode == "transformed": + transformed_data.extend(aug_data) if type(input_path) == dict: + if export_mode == "inplace": final_aug_data = list(hash_map.values()) self.df.export_data(final_aug_data, output_path) elif export_mode == "transformed": - final_aug_data = [hash_map[i] for i in hash_map if i in sample_indices] - self.df.export_data(final_aug_data, output_path) + 
self.df.export_data(transformed_data, output_path) else: data.extend(final_aug_data) self.df.export_data(data, output_path) @@ -195,8 +199,7 @@ def fix( final_aug_data = list(hash_map.values()) self.df.export(final_aug_data, output_path) elif export_mode == "transformed": - final_aug_data = [hash_map[i] for i in hash_map if i in sample_indices] - self.df.export(final_aug_data, output_path) + self.df.export(transformed_data, output_path) else: data.extend(final_aug_data) self.df.export(data, output_path) From 56128c994a9bf4a28e47ad98e7093c31ab21e895 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Thu, 20 Jul 2023 19:48:32 +0530 Subject: [PATCH 03/21] Test(test/test_augmentation.py): added test for coverage --- langtest/augmentation/__init__.py | 1 - tests/test_augmentation.py | 54 +++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/langtest/augmentation/__init__.py b/langtest/augmentation/__init__.py index 1323e3d55..030be97a3 100644 --- a/langtest/augmentation/__init__.py +++ b/langtest/augmentation/__init__.py @@ -181,7 +181,6 @@ def fix( if export_mode == "transformed": transformed_data.extend(aug_data) if type(input_path) == dict: - if export_mode == "inplace": final_aug_data = list(hash_map.values()) self.df.export_data(final_aug_data, output_path) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 3727c9bd4..815e166b9 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -37,6 +37,20 @@ def setUp(self) -> None: "config": "tests/fixtures/config_ner.yaml", "hub": "huggingface", }, + "spacy_textclassification_csv_dataset": { + "task": "text-classification", + "model": "textcat_imdb", + "data": "imdb/sample.csv", + "config": "tests/fixtures/config_ner.yaml", + "hub": "spacy", + }, + "huggingface_textclassification_csv_dataset": { + "task": "text-classification", + "model": "lvwerra/distilbert-imdb", + "data": "imdb/sample.csv", + "config": "tests/fixtures/config_ner.yaml", + 
"hub": "huggingface", + }, } def test_augment_robustness(self): @@ -163,6 +177,46 @@ def test_spacy_templatic_augmentation(self): is_file_exist = pl.Path("tests/fixtures/augmentated_train.conll").is_file() self.assertTrue(is_file_exist) + def test_csv_dataset_textclassification_hf(self): + """ + Test augmentation using Hugging Face NER model. + """ + harness = Harness(**self.params["huggingface_textclassification_csv_dataset"]) + self.assertIsInstance(harness, Harness) + harness.data = harness.data[:50] + report = harness.generate().run().report() + self.assertIsInstance(report, pd.DataFrame) + + harness.augment( + input_path="imdb/sample.csv", + output_path="augmented_train_transformed.csv", + export_mode="transformed", + ) + is_file_exist = pl.Path( + "tests/fixtures/augmented_train_transformed.csv" + ).is_file() + self.assertTrue(is_file_exist) + + def test_csv_dataset_textclassification_spacy(self): + """ + Test augmentation using Hugging Face NER model. + """ + harness = Harness(**self.params["spacy_textclassification_csv_dataset"]) + self.assertIsInstance(harness, Harness) + harness.data = harness.data[:50] + report = harness.generate().run().report() + self.assertIsInstance(report, pd.DataFrame) + + harness.augment( + input_path="imdb/sample.csv", + output_path="augmented_train_transformed.csv", + export_mode="transformed", + ) + is_file_exist = pl.Path( + "tests/fixtures/augmented_train_transformed.csv" + ).is_file() + self.assertTrue(is_file_exist) + class TestTemplaticAugmentation(unittest.TestCase): """Test case for the TemplaticAugment class""" From 20347a748ebf25529c46cc1b7489c98dd08e4d3b Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Thu, 20 Jul 2023 20:00:12 +0530 Subject: [PATCH 04/21] Test(test_augmentation.py): Added more tests --- tests/test_augmentation.py | 95 +++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 
815e166b9..0f9f7aca9 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -9,9 +9,7 @@ class AugmentWorkflowTestCase(unittest.TestCase): - """ - Test case for the AugmentRobustness class. - """ + """Test case for the AugmentRobustness class.""" def setUp(self) -> None: """""" @@ -51,12 +49,24 @@ def setUp(self) -> None: "config": "tests/fixtures/config_ner.yaml", "hub": "huggingface", }, + "spacy_textclassification_hf_dataset": { + "task": "text-classification", + "model": "textcat_imdb", + "data": {"name": "imdb"}, + "config": "tests/fixtures/config_ner.yaml", + "hub": "spacy", + }, + "huggingface_textclassification_hf_dataset": { + "task": "text-classification", + "model": "lvwerra/distilbert-imdb", + "data": {"name": "imdb"}, + "config": "tests/fixtures/config_ner.yaml", + "hub": "huggingface", + }, } def test_augment_robustness(self): - """ - Test augmenting data for robustness. - """ + """Test augmenting data for robustness.""" temp_df = pd.DataFrame( { "test_type": [ @@ -90,9 +100,8 @@ def test_augment_robustness(self): self.assertTrue(is_file_exist) def test_hf_ner_augmentation(self): - """ - Test augmentation using Hugging Face NER model. - """ + """Test augmentation using Hugging Face NER model.""" + harness = Harness(**self.params["huggingface_ner"]) self.assertIsInstance(harness, Harness) report = harness.generate().run().report() @@ -107,9 +116,8 @@ def test_hf_ner_augmentation(self): self.assertTrue(is_file_exist) def test_spacy_ner_augmentation(self): - """ - Test augmentation using spaCy NER model. - """ + """Test augmentation using spaCy NER model.""" + harness = Harness(**self.params["spacy_ner"]) self.assertIsInstance(harness, Harness) report = harness.generate().run().report() @@ -124,9 +132,8 @@ def test_spacy_ner_augmentation(self): self.assertTrue(is_file_exist) def test_custom_proportions_augment_harness(self): - """ - Test augmentation with custom proportions using Hugging Face NER model. 
- """ + """Test augmentation with custom proportions using Hugging Face NER model.""" + harness = Harness(**self.params["huggingface_ner"]) self.assertIsInstance(harness, Harness) report = harness.generate().run().report() @@ -145,9 +152,8 @@ def test_custom_proportions_augment_harness(self): self.assertTrue(is_file_exist) def test_templatic_augmentation(self): - """ - Test augmentation using templatic augmentation. - """ + """Test augmentation using templatic augmentation.""" + generator = TemplaticAugment( templates=["I living in {LOC}", "you are working in {ORG}"], task="ner", @@ -161,9 +167,8 @@ def test_templatic_augmentation(self): self.assertTrue(is_file_exist) def test_spacy_templatic_augmentation(self): - """ - Test augmentation using templatic augmentation with spaCy NER model. - """ + """Test augmentation using templatic augmentation with spaCy NER model.""" + harness = Harness(**self.params["spacy_ner"]) self.assertIsInstance(harness, Harness) report = harness.generate().run().report() @@ -178,9 +183,8 @@ def test_spacy_templatic_augmentation(self): self.assertTrue(is_file_exist) def test_csv_dataset_textclassification_hf(self): - """ - Test augmentation using Hugging Face NER model. - """ + """Test augmentation using Hugging Face text-classification model.""" + harness = Harness(**self.params["huggingface_textclassification_csv_dataset"]) self.assertIsInstance(harness, Harness) harness.data = harness.data[:50] @@ -198,9 +202,8 @@ def test_csv_dataset_textclassification_hf(self): self.assertTrue(is_file_exist) def test_csv_dataset_textclassification_spacy(self): - """ - Test augmentation using Hugging Face NER model. 
- """ + """Test augmentation using Spacy text-classification model.""" + harness = Harness(**self.params["spacy_textclassification_csv_dataset"]) self.assertIsInstance(harness, Harness) harness.data = harness.data[:50] @@ -217,6 +220,44 @@ def test_csv_dataset_textclassification_spacy(self): ).is_file() self.assertTrue(is_file_exist) + def test_hf_dataset_textclassification_hf(self): + """Test augmentation using Hugging Face text-classification model.""" + + harness = Harness(**self.params["huggingface_textclassification_hf_dataset"]) + self.assertIsInstance(harness, Harness) + harness.data = harness.data[:50] + report = harness.generate().run().report() + self.assertIsInstance(report, pd.DataFrame) + + harness.augment( + input_path={"name": "imdb"}, + output_path="augmented_train_transformed.csv", + export_mode="transformed", + ) + is_file_exist = pl.Path( + "tests/fixtures/augmented_train_transformed.csv" + ).is_file() + self.assertTrue(is_file_exist) + + def test_hf_dataset_textclassification_spacy(self): + """Test augmentation using Spacy text-classification model.""" + + harness = Harness(**self.params["spacy_textclassification_hf_dataset"]) + self.assertIsInstance(harness, Harness) + harness.data = harness.data[:50] + report = harness.generate().run().report() + self.assertIsInstance(report, pd.DataFrame) + + harness.augment( + input_path={"name": "imdb"}, + output_path="augmented_train_transformed.csv", + export_mode="transformed", + ) + is_file_exist = pl.Path( + "tests/fixtures/augmented_train_transformed.csv" + ).is_file() + self.assertTrue(is_file_exist) + class TestTemplaticAugmentation(unittest.TestCase): """Test case for the TemplaticAugment class""" From d269f0852e2e76731c3c286c05c74f6478058a32 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Thu, 20 Jul 2023 20:28:52 +0530 Subject: [PATCH 05/21] Test(test/test_augmentation.py): updated path --- tests/test_augmentation.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) 
diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 0f9f7aca9..d546ea589 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -38,14 +38,14 @@ def setUp(self) -> None: "spacy_textclassification_csv_dataset": { "task": "text-classification", "model": "textcat_imdb", - "data": "imdb/sample.csv", + "data": "tests/fixtures/text_classification.csv", "config": "tests/fixtures/config_ner.yaml", "hub": "spacy", }, "huggingface_textclassification_csv_dataset": { "task": "text-classification", "model": "lvwerra/distilbert-imdb", - "data": "imdb/sample.csv", + "data": "tests/fixtures/text_classification.csv", "config": "tests/fixtures/config_ner.yaml", "hub": "huggingface", }, @@ -192,12 +192,12 @@ def test_csv_dataset_textclassification_hf(self): self.assertIsInstance(report, pd.DataFrame) harness.augment( - input_path="imdb/sample.csv", - output_path="augmented_train_transformed.csv", + input_path="tests/fixtures/text_classification.csv", + output_path="tests/fixtures/augmented_text_classification.csv", export_mode="transformed", ) is_file_exist = pl.Path( - "tests/fixtures/augmented_train_transformed.csv" + "tests/fixtures/augmented_text_classification.csv" ).is_file() self.assertTrue(is_file_exist) @@ -211,12 +211,12 @@ def test_csv_dataset_textclassification_spacy(self): self.assertIsInstance(report, pd.DataFrame) harness.augment( - input_path="imdb/sample.csv", - output_path="augmented_train_transformed.csv", + input_path="tests/fixtures/text_classification.csv", + output_path="tests/fixtures/augmented_text_classification.csv", export_mode="transformed", ) is_file_exist = pl.Path( - "tests/fixtures/augmented_train_transformed.csv" + "tests/fixtures/augmented_text_classification.csv" ).is_file() self.assertTrue(is_file_exist) @@ -231,7 +231,7 @@ def test_hf_dataset_textclassification_hf(self): harness.augment( input_path={"name": "imdb"}, - output_path="augmented_train_transformed.csv", + 
output_path="tests/fixtures/augmented_train_transformed.csv", export_mode="transformed", ) is_file_exist = pl.Path( @@ -250,7 +250,7 @@ def test_hf_dataset_textclassification_spacy(self): harness.augment( input_path={"name": "imdb"}, - output_path="augmented_train_transformed.csv", + output_path="tests/fixtures/augmented_train_transformed.csv", export_mode="transformed", ) is_file_exist = pl.Path( From fe566c4fa67ae475f9af3b5e6bd9952238f04914 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Thu, 20 Jul 2023 21:03:47 +0530 Subject: [PATCH 06/21] task(test_augmentation.py): Updated the config path for text-classification --- tests/test_augmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index d546ea589..263f33b57 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -53,14 +53,14 @@ def setUp(self) -> None: "task": "text-classification", "model": "textcat_imdb", "data": {"name": "imdb"}, - "config": "tests/fixtures/config_ner.yaml", + "config": "tests/fixtures/config_text_classification.yaml", "hub": "spacy", }, "huggingface_textclassification_hf_dataset": { "task": "text-classification", "model": "lvwerra/distilbert-imdb", "data": {"name": "imdb"}, - "config": "tests/fixtures/config_ner.yaml", + "config": "tests/fixtures/config_text_classification.yaml", "hub": "huggingface", }, } From 795e6393372a8efc82af26370cc5d8efbb86f38b Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Thu, 20 Jul 2023 21:17:45 +0530 Subject: [PATCH 07/21] task(test_augmentation.py): Updated the config path for text-classification --- tests/test_augmentation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 263f33b57..8d0270a98 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -39,14 +39,14 @@ def setUp(self) -> None: "task": "text-classification", "model": 
"textcat_imdb", "data": "tests/fixtures/text_classification.csv", - "config": "tests/fixtures/config_ner.yaml", + "config": "tests/fixtures/config_text_classification.yaml", "hub": "spacy", }, "huggingface_textclassification_csv_dataset": { "task": "text-classification", "model": "lvwerra/distilbert-imdb", "data": "tests/fixtures/text_classification.csv", - "config": "tests/fixtures/config_ner.yaml", + "config": "tests/fixtures/config_text_classification.yaml", "hub": "huggingface", }, "spacy_textclassification_hf_dataset": { From 81c9835c5f5a48e6d6c8f441cb9a1ad51e486fc6 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Thu, 20 Jul 2023 22:56:07 +0530 Subject: [PATCH 08/21] Task(test_augmentation): Added custom proportions --- tests/test_augmentation.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 8d0270a98..b0489de58 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -190,10 +190,11 @@ def test_csv_dataset_textclassification_hf(self): harness.data = harness.data[:50] report = harness.generate().run().report() self.assertIsInstance(report, pd.DataFrame) - + custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( input_path="tests/fixtures/text_classification.csv", output_path="tests/fixtures/augmented_text_classification.csv", + custom_proportions=custom_proportions, export_mode="transformed", ) is_file_exist = pl.Path( @@ -209,10 +210,11 @@ def test_csv_dataset_textclassification_spacy(self): harness.data = harness.data[:50] report = harness.generate().run().report() self.assertIsInstance(report, pd.DataFrame) - + custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( input_path="tests/fixtures/text_classification.csv", output_path="tests/fixtures/augmented_text_classification.csv", + custom_proportions=custom_proportions, export_mode="transformed", ) is_file_exist = pl.Path( @@ -228,10 +230,11 @@ def 
test_hf_dataset_textclassification_hf(self): harness.data = harness.data[:50] report = harness.generate().run().report() self.assertIsInstance(report, pd.DataFrame) - + custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( input_path={"name": "imdb"}, output_path="tests/fixtures/augmented_train_transformed.csv", + custom_proportions=custom_proportions, export_mode="transformed", ) is_file_exist = pl.Path( @@ -247,10 +250,11 @@ def test_hf_dataset_textclassification_spacy(self): harness.data = harness.data[:50] report = harness.generate().run().report() self.assertIsInstance(report, pd.DataFrame) - + custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( input_path={"name": "imdb"}, output_path="tests/fixtures/augmented_train_transformed.csv", + custom_proportions=custom_proportions, export_mode="transformed", ) is_file_exist = pl.Path( From ffeeb3fcdeecb6658efa3c7f356e9467b0903ca0 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Fri, 21 Jul 2023 00:55:45 +0530 Subject: [PATCH 09/21] task(langtest.py): Updated Args --- langtest/langtest.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/langtest/langtest.py b/langtest/langtest.py index 075666afd..250981943 100644 --- a/langtest/langtest.py +++ b/langtest/langtest.py @@ -717,8 +717,8 @@ def generated_results(self) -> Optional[pd.DataFrame]: def augment( self, - input_path: Optional[Union[str, dict]], - output_path: str, + training_data: dict, + augmented_data: str, custom_proportions: Union[Dict, List] = None, export_mode: str = "add", templates: Optional[Union[str, List[str]]] = None, @@ -726,10 +726,8 @@ def augment( """Augments the data in the input file located at `input_path` and saves the result to `output_path`. Args: - input_path (Union[str, dict]): The path to the input data file or a dictionary containing the huggingface dataset directly. 
- If a dictionary is provided, the keys 'name', 'feature_column', 'target_column', - 'split', and 'subset' can be used to specify the dataset details. - output_path (str): Path to save the augmented data. + training_data (dict): A dictionary containing the input data for augmentation. + augmented_data (str): Path to save the augmented data. custom_proportions (Union[Dict, List]): export_mode (str, optional): Determines how the samples are modified or exported. - 'inplace': Modifies the list of samples in place. @@ -746,9 +744,6 @@ def augment( Note: This method uses an instance of `AugmentRobustness` to perform the augmentation. - Example: - >>> harness = Harness(...) - >>> harness.augment("train.conll", "augmented_train.conll") """ dtypes = list( map( @@ -788,7 +783,7 @@ def augment( _ = TemplaticAugment( templates=templates, task=self.task, - ).fix(input_path=input_path, output_path=output_path) + ).fix(training_data=training_data, output_path=augmented_data) else: _ = AugmentRobustness( @@ -796,7 +791,11 @@ def augment( config=self._config, h_report=self.df_report, custom_proportions=custom_proportions, - ).fix(input_path=input_path, output_path=output_path, export_mode=export_mode) + ).fix( + training_data=training_data, + output_path=augmented_data, + export_mode=export_mode, + ) return self From 631bd4a70aa5bcdeba46c242cbc0ea5ebe254e68 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Fri, 21 Jul 2023 01:09:32 +0530 Subject: [PATCH 10/21] task(augmentation/__init__.py): Updated Args --- langtest/augmentation/__init__.py | 37 +++++++++++++++---------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/langtest/augmentation/__init__.py b/langtest/augmentation/__init__.py index 030be97a3..d870ccea9 100644 --- a/langtest/augmentation/__init__.py +++ b/langtest/augmentation/__init__.py @@ -95,16 +95,14 @@ def __init__( def fix( self, - input_path: Optional[Union[str, dict]], - output_path, + training_data: dict, + output_path: str, export_mode: 
str = "add", ): """Applies perturbations to the input data based on the recommendations from harness reports. Args: - input_path (Union[str, dict]): The path to the input data file or a dictionary containing the huggingface dataset directly. - If a dictionary is provided, the keys 'name', 'feature_column', 'target_column', - 'split', and 'subset' can be used to specify the dataset details. + training_data (dict): A dictionary containing the input data for augmentation. output_path (str): The path to save the augmented data file. export_mode (str, optional): Determines how the samples are modified or exported. - 'inplace': Modifies the list of samples in place. @@ -114,16 +112,17 @@ def fix( Returns: List[Dict[str, Any]]: A list of augmented data samples. """ - if type(input_path) == dict: - self.df = HuggingFaceDataset(input_path["name"], self.task) - data = self.df.load_data( - feature_column=input_path.get("feature_column", "text"), - target_column=input_path.get("target_column", "label"), - split=input_path.get("split", "test"), - subset=input_path.get("subset", None), - ) + if len(training_data) > 1: + if "." 
not in training_data["data_source"]: + self.df = HuggingFaceDataset(training_data["data_source"], self.task) + data = self.df.load_data( + feature_column=training_data.get("feature_column", "text"), + target_column=training_data.get("target_column", "label"), + split=training_data.get("split", "test"), + subset=training_data.get("subset", None), + ) else: - self.df = DataFactory(input_path, self.task) + self.df = DataFactory(training_data["data_source"], self.task) data = self.df.load() TestFactory.is_augment = True supported_tests = TestFactory.test_scenarios() @@ -180,7 +179,7 @@ def fix( if export_mode == "transformed": transformed_data.extend(aug_data) - if type(input_path) == dict: + if len(training_data) > 1: if export_mode == "inplace": final_aug_data = list(hash_map.values()) self.df.export_data(final_aug_data, output_path) @@ -301,7 +300,7 @@ class TemplaticAugment(BaseAugmentaion): Methods: __init__(self, templates: Union[str, List[str]], task: str): Initializes the TemplaticAugment class. - fix(self, input_path: str, output_path: str, *args, **kwargs): Performs the templatic augmentation and exports the results to a specified path. + fix(self, training_data: str, output_path: str, *args, **kwargs): Performs the templatic augmentation and exports the results to a specified path. """ def __init__(self, templates: Union[str, List[str]], task: str) -> None: @@ -322,13 +321,13 @@ def __init__(self, templates: Union[str, List[str]], task: str) -> None: elif isinstance(self.__templates, list) and isinstance(self.__templates[0], str): self.__templates = [self.str_to_sample(i) for i in self.__templates] - def fix(self, input_path: str, output_path: str, max_num=None, *args, **kwargs): + def fix(self, training_data: str, output_path: str, max_num=None, *args, **kwargs): """ This method is used to perform the templatic augmentation. It takes the input data, performs the augmentation and then saves the augmented data to the output path. 
Parameters: - input_path (str): The path to the input data. + training_data (dict): A dictionary containing the input data for augmentation. output_path (str): The path where the augmented data will be saved. *args: Variable length argument list. **kwargs: Arbitrary keyword arguments. @@ -337,7 +336,7 @@ def fix(self, input_path: str, output_path: str, max_num=None, *args, **kwargs): bool: Returns True upon successful completion of the method. """ - df = DataFactory(input_path, self.__task) + df = DataFactory(training_data["data_source"], self.__task) data = df.load() new_data = [] self.__search_results = self.search_sample_results(data) From ef2bd6c7c96739d24c5fef4bd104773a3aa179f1 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Fri, 21 Jul 2023 01:50:41 +0530 Subject: [PATCH 11/21] update: test augmentation --- tests/test_augmentation.py | 67 +++++++++++++------------------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index b0489de58..57f6d030c 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -35,13 +35,6 @@ def setUp(self) -> None: "config": "tests/fixtures/config_ner.yaml", "hub": "huggingface", }, - "spacy_textclassification_csv_dataset": { - "task": "text-classification", - "model": "textcat_imdb", - "data": "tests/fixtures/text_classification.csv", - "config": "tests/fixtures/config_text_classification.yaml", - "hub": "spacy", - }, "huggingface_textclassification_csv_dataset": { "task": "text-classification", "model": "lvwerra/distilbert-imdb", @@ -91,7 +84,8 @@ def test_augment_robustness(self): config=yaml.safe_load("tests/fixtures/config_ner.yaml"), ) augment.fix( - "tests/fixtures/train.conll", "tests/fixtures/augmentated_train.conll" + training_data={"data_source": "tests/fixtures/train.conll"}, + output_path="tests/fixtures/augmentated_train.conll", ) self.assertIsInstance(augment, AugmentRobustness) 
self.assertIsInstance(augment.suggestions(temp_df), pd.DataFrame) @@ -108,8 +102,10 @@ def test_hf_ner_augmentation(self): self.assertIsInstance(report, pd.DataFrame) harness.augment( - "tests/fixtures/train.conll", - "tests/fixtures/augmentated_train.conll", + training_data={ + "data_source": "tests/fixtures/train.conll", + }, + augmented_data="tests/fixtures/augmentated_train.conll", export_mode="inplace", ) is_file_exist = pl.Path("tests/fixtures/augmentated_train.conll").is_file() @@ -124,8 +120,8 @@ def test_spacy_ner_augmentation(self): self.assertIsInstance(report, pd.DataFrame) harness.augment( - "tests/fixtures/train.conll", - "tests/fixtures/augmentated_train.conll", + training_data={"data_source": "tests/fixtures/train.conll"}, + augmented_data="tests/fixtures/augmentated_train.conll", export_mode="inplace", ) is_file_exist = pl.Path("tests/fixtures/augmentated_train.conll").is_file() @@ -142,8 +138,8 @@ def test_custom_proportions_augment_harness(self): proportions = {"uppercase": 0.5, "lowercase": 0.5} harness.augment( - "tests/fixtures/train.conll", - "tests/fixtures/augmentated_train.conll", + training_data={"data_source": "tests/fixtures/train.conll"}, + augmented_data="tests/fixtures/augmentated_train.conll", custom_proportions=proportions, export_mode="inplace", ) @@ -160,8 +156,8 @@ def test_templatic_augmentation(self): ) self.assertIsInstance(generator, TemplaticAugment) generator.fix( - "tests/fixtures/train.conll", - "tests/fixtures/augmentated_train.conll", + training_data={"data_source": "tests/fixtures/train.conll"}, + output_path="tests/fixtures/augmentated_train.conll", ) is_file_exist = pl.Path("tests/fixtures/augmentated_train.conll").is_file() self.assertTrue(is_file_exist) @@ -175,8 +171,8 @@ def test_spacy_templatic_augmentation(self): self.assertIsInstance(report, pd.DataFrame) harness.augment( - "tests/fixtures/train.conll", - "tests/fixtures/augmentated_train.conll", + training_data={"data_source": "tests/fixtures/train.conll"}, 
+ augmented_data="tests/fixtures/augmentated_train.conll", templates=["I living in {LOC}", "you are working in {ORG}"], ) is_file_exist = pl.Path("tests/fixtures/augmentated_train.conll").is_file() @@ -192,28 +188,8 @@ def test_csv_dataset_textclassification_hf(self): self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - input_path="tests/fixtures/text_classification.csv", - output_path="tests/fixtures/augmented_text_classification.csv", - custom_proportions=custom_proportions, - export_mode="transformed", - ) - is_file_exist = pl.Path( - "tests/fixtures/augmented_text_classification.csv" - ).is_file() - self.assertTrue(is_file_exist) - - def test_csv_dataset_textclassification_spacy(self): - """Test augmentation using Spacy text-classification model.""" - - harness = Harness(**self.params["spacy_textclassification_csv_dataset"]) - self.assertIsInstance(harness, Harness) - harness.data = harness.data[:50] - report = harness.generate().run().report() - self.assertIsInstance(report, pd.DataFrame) - custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} - harness.augment( - input_path="tests/fixtures/text_classification.csv", - output_path="tests/fixtures/augmented_text_classification.csv", + training_data={"tests/fixtures/text_classification.csv"}, + augmented_data="tests/fixtures/augmented_text_classification.csv", custom_proportions=custom_proportions, export_mode="transformed", ) @@ -232,8 +208,8 @@ def test_hf_dataset_textclassification_hf(self): self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - input_path={"name": "imdb"}, - output_path="tests/fixtures/augmented_train_transformed.csv", + training_data={"name": "imdb"}, + augmented_data="tests/fixtures/augmented_train_transformed.csv", custom_proportions=custom_proportions, export_mode="transformed", ) @@ -252,8 +228,8 @@ def test_hf_dataset_textclassification_spacy(self): 
self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - input_path={"name": "imdb"}, - output_path="tests/fixtures/augmented_train_transformed.csv", + training_data={"name": "imdb"}, + augmented_data="tests/fixtures/augmented_train_transformed.csv", custom_proportions=custom_proportions, export_mode="transformed", ) @@ -335,7 +311,8 @@ def test_fix(self): templates=["My name is {PER} and I am from {LOC}"], task="ner" ) generator.fix( - input_path=self.conll_path, output_path="/tmp/augmented_conll.conll" + training_data={"data_source": self.conll_path}, + output_path="/tmp/augmented_conll.conll", ) with open("/tmp/augmented_conll.conll", "r") as reader: lines = [line.strip() for line in reader.readlines() if line.strip() != ""] From eed87785a93a8e21c62863be62115db0880ec0c3 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Fri, 21 Jul 2023 02:00:21 +0530 Subject: [PATCH 12/21] task(test_augmentation.py): added data_source --- tests/test_augmentation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_augmentation.py b/tests/test_augmentation.py index 57f6d030c..4b8feb993 100644 --- a/tests/test_augmentation.py +++ b/tests/test_augmentation.py @@ -188,7 +188,7 @@ def test_csv_dataset_textclassification_hf(self): self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - training_data={"tests/fixtures/text_classification.csv"}, + training_data={"data_source": "tests/fixtures/text_classification.csv"}, augmented_data="tests/fixtures/augmented_text_classification.csv", custom_proportions=custom_proportions, export_mode="transformed", @@ -208,7 +208,7 @@ def test_hf_dataset_textclassification_hf(self): self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - training_data={"name": "imdb"}, + training_data={"data_source": "imdb"}, 
augmented_data="tests/fixtures/augmented_train_transformed.csv", custom_proportions=custom_proportions, export_mode="transformed", @@ -228,7 +228,7 @@ def test_hf_dataset_textclassification_spacy(self): self.assertIsInstance(report, pd.DataFrame) custom_proportions = {"uppercase": 0.8, "lowercase": 0.8} harness.augment( - training_data={"name": "imdb"}, + training_data={"data_source": "imdb"}, augmented_data="tests/fixtures/augmented_train_transformed.csv", custom_proportions=custom_proportions, export_mode="transformed", From f331a236f90b01b3f27a9f693eeafc4a47c8ff5f Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Fri, 21 Jul 2023 02:33:09 +0530 Subject: [PATCH 13/21] updated augmentation/__init__.py --- langtest/augmentation/__init__.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/langtest/augmentation/__init__.py b/langtest/augmentation/__init__.py index d870ccea9..f9f835043 100644 --- a/langtest/augmentation/__init__.py +++ b/langtest/augmentation/__init__.py @@ -112,15 +112,14 @@ def fix( Returns: List[Dict[str, Any]]: A list of augmented data samples. """ - if len(training_data) > 1: - if "." not in training_data["data_source"]: - self.df = HuggingFaceDataset(training_data["data_source"], self.task) - data = self.df.load_data( - feature_column=training_data.get("feature_column", "text"), - target_column=training_data.get("target_column", "label"), - split=training_data.get("split", "test"), - subset=training_data.get("subset", None), - ) + if "." 
not in training_data["data_source"]: + self.df = HuggingFaceDataset(training_data["data_source"], self.task) + data = self.df.load_data( + feature_column=training_data.get("feature_column", "text"), + target_column=training_data.get("target_column", "label"), + split=training_data.get("split", "test"), + subset=training_data.get("subset", None), + ) else: self.df = DataFactory(training_data["data_source"], self.task) data = self.df.load() @@ -179,7 +178,7 @@ def fix( if export_mode == "transformed": transformed_data.extend(aug_data) - if len(training_data) > 1: + if "." not in training_data["data_source"]: if export_mode == "inplace": final_aug_data = list(hash_map.values()) self.df.export_data(final_aug_data, output_path) From 4e3be6c247c7634227586a834ada8726545da713 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Tue, 1 Aug 2023 23:49:10 +0530 Subject: [PATCH 14/21] website and notebook updated --- .../misc/Augmentation_Control_Notebook.ipynb | 1815 ++++++++++++++--- docs/pages/docs/generate_augmentation.md | 30 +- 2 files changed, 1585 insertions(+), 260 deletions(-) diff --git a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb index e9a5ad4c4..1e14fd913 100644 --- a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb +++ b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "e7PsSmy9sCoR" @@ -11,7 +10,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "MhgkQYQiEvZt" @@ -21,7 +19,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "WJJzt3RWhEc6" @@ -33,7 +30,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "26qXWhCYhHAt" @@ -54,7 +50,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "Jx4OHnOchSeC" @@ -75,7 +70,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { 
"id": "yR6kjOaiheKN" @@ -88,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": { "id": "lTzSJpMlhgq5" }, @@ -99,7 +93,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "sBcZjwJBhkOw" @@ -113,10 +106,10 @@ "\n", "\n", "| Parameter | Description | \n", - "| - | - | \n", + "| - | - |\n", "|**task** |Task for which the model is to be evaluated (text-classification or ner)|\n", "|**model** |PipelineModel or path to a saved model or pretrained pipeline/model from hub.\n", - "|**data** |Path to the data that is to be used for evaluation. Can be .csv or .conll file in the CoNLL format \n", + "|**data** |Path to the data that is to be used for evaluation. Can be .csv or .conll file in the CoNLL format\n", "|**config** |Configuration for the tests to be performed, specified in form of a YAML file.\n", "|**hub** |model hub to load from the path. Required if model param is passed as path.|\n", "\n", @@ -125,7 +118,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "JFhJ9CcbsKqN" @@ -137,7 +129,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "UtxtE6Y0r4CJ" @@ -151,7 +142,7 @@ "\n", "2. Test NER model robustness on CoNLL test set\n", "\n", - "3. Augment CoNLL training set based on test results \n", + "3. Augment CoNLL training set based on test results\n", "\n", "4. 
Train new NER model on augmented CoNLL training set\n", "\n", @@ -161,7 +152,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "I21Jmq79jgC6" @@ -186,7 +176,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "MNtH_HOUt_PL" @@ -197,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": { "id": "jRnEmCfPhsZs" }, @@ -208,13 +197,13 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bHXeP18sGp-g", - "outputId": "1bd2ea97-e002-451b-d60b-cae915c78fb6" + "outputId": "f50e09d2-8c9c-44d5-9287-be7014d1307f" }, "outputs": [ { @@ -233,7 +222,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "kKgXC7cvuyar" @@ -244,35 +232,85 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": { - "id": "RVk9NWn7u-Lm" + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RVk9NWn7u-Lm", + "outputId": "d542c0fe-78fe-40cd-ce96-a4040b9b040f" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test Configuration : \n", + " {\n", + " \"tests\": {\n", + " \"defaults\": {\n", + " \"min_pass_rate\": 1.0\n", + " },\n", + " \"robustness\": {\n", + " \"add_typo\": {\n", + " \"min_pass_rate\": 0.7\n", + " },\n", + " \"american_to_british\": {\n", + " \"min_pass_rate\": 0.7\n", + " }\n", + " },\n", + " \"accuracy\": {\n", + " \"min_micro_f1_score\": {\n", + " \"min_score\": 0.7\n", + " }\n", + " },\n", + " \"bias\": {\n", + " \"replace_to_female_pronouns\": {\n", + " \"min_pass_rate\": 0.7\n", + " },\n", + " \"replace_to_low_income_country\": {\n", + " \"min_pass_rate\": 0.7\n", + " }\n", + " },\n", + " \"fairness\": {\n", + " \"min_gender_f1_score\": {\n", + " \"min_score\": 0.6\n", + " }\n", + " },\n", + " \"representation\": {\n", + " \"min_label_representation_count\": {\n", + " 
\"min_count\": 50\n", + " }\n", + " }\n", + " }\n", + "}\n" + ] + } + ], "source": [ "harness = Harness(task=\"ner\", model=ner_model, data=\"sample.conll\", hub=\"johnsnowlabs\")" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mynkAUwZyuFN", - "outputId": "a7b97865-fc75-4070-c5b4-0533617a7782" + "outputId": "1ad0c141-bc67-4ac1-bff7-d102a71b8693" }, "outputs": [ { "data": { "text/plain": [ "{'tests': {'defaults': {'min_pass_rate': 0.65},\n", - " 'robustness': {'add_typo': {'min_pass_rate': 0.65},\n", + " 'robustness': {'add_typo': {'min_pass_rate': 0.65},\n", " 'lowercase': {'min_pass_rate': 0.65}}}}" ] }, - "execution_count": 18, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -281,9 +319,9 @@ "harness.configure({\n", " 'tests': {\n", " 'defaults': {'min_pass_rate': 0.65},\n", - " \n", + "\n", " 'robustness': {\n", - " 'add_typo': {'min_pass_rate': 0.65}, \n", + " 'add_typo': {'min_pass_rate': 0.65},\n", " 'lowercase':{'min_pass_rate': 0.65},\n", " }\n", " }\n", @@ -291,7 +329,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "ZPU46A7WigFr" @@ -301,7 +338,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": { "id": "MomLlmTwjpzU" @@ -315,20 +351,27 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UiUNzTwF89ye", - "outputId": "1ec7fe1f-c342-45da-b919-d48e8e082341" + "outputId": "f77a840d-a816-4d2c-9de6-a8a991f047b5" }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 5526.09it/s]\n" + ] + }, { "data": { "text/plain": [] }, - "execution_count": 19, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -338,7 +381,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", 
"metadata": { "id": "UiMIF-o49Bg_" @@ -349,21 +391,22 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "p0tTwFfc891k", - "outputId": "05b03712-2723-418a-936e-2cbbc818f215" + "outputId": "3676052a-635b-4cc3-b23d-1e44f097065b" }, "outputs": [ { "data": { "text/html": [ "\n", - "
\n", + "\n", + "
\n", "
\n", "
\n", "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorytest_typeoriginaltest_caseexpected_resultactual_resultpass
0robustnessadd_speech_to_text_typohide new secretions from the parental unitshide new secretions frum the parental units'NEGATIVENEGATIVETrue
1robustnessadd_speech_to_text_typocontains no wit , only labored gagscontains know witte , only labored gagsNEGATIVENEGATIVETrue
2robustnessadd_speech_to_text_typothat loves its characters and communicates som...that loves its characters and communicates som...POSITIVEPOSITIVETrue
3robustnessadd_speech_to_text_typoremains utterly satisfied to remain the same t...remains utterly satisfied to remain the sejm t...NEGATIVENEGATIVETrue
4robustnessadd_speech_to_text_typoon the worst revenge-of-the-nerds clichés the ...aune the worst revenge-of-the-nerds clichés th...NEGATIVENEGATIVETrue
........................
995robustnessadd_ocr_typotrue startrne ftarPOSITIVENEGATIVEFalse
996robustnessadd_ocr_typohampered -- no , paralyzed -- by a self-indulg...hampered -- n^o , paralyzed -- by a self-indul...NEGATIVENEGATIVETrue
997robustnessadd_ocr_typois expressly for idiots who do n't care what k...is expressly f^r idiots avho do n't caie v\\hat...NEGATIVENEGATIVETrue
998robustnessadd_ocr_typois haunting ... ( it 's ) what punk rock music...is haunting ... ( i^t 's ) v\\hat punk rock mul...POSITIVENEGATIVEFalse
999robustnessadd_ocr_typowhich nurses plot holes gaping enough to pilot...y/hich nurses plot holes gaping enongh t^o pil...NEGATIVENEGATIVETrue
\n", + "

1000 rows × 7 columns

\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + " category test_type \\\n", + "0 robustness add_speech_to_text_typo \n", + "1 robustness add_speech_to_text_typo \n", + "2 robustness add_speech_to_text_typo \n", + "3 robustness add_speech_to_text_typo \n", + "4 robustness add_speech_to_text_typo \n", + ".. ... ... \n", + "995 robustness add_ocr_typo \n", + "996 robustness add_ocr_typo \n", + "997 robustness add_ocr_typo \n", + "998 robustness add_ocr_typo \n", + "999 robustness add_ocr_typo \n", + "\n", + " original \\\n", + "0 hide new secretions from the parental units \n", + "1 contains no wit , only labored gags \n", + "2 that loves its characters and communicates som... \n", + "3 remains utterly satisfied to remain the same t... \n", + "4 on the worst revenge-of-the-nerds clichés the ... \n", + ".. ... \n", + "995 true star \n", + "996 hampered -- no , paralyzed -- by a self-indulg... \n", + "997 is expressly for idiots who do n't care what k... \n", + "998 is haunting ... ( it 's ) what punk rock music... \n", + "999 which nurses plot holes gaping enough to pilot... \n", + "\n", + " test_case expected_result \\\n", + "0 hide new secretions frum the parental units' NEGATIVE \n", + "1 contains know witte , only labored gags NEGATIVE \n", + "2 that loves its characters and communicates som... POSITIVE \n", + "3 remains utterly satisfied to remain the sejm t... NEGATIVE \n", + "4 aune the worst revenge-of-the-nerds clichés th... NEGATIVE \n", + ".. ... ... \n", + "995 trne ftar POSITIVE \n", + "996 hampered -- n^o , paralyzed -- by a self-indul... NEGATIVE \n", + "997 is expressly f^r idiots avho do n't caie v\\hat... NEGATIVE \n", + "998 is haunting ... ( i^t 's ) v\\hat punk rock mul... POSITIVE \n", + "999 y/hich nurses plot holes gaping enongh t^o pil... NEGATIVE \n", + "\n", + " actual_result pass \n", + "0 NEGATIVE True \n", + "1 NEGATIVE True \n", + "2 POSITIVE True \n", + "3 NEGATIVE True \n", + "4 NEGATIVE True \n", + ".. ... ... 
\n", + "995 NEGATIVE False \n", + "996 NEGATIVE True \n", + "997 NEGATIVE True \n", + "998 NEGATIVE False \n", + "999 NEGATIVE True \n", + "\n", + "[1000 rows x 7 columns]" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness.generated_results()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5Erhl6nkCQjB" + }, + "source": [ + "This method returns the generated results in the form of a pandas dataframe, which provides a convenient and easy-to-use format for working with the test results. You can use this method to quickly identify the test cases that failed and to determine where fixes are needed." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2gVoIzpWCFk2" + }, + "source": [ + "#### Report of the tests" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 112 + }, + "id": "xjkaiyLd68y9", + "outputId": "0b788ded-a9af-4bcc-b843-293dd90754b4" + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
categorytest_typefail_countpass_countpass_rateminimum_pass_ratepass
0robustnessadd_speech_to_text_typo3546593%60%True
1robustnessadd_ocr_typo9440681%60%True
\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + "
\n", + " \n", + "
\n", + "\n", + "\n", + "\n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n" + ], + "text/plain": [ + " category test_type fail_count pass_count pass_rate \\\n", + "0 robustness add_speech_to_text_typo 35 465 93% \n", + "1 robustness add_ocr_typo 94 406 81% \n", + "\n", + " minimum_pass_rate pass \n", + "0 60% True \n", + "1 60% True " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "harness.report()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Moh61mF3AvAw" + }, + "source": [ + " Additional parameters (optional): You can pass additional parameters in the `training_data` dictionary to specify the details of the original dataset, such as the data source, subset, feature column, target column, and split. These parameters help in selecting the appropriate data for augmentation.\n", + "\n", + " - Example:\n", + "```\n", + "data_kwargs = {\n", + " \"data_source\": \"glue\",\n", + " \"subset\": \"sst2\",\n", + " \"feature_column\": \"sentence\",\n", + " \"target_column\": \"label\",\n", + " \"split\": \"train\"\n", + "}\n", + "```\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kB6ImMUC9IIO" + }, + "outputs": [], + "source": [ + "custom_proportions = {\n", + " 'add_ocr_typo':0.3\n", + "}\n", + "\n", + "data_kwargs = {\n", + " \"data_source\" : \"glue\",\n", + " \"subset\": \"sst2\",\n", + " \"feature_column\": \"sentence\",\n", + " \"target_column\": \"label\",\n", + " \"split\": \"train\"\n", + " }\n", + "\n", + "\n", + "harness.augment(\n", + " training_data = data_kwargs,\n", + " augmented_data =\"augmented_glue.csv\",\n", + " custom_proportions=custom_proportions,\n", + " export_mode=\"add\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "YPXIxv9D_fR7" + }, + "source": [ + "Essentially it applies perturbations to the input data based on the recommendations from the harness reports. 
Then this augmented_dataset is used to retrain the original model so as to make the model more robust and improve its performance." ] } ], diff --git a/docs/pages/docs/generate_augmentation.md b/docs/pages/docs/generate_augmentation.md index 5e89085ea..9e6bcbb53 100644 --- a/docs/pages/docs/generate_augmentation.md +++ b/docs/pages/docs/generate_augmentation.md @@ -13,13 +13,33 @@ modify_date: "2023-03-28" The library provides a `augment()` method that facilitates the data augmentation process. Several parameters are available: -- **`input_path`**, which is the path to the original training dataset to be augmented -- **`output_path`**, which is the path to save the augmented dataset -- **`inplace`** which is an optional parameter that controls whether the original input file should be augmented by duplicating rows in the dataset. By default, inplace is set to False. If True, the rows are modified in place and the length of the dataset remains similar. Otherwise, new rows are added to the dataset. + +- **`training_data`**: (Required) Specifies the source of the original training data. It should be a dictionary containing the necessary information about the dataset. + +- **`augmented_data`**: (Required) Name of the file to store the augmented data. The augmented dataset will be saved in this file. + +- **`custom_proportions`**: (Optional) A dictionary with the test type as key and the proportion as value. The proportion is the percentage of the test cases that will be augmented with the given augmentation type. + +- **`export_mode`**: (Optional) Specifies how the augmented data should be exported. The possible values are: + - `'inplace'`: Modifies the list of samples in place. + - `'add'`: Adds new samples to the input data. + - `'transformed'`: Exports only the transformed data, excluding untransformed samples.
```python -# Generating augmentations -h.augment(input_path='training_dataset', output_path='augmented_dataset', inplace=False) +custom_proportions = { + 'add_typo':0.3, + 'lowercase':0.3 +} + +data_kwargs = { + "data_source" : "conll03.conll", + } + +h.augment( + training_data = data_kwargs, + augmented_data ="augmented_conll03.conll", + custom_proportions=custom_proportions, + export_mode="transformed") ``` This method applies perturbations to the input data based on the recommendations from the Harness report. This augmented dataset can then be used to retrain a model so as to make it more robust than its previous version. From 62b349c0323b958c8cd95e4f063057f4087f04bb Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Wed, 2 Aug 2023 00:01:33 +0530 Subject: [PATCH 15/21] Docs(generate_aug.md): Updated For hf dataset --- docs/pages/docs/generate_augmentation.md | 29 +++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/docs/pages/docs/generate_augmentation.md b/docs/pages/docs/generate_augmentation.md index 9e6bcbb53..93cfbfbbb 100644 --- a/docs/pages/docs/generate_augmentation.md +++ b/docs/pages/docs/generate_augmentation.md @@ -44,4 +44,31 @@ h.augment( This method applies perturbations to the input data based on the recommendations from the Harness report. This augmented dataset can then be used to retrain a model so as to make it more robust than its previous version. -
\ No newline at end of file +
+ +#### Passing a Hugging Face Dataset for Augmentation + +For Augmentations, we specify the HuggingFace data input in the following way: + +```python +custom_proportions = { + 'add_ocr_typo':0.3 +} + +data_kwargs = { + "data_source" : "glue", + "subset": "sst2", + "feature_column": "sentence", + "target_column": "label", + "split": "train" + } + +harness.augment( + training_data = data_kwargs, + augmented_data ="augmented_glue.csv", + custom_proportions=custom_proportions, + export_mode="add", +) +``` + +
\ No newline at end of file From c94ca9f75a26817b7cc1d522373dfccd79ddffe4 Mon Sep 17 00:00:00 2001 From: Rakshit Khajuria Date: Wed, 2 Aug 2023 13:41:29 +0530 Subject: [PATCH 16/21] Updated website for templatic augmentations --- docs/pages/docs/generate_augmentation.md | 25 ++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docs/pages/docs/generate_augmentation.md b/docs/pages/docs/generate_augmentation.md index 93cfbfbbb..e63f40153 100644 --- a/docs/pages/docs/generate_augmentation.md +++ b/docs/pages/docs/generate_augmentation.md @@ -46,6 +46,31 @@ This method applies perturbations to the input data based on the recommendations
+#### Templatic Augmentations + +Templatic Augmentation is a technique that allows you to generate new training data by applying a set of predefined templates to the original training data. The templates are designed to introduce noise into the training data in a way that simulates real-world conditions. The augmentation process is controlled by a configuration file that specifies the augmentation templates to be used and the proportion of the training data to be augmented. The augmentation process is performed by the `augment()` method of the **Harness** class. + +The templates are passed to `augment()` through the `templates` argument, together with the training data to be augmented, as shown below. + +```python +template = ["The {ORG} company is located in {LOC}", "The {ORG} company is located in {LOC} and is owned by {PER}"] + +``` + +```python +data_kwargs = { + "data_source" : "conll03.conll", + } + +harness.augment( + training_data=data_kwargs, + augmented_data='augmented_conll03.conll', + templates=template, + ) +``` + +
+ #### Passing a Hugging Face Dataset for Augmentation For Augmentations, we specify the HuggingFace data input in the following way: From 9ebaa5c1348e48adbe9cd826ca300b373e3d13c2 Mon Sep 17 00:00:00 2001 From: Prikshit7766 Date: Wed, 2 Aug 2023 13:44:03 +0530 Subject: [PATCH 17/21] Augmentation notebook updated --- .../misc/Augmentation_Control_Notebook.ipynb | 8 +- .../Templatic_Augmentation_Notebook.ipynb | 785 ++++++++---------- 2 files changed, 331 insertions(+), 462 deletions(-) diff --git a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb index 1e14fd913..46a25953b 100644 --- a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb +++ b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb @@ -1441,16 +1441,16 @@ "source": [ "The `.augment()` function takes the following parameters:\n", "\n", - "1. `training_data`: (Required) Specifies the source of the original training data. It should be a dictionary containing the necessary information about the dataset.\n", + "1. `training_data` (dict): (Required) Specifies the source of the original training data. It should be a dictionary containing the necessary information about the dataset.\n", " - Example: `{\"data_source\": \"conll03.conll\"}`\n", "\n", - "2. `augmented_data`: (Required) Name of the file to store the augmented data. The augmented dataset will be saved in this file.\n", + "2. `augmented_data` (str): (Required) Name of the file to store the augmented data. The augmented dataset will be saved in this file.\n", " - Example: `augmented_conll03.conll`\n", "\n", - "3. `custom_proportions`: (Required) custom_proportions is a dictionary with augmentation on test type as key and proportion as value. The proportion is the percentage of the test cases that will be augmented with the given augmentation type.\n", + "3. 
`custom_proportions` (dict): (Required) custom_proportions is a dictionary with augmentation on test type as key and proportion as value. The proportion is the percentage of the test cases that will be augmented with the given augmentation type.\n", " - Example: `{\"add_typo\": 0.3, \"lowercase\": 0.3}`\n", "\n", - "4. `export_mode`: (Optional) Specifies how the augmented data should be exported. The possible values are:\n", + "4. `export_mode` (str): (Optional) Specifies how the augmented data should be exported. The possible values are:\n", " - `'inplace'`: Modifies the list of samples in place.\n", " - `'add'`: Adds new samples to the input data.\n", " - `'transformed'`: Exports only the transformed data, excluding different untransformed samples.\n", diff --git a/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb b/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb index 4e6ff7067..1bf1dddfd 100644 --- a/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb +++ b/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb @@ -40,146 +40,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "oGIyE43uhTxH", - "outputId": "b581c350-77e9-4a07-d373-ae53fb6eb9b5" + "id": "oGIyE43uhTxH" }, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Collecting langtest[johnsnowlabs]\n", - " Downloading langtest-1.1.0-py3-none-any.whl (59.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.8/59.8 MB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting jsonlines<4.0.0,>=3.1.0 (from langtest[johnsnowlabs])\n", - " Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)\n", - "Requirement already satisfied: nest-asyncio<2.0.0,>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (1.5.6)\n", - 
"Collecting pandas<3.0.0,>=2.0.3 (from langtest[johnsnowlabs])\n", - " Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m88.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pydantic==1.10.6 (from langtest[johnsnowlabs])\n", - " Downloading pydantic-1.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m92.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pyyaml<7.0,>=6.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (6.0)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.65.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (4.65.0)\n", - "Collecting typing-extensions<4.6.0 (from langtest[johnsnowlabs])\n", - " Downloading typing_extensions-4.5.0-py3-none-any.whl (27 kB)\n", - "Collecting johnsnowlabs==4.3.5 (from langtest[johnsnowlabs])\n", - " Downloading johnsnowlabs-4.3.5-py3-none-any.whl (75 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.7/75.7 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting pyspark==3.1.2 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading pyspark-3.1.2.tar.gz (212.4 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.4/212.4 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", - "Collecting spark-nlp==4.3.2 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading spark_nlp-4.3.2-py2.py3-none-any.whl (473 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m473.2/473.2 kB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nlu==4.2.0 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading nlu-4.2.0-py3-none-any.whl (639 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m639.9/639.9 kB\u001b[0m \u001b[31m49.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting spark-nlp-display==4.1 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading spark_nlp_display-4.1-py3-none-any.whl (95 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.4/95.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (1.22.4)\n", - "Collecting dataclasses (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading dataclasses-0.6-py3-none-any.whl (14 kB)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.27.1)\n", - "Collecting databricks-api (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading databricks_api-0.9.0-py3-none-any.whl (7.4 kB)\n", - "Collecting colorama (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", - "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.10/dist-packages (from nlu==4.2.0->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (9.0.0)\n", - "Collecting py4j==0.10.9 (from 
pyspark==3.1.2->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.6/198.6 kB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: ipython in /usr/local/lib/python3.10/dist-packages (from spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (7.34.0)\n", - "Collecting svgwrite==1.4 (from spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.9/66.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines<4.0.0,>=3.1.0->langtest[johnsnowlabs]) (23.1.0)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (2022.7.1)\n", - "Collecting tzdata>=2022.1 (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs])\n", - " Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (1.16.0)\n", - "Collecting databricks-cli (from databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading databricks-cli-0.17.7.tar.gz (83 kB)\n", - "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.5/83.5 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (1.26.16)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2023.5.7)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.0.12)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.4)\n", - "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (8.1.4)\n", - "Requirement already satisfied: pyjwt>=1.7.0 in /usr/lib/python3/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.3.0)\n", - "Requirement already satisfied: oauthlib>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.2.2)\n", - "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.8.10)\n", - "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (67.7.2)\n", - "Collecting jedi>=0.16 (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n", - " Downloading 
jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m74.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (4.4.2)\n", - "Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.7.5)\n", - "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (5.7.1)\n", - "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.0.39)\n", - "Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.14.0)\n", - "Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.2.0)\n", - "Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.1.6)\n", - "Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (4.8.0)\n", - "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.8.3)\n", - "Requirement already satisfied: ptyprocess>=0.5 in 
/usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.7.0)\n", - "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.2.6)\n", - "Building wheels for collected packages: pyspark, databricks-cli\n", - " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880756 sha256=a525fa77974ef428d0f855d41353c331052adfb594a997d7598044e12271fd11\n", - " Stored in directory: /root/.cache/pip/wheels/ef/70/50/7882e1bcb5693225f7cc86698f10953201b48b3f36317c2d18\n", - " Building wheel for databricks-cli (setup.py) ... \u001b[?25l\u001b[?25hdone\n", - " Created wheel for databricks-cli: filename=databricks_cli-0.17.7-py3-none-any.whl size=143860 sha256=e78be081f408125550e40f4f19107f95f0b21497ad4f0570ed34acd736ebfe3c\n", - " Stored in directory: /root/.cache/pip/wheels/ae/63/93/5402c1a09c1868a59d0b05013484e07af97a9d7b3dbd5bd39a\n", - "Successfully built pyspark databricks-cli\n", - "Installing collected packages: spark-nlp, py4j, dataclasses, tzdata, typing-extensions, svgwrite, pyspark, jsonlines, jedi, colorama, pydantic, pandas, databricks-cli, spark-nlp-display, nlu, langtest, databricks-api, johnsnowlabs\n", - " Attempting uninstall: py4j\n", - " Found existing installation: py4j 0.10.9.7\n", - " Uninstalling py4j-0.10.9.7:\n", - " Successfully uninstalled py4j-0.10.9.7\n", - " Attempting uninstall: typing-extensions\n", - " Found existing installation: typing_extensions 4.7.1\n", - " Uninstalling typing_extensions-4.7.1:\n", - " Successfully uninstalled typing_extensions-4.7.1\n", - " Attempting uninstall: pydantic\n", - " Found existing installation: pydantic 1.10.11\n", - " Uninstalling pydantic-1.10.11:\n", - " Successfully 
uninstalled pydantic-1.10.11\n", - " Attempting uninstall: pandas\n", - " Found existing installation: pandas 1.5.3\n", - " Uninstalling pandas-1.5.3:\n", - " Successfully uninstalled pandas-1.5.3\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.0.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed colorama-0.4.6 databricks-api-0.9.0 databricks-cli-0.17.7 dataclasses-0.6 jedi-0.18.2 johnsnowlabs-4.3.5 jsonlines-3.1.0 langtest-1.1.0 nlu-4.2.0 pandas-2.0.3 py4j-0.10.9 pydantic-1.10.6 pyspark-3.1.2 spark-nlp-4.3.2 spark-nlp-display-4.1 svgwrite-1.4 typing-extensions-4.5.0 tzdata-2023.3\n" - ] - }, - { - "output_type": "display_data", - "data": { - "application/vnd.colab-display-data+json": { - "pip_warning": { - "packages": [ - "dataclasses" - ] - } - } - }, - "metadata": {} - } - ], + "outputs": [], "source": [ "!pip install langtest[johnsnowlabs]" ] @@ -197,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 5, "metadata": { "id": "lTzSJpMlhgq5" }, @@ -277,40 +142,40 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6uW22VqJje8E", - "outputId": "04e3b0ed-6113-4fe6-d316-f7db576fd28e" + "outputId": "a06dccd7-59ca-48b0-f657-811cc0a7ad22" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ - "--2023-07-20 11:31:59-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/langtest/data/conll/sample.conll\n", - "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", - "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... 
connected.\n", + "--2023-08-02 07:26:24-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/langtest/data/conll/sample.conll\n", + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n", + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 50519 (49K) [text/plain]\n", "Saving to: ‘sample.conll’\n", "\n", - "\rsample.conll 0%[ ] 0 --.-KB/s \rsample.conll 100%[===================>] 49.33K --.-KB/s in 0.004s \n", + "\rsample.conll 0%[ ] 0 --.-KB/s \rsample.conll 100%[===================>] 49.33K --.-KB/s in 0.001s \n", "\n", - "2023-07-20 11:32:00 (13.6 MB/s) - ‘sample.conll’ saved [50519/50519]\n", + "2023-08-02 07:26:24 (45.7 MB/s) - ‘sample.conll’ saved [50519/50519]\n", "\n", - "--2023-07-20 11:32:00-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/demo/data/conll03.conll\n", + "--2023-08-02 07:26:24-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/demo/data/conll03.conll\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n", "HTTP request sent, awaiting response... 
200 OK\n", "Length: 827443 (808K) [text/plain]\n", "Saving to: ‘conll03.conll’\n", "\n", - "conll03.conll 100%[===================>] 808.05K --.-KB/s in 0.02s \n", + "conll03.conll 100%[===================>] 808.05K --.-KB/s in 0.05s \n", "\n", - "2023-07-20 11:32:00 (46.7 MB/s) - ‘conll03.conll’ saved [827443/827443]\n", + "2023-08-02 07:26:24 (14.4 MB/s) - ‘conll03.conll’ saved [827443/827443]\n", "\n" ] } @@ -334,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": { "id": "jRnEmCfPhsZs" }, @@ -345,18 +210,18 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bHXeP18sGp-g", - "outputId": "6e09335a-7d95-4b6e-b6af-ec2911c13731" + "outputId": "7cc37e0b-c80e-4d8d-f6e5-fee115404ee9" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Warning::Spark Session already created, some configs may not take.\n", "small_bert_L2_128 download started this may take some time.\n", @@ -380,18 +245,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RVk9NWn7u-Lm", - "outputId": "00146078-e7ba-4787-b3ab-b764aa709ad5" + "outputId": "0b61c376-36df-47e6-fb8f-68dc019bc2fc" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Test Configuration : \n", " {\n", @@ -441,17 +306,16 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mynkAUwZyuFN", - "outputId": "378c66c5-b2e6-4d5a-fc31-bf655366d74a" + "outputId": "13035b12-4f98-483b-dc53-8a9cc59a6e80" }, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "{'tests': {'defaults': {'min_pass_rate': 0.65},\n", @@ -459,8 +323,9 @@ " 'lowercase': {'min_pass_rate': 0.65}}}}" ] }, + 
"execution_count": 10, "metadata": {}, - "execution_count": 6 + "output_type": "execute_result" } ], "source": [ @@ -499,29 +364,29 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "UiUNzTwF89ye", - "outputId": "25ee4b2f-56bb-4822-be59-f1aa82ce2d1c" + "outputId": "533592a1-02a7-4c2b-a75f-4e37c3acb053" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ - "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 4156.89it/s]\n" + "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 4911.36it/s]\n" ] }, { - "output_type": "execute_result", "data": { "text/plain": [] }, + "execution_count": 11, "metadata": {}, - "execution_count": 7 + "output_type": "execute_result" } ], "source": [ @@ -539,52 +404,22 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "p0tTwFfc891k", - "outputId": "f9d626b7-af13-4a13-c157-1ebf09da7281" + "outputId": "d1257af9-4ea7-4a5a-a88f-bc1c520d4abd" }, "outputs": [ { - "output_type": "execute_result", "data": { - "text/plain": [ - " category test_type original \\\n", - "0 robustness add_typo SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI... \n", - "1 robustness add_typo Nadim Ladki \n", - "2 robustness add_typo AL-AIN , United Arab Emirates 1996-12-06 \n", - "3 robustness add_typo Japan began the defence of their Asian Cup tit... \n", - "4 robustness add_typo But China saw their luck desert them in the se... \n", - ".. ... ... ... \n", - "447 robustness lowercase Portuguesa 1 Atletico Mineiro 0 \n", - "448 robustness lowercase CRICKET - LARA ENDURES ANOTHER MISERABLE DAY . \n", - "449 robustness lowercase Robert Galvin \n", - "450 robustness lowercase MELBOURNE 1996-12-06 \n", - "451 robustness lowercase Australia gave Brian Lara another reason to be... 
\n", - "\n", - " test_case \n", - "0 SOCCER - JAPAN GET LUCMY WIN , CHINA IN SURPRI... \n", - "1 Nadim Ladli \n", - "2 AL-AIN , United Arab Smirates 1996-12-06 \n", - "3 Japsn began the defence of their Asian Cup tit... \n", - "4 But China saw their luck desery them in the se... \n", - ".. ... \n", - "447 portuguesa 1 atletico mineiro 0 \n", - "448 cricket - lara endures another miserable day . \n", - "449 robert galvin \n", - "450 melbourne 1996-12-06 \n", - "451 australia gave brian lara another reason to be... \n", - "\n", - "[452 rows x 4 columns]" - ], "text/html": [ "\n", "\n", - "
\n", + "
\n", "
\n", "
\n", "