diff --git a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb
index e9a5ad4c4..ec4eac11c 100644
--- a/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb
+++ b/demo/tutorials/misc/Augmentation_Control_Notebook.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "e7PsSmy9sCoR"
@@ -11,7 +10,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "MhgkQYQiEvZt"
@@ -21,7 +19,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "WJJzt3RWhEc6"
@@ -33,7 +30,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "26qXWhCYhHAt"
@@ -54,7 +50,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "Jx4OHnOchSeC"
@@ -75,7 +70,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "yR6kjOaiheKN"
@@ -88,7 +82,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {
"id": "lTzSJpMlhgq5"
},
@@ -99,7 +93,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "sBcZjwJBhkOw"
@@ -113,10 +106,10 @@
"\n",
"\n",
"| Parameter | Description | \n",
- "| - | - | \n",
+ "| - | - |\n",
"|**task** |Task for which the model is to be evaluated (text-classification or ner)|\n",
"|**model** |PipelineModel or path to a saved model or pretrained pipeline/model from hub.\n",
- "|**data** |Path to the data that is to be used for evaluation. Can be .csv or .conll file in the CoNLL format \n",
+ "|**data** |Path to the data that is to be used for evaluation. Can be .csv or .conll file in the CoNLL format\n",
"|**config** |Configuration for the tests to be performed, specified in form of a YAML file.\n",
"|**hub** |model hub to load from the path. Required if model param is passed as path.|\n",
"\n",
@@ -125,7 +118,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "JFhJ9CcbsKqN"
@@ -137,7 +129,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "UtxtE6Y0r4CJ"
@@ -151,7 +142,7 @@
"\n",
"2. Test NER model robustness on CoNLL test set\n",
"\n",
- "3. Augment CoNLL training set based on test results \n",
+ "3. Augment CoNLL training set based on test results\n",
"\n",
"4. Train new NER model on augmented CoNLL training set\n",
"\n",
@@ -161,7 +152,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "I21Jmq79jgC6"
@@ -186,7 +176,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "MNtH_HOUt_PL"
@@ -197,7 +186,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 9,
"metadata": {
"id": "jRnEmCfPhsZs"
},
@@ -208,13 +197,13 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bHXeP18sGp-g",
- "outputId": "1bd2ea97-e002-451b-d60b-cae915c78fb6"
+ "outputId": "f50e09d2-8c9c-44d5-9287-be7014d1307f"
},
"outputs": [
{
@@ -233,7 +222,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "kKgXC7cvuyar"
@@ -244,35 +232,85 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 11,
"metadata": {
- "id": "RVk9NWn7u-Lm"
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "RVk9NWn7u-Lm",
+ "outputId": "d542c0fe-78fe-40cd-ce96-a4040b9b040f"
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Test Configuration : \n",
+ " {\n",
+ " \"tests\": {\n",
+ " \"defaults\": {\n",
+ " \"min_pass_rate\": 1.0\n",
+ " },\n",
+ " \"robustness\": {\n",
+ " \"add_typo\": {\n",
+ " \"min_pass_rate\": 0.7\n",
+ " },\n",
+ " \"american_to_british\": {\n",
+ " \"min_pass_rate\": 0.7\n",
+ " }\n",
+ " },\n",
+ " \"accuracy\": {\n",
+ " \"min_micro_f1_score\": {\n",
+ " \"min_score\": 0.7\n",
+ " }\n",
+ " },\n",
+ " \"bias\": {\n",
+ " \"replace_to_female_pronouns\": {\n",
+ " \"min_pass_rate\": 0.7\n",
+ " },\n",
+ " \"replace_to_low_income_country\": {\n",
+ " \"min_pass_rate\": 0.7\n",
+ " }\n",
+ " },\n",
+ " \"fairness\": {\n",
+ " \"min_gender_f1_score\": {\n",
+ " \"min_score\": 0.6\n",
+ " }\n",
+ " },\n",
+ " \"representation\": {\n",
+ " \"min_label_representation_count\": {\n",
+ " \"min_count\": 50\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}\n"
+ ]
+ }
+ ],
"source": [
"harness = Harness(task=\"ner\", model=ner_model, data=\"sample.conll\", hub=\"johnsnowlabs\")"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mynkAUwZyuFN",
- "outputId": "a7b97865-fc75-4070-c5b4-0533617a7782"
+ "outputId": "1ad0c141-bc67-4ac1-bff7-d102a71b8693"
},
"outputs": [
{
"data": {
"text/plain": [
"{'tests': {'defaults': {'min_pass_rate': 0.65},\n",
- " 'robustness': {'add_typo': {'min_pass_rate': 0.65},\n",
+ " 'robustness': {'add_typo': {'min_pass_rate': 0.65},\n",
" 'lowercase': {'min_pass_rate': 0.65}}}}"
]
},
- "execution_count": 18,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
@@ -281,9 +319,9 @@
"harness.configure({\n",
" 'tests': {\n",
" 'defaults': {'min_pass_rate': 0.65},\n",
- " \n",
+ "\n",
" 'robustness': {\n",
- " 'add_typo': {'min_pass_rate': 0.65}, \n",
+ " 'add_typo': {'min_pass_rate': 0.65},\n",
" 'lowercase':{'min_pass_rate': 0.65},\n",
" }\n",
" }\n",
@@ -291,7 +329,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "ZPU46A7WigFr"
@@ -301,7 +338,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "MomLlmTwjpzU"
@@ -315,20 +351,27 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UiUNzTwF89ye",
- "outputId": "1ec7fe1f-c342-45da-b919-d48e8e082341"
+ "outputId": "f77a840d-a816-4d2c-9de6-a8a991f047b5"
},
"outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 5526.09it/s]\n"
+ ]
+ },
{
"data": {
"text/plain": []
},
- "execution_count": 19,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -338,7 +381,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {
"id": "UiMIF-o49Bg_"
@@ -349,21 +391,22 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 15,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "p0tTwFfc891k",
- "outputId": "05b03712-2723-418a-936e-2cbbc818f215"
+ "outputId": "3676052a-635b-4cc3-b23d-1e44f097065b"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
- "
\n",
+ "\n",
+ "
\n",
"
\n",
"
\n",
"\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " category | \n",
+ " test_type | \n",
+ " original | \n",
+ " test_case | \n",
+ " expected_result | \n",
+ " actual_result | \n",
+ " pass | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " hide new secretions from the parental units | \n",
+ " hide new secretions frum the parental units' | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " contains no wit , only labored gags | \n",
+ " contains know witte , only labored gags | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " that loves its characters and communicates som... | \n",
+ " that loves its characters and communicates som... | \n",
+ " POSITIVE | \n",
+ " POSITIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " remains utterly satisfied to remain the same t... | \n",
+ " remains utterly satisfied to remain the sejm t... | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " on the worst revenge-of-the-nerds clichés the ... | \n",
+ " aune the worst revenge-of-the-nerds clichés th... | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " | 995 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " true star | \n",
+ " trne ftar | \n",
+ " POSITIVE | \n",
+ " NEGATIVE | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 996 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " hampered -- no , paralyzed -- by a self-indulg... | \n",
+ " hampered -- n^o , paralyzed -- by a self-indul... | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 997 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " is expressly for idiots who do n't care what k... | \n",
+ " is expressly f^r idiots avho do n't caie v\\hat... | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 998 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " is haunting ... ( it 's ) what punk rock music... | \n",
+ " is haunting ... ( i^t 's ) v\\hat punk rock mul... | \n",
+ " POSITIVE | \n",
+ " NEGATIVE | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " | 999 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " which nurses plot holes gaping enough to pilot... | \n",
+ " y/hich nurses plot holes gaping enongh t^o pil... | \n",
+ " NEGATIVE | \n",
+ " NEGATIVE | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1000 rows × 7 columns
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " category test_type \\\n",
+ "0 robustness add_speech_to_text_typo \n",
+ "1 robustness add_speech_to_text_typo \n",
+ "2 robustness add_speech_to_text_typo \n",
+ "3 robustness add_speech_to_text_typo \n",
+ "4 robustness add_speech_to_text_typo \n",
+ ".. ... ... \n",
+ "995 robustness add_ocr_typo \n",
+ "996 robustness add_ocr_typo \n",
+ "997 robustness add_ocr_typo \n",
+ "998 robustness add_ocr_typo \n",
+ "999 robustness add_ocr_typo \n",
+ "\n",
+ " original \\\n",
+ "0 hide new secretions from the parental units \n",
+ "1 contains no wit , only labored gags \n",
+ "2 that loves its characters and communicates som... \n",
+ "3 remains utterly satisfied to remain the same t... \n",
+ "4 on the worst revenge-of-the-nerds clichés the ... \n",
+ ".. ... \n",
+ "995 true star \n",
+ "996 hampered -- no , paralyzed -- by a self-indulg... \n",
+ "997 is expressly for idiots who do n't care what k... \n",
+ "998 is haunting ... ( it 's ) what punk rock music... \n",
+ "999 which nurses plot holes gaping enough to pilot... \n",
+ "\n",
+ " test_case expected_result \\\n",
+ "0 hide new secretions frum the parental units' NEGATIVE \n",
+ "1 contains know witte , only labored gags NEGATIVE \n",
+ "2 that loves its characters and communicates som... POSITIVE \n",
+ "3 remains utterly satisfied to remain the sejm t... NEGATIVE \n",
+ "4 aune the worst revenge-of-the-nerds clichés th... NEGATIVE \n",
+ ".. ... ... \n",
+ "995 trne ftar POSITIVE \n",
+ "996 hampered -- n^o , paralyzed -- by a self-indul... NEGATIVE \n",
+ "997 is expressly f^r idiots avho do n't caie v\\hat... NEGATIVE \n",
+ "998 is haunting ... ( i^t 's ) v\\hat punk rock mul... POSITIVE \n",
+ "999 y/hich nurses plot holes gaping enongh t^o pil... NEGATIVE \n",
+ "\n",
+ " actual_result pass \n",
+ "0 NEGATIVE True \n",
+ "1 NEGATIVE True \n",
+ "2 POSITIVE True \n",
+ "3 NEGATIVE True \n",
+ "4 NEGATIVE True \n",
+ ".. ... ... \n",
+ "995 NEGATIVE False \n",
+ "996 NEGATIVE True \n",
+ "997 NEGATIVE True \n",
+ "998 NEGATIVE False \n",
+ "999 NEGATIVE True \n",
+ "\n",
+ "[1000 rows x 7 columns]"
+ ]
+ },
+ "execution_count": 46,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "harness.generated_results()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5Erhl6nkCQjB"
+ },
+ "source": [
+ "This method returns the generated results in the form of a pandas dataframe, which provides a convenient and easy-to-use format for working with the test results. You can use this method to quickly identify the test cases that failed and to determine where fixes are needed."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2gVoIzpWCFk2"
+ },
+ "source": [
+ "#### Report of the tests"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 112
+ },
+ "id": "xjkaiyLd68y9",
+ "outputId": "0b788ded-a9af-4bcc-b843-293dd90754b4"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " category | \n",
+ " test_type | \n",
+ " fail_count | \n",
+ " pass_count | \n",
+ " pass_rate | \n",
+ " minimum_pass_rate | \n",
+ " pass | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " robustness | \n",
+ " add_speech_to_text_typo | \n",
+ " 35 | \n",
+ " 465 | \n",
+ " 93% | \n",
+ " 60% | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " robustness | \n",
+ " add_ocr_typo | \n",
+ " 94 | \n",
+ " 406 | \n",
+ " 81% | \n",
+ " 60% | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ "\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n"
+ ],
+ "text/plain": [
+ " category test_type fail_count pass_count pass_rate \\\n",
+ "0 robustness add_speech_to_text_typo 35 465 93% \n",
+ "1 robustness add_ocr_typo 94 406 81% \n",
+ "\n",
+ " minimum_pass_rate pass \n",
+ "0 60% True \n",
+ "1 60% True "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "harness.report()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Moh61mF3AvAw"
+ },
+ "source": [
+ " Additional parameters (optional): You can pass additional parameters in the `training_data` dictionary to specify the details of the original dataset, such as the data source, subset, feature column, target column, and split. These parameters help in selecting the appropriate data for augmentation.\n",
+ "\n",
+ " - Example:\n",
+ "```\n",
+ "data_kwargs = {\n",
+ " \"data_source\": \"glue\",\n",
+ " \"subset\": \"sst2\",\n",
+ " \"feature_column\": \"sentence\",\n",
+ " \"target_column\": \"label\",\n",
+ " \"split\": \"train\"\n",
+ "}\n",
+ "```\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kB6ImMUC9IIO"
+ },
+ "outputs": [],
+ "source": [
+ "custom_proportions = {\n",
+ " 'add_ocr_typo':0.3\n",
+ "}\n",
+ "\n",
+ "data_kwargs = {\n",
+ " \"data_source\" : \"glue\",\n",
+ " \"subset\": \"sst2\",\n",
+ " \"feature_column\": \"sentence\",\n",
+ " \"target_column\": \"label\",\n",
+ " \"split\": \"train\"\n",
+ " }\n",
+ "\n",
+ "\n",
+ "harness.augment(\n",
+ " training_data = data_kwargs,\n",
+ " save_data_path =\"augmented_glue.csv\",\n",
+ " custom_proportions=custom_proportions,\n",
+ " export_mode=\"add\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "YPXIxv9D_fR7"
+ },
+ "source": [
+    "Essentially, it applies perturbations to the input data based on the recommendations from the harness reports. This augmented dataset is then used to retrain the original model, making it more robust and improving its performance."
]
}
],
diff --git a/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb b/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb
index 4e6ff7067..06265ec17 100644
--- a/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb
+++ b/demo/tutorials/misc/Templatic_Augmentation_Notebook.ipynb
@@ -40,146 +40,11 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 1000
- },
- "id": "oGIyE43uhTxH",
- "outputId": "b581c350-77e9-4a07-d373-ae53fb6eb9b5"
+ "id": "oGIyE43uhTxH"
},
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Collecting langtest[johnsnowlabs]\n",
- " Downloading langtest-1.1.0-py3-none-any.whl (59.8 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.8/59.8 MB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hCollecting jsonlines<4.0.0,>=3.1.0 (from langtest[johnsnowlabs])\n",
- " Downloading jsonlines-3.1.0-py3-none-any.whl (8.6 kB)\n",
- "Requirement already satisfied: nest-asyncio<2.0.0,>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (1.5.6)\n",
- "Collecting pandas<3.0.0,>=2.0.3 (from langtest[johnsnowlabs])\n",
- " Downloading pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m88.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hCollecting pydantic==1.10.6 (from langtest[johnsnowlabs])\n",
- " Downloading pydantic-1.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m92.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: pyyaml<7.0,>=6.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (6.0)\n",
- "Requirement already satisfied: tqdm<5.0.0,>=4.65.0 in /usr/local/lib/python3.10/dist-packages (from langtest[johnsnowlabs]) (4.65.0)\n",
- "Collecting typing-extensions<4.6.0 (from langtest[johnsnowlabs])\n",
- " Downloading typing_extensions-4.5.0-py3-none-any.whl (27 kB)\n",
- "Collecting johnsnowlabs==4.3.5 (from langtest[johnsnowlabs])\n",
- " Downloading johnsnowlabs-4.3.5-py3-none-any.whl (75 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.7/75.7 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hCollecting pyspark==3.1.2 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading pyspark-3.1.2.tar.gz (212.4 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m212.4/212.4 MB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- "Collecting spark-nlp==4.3.2 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading spark_nlp-4.3.2-py2.py3-none-any.whl (473 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m473.2/473.2 kB\u001b[0m \u001b[31m31.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hCollecting nlu==4.2.0 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading nlu-4.2.0-py3-none-any.whl (639 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m639.9/639.9 kB\u001b[0m \u001b[31m49.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hCollecting spark-nlp-display==4.1 (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading spark_nlp_display-4.1-py3-none-any.whl (95 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.4/95.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (1.22.4)\n",
- "Collecting dataclasses (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading dataclasses-0.6-py3-none-any.whl (14 kB)\n",
- "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.27.1)\n",
- "Collecting databricks-api (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading databricks_api-0.9.0-py3-none-any.whl (7.4 kB)\n",
- "Collecting colorama (from johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
- "Requirement already satisfied: pyarrow>=0.16.0 in /usr/local/lib/python3.10/dist-packages (from nlu==4.2.0->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (9.0.0)\n",
- "Collecting py4j==0.10.9 (from pyspark==3.1.2->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.6/198.6 kB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: ipython in /usr/local/lib/python3.10/dist-packages (from spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (7.34.0)\n",
- "Collecting svgwrite==1.4 (from spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading svgwrite-1.4-py3-none-any.whl (66 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.9/66.9 kB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines<4.0.0,>=3.1.0->langtest[johnsnowlabs]) (23.1.0)\n",
- "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (2.8.2)\n",
- "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (2022.7.1)\n",
- "Collecting tzdata>=2022.1 (from pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs])\n",
- " Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas<3.0.0,>=2.0.3->langtest[johnsnowlabs]) (1.16.0)\n",
- "Collecting databricks-cli (from databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading databricks-cli-0.17.7.tar.gz (83 kB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m83.5/83.5 kB\u001b[0m \u001b[31m8.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (1.26.16)\n",
- "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2023.5.7)\n",
- "Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.0.12)\n",
- "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.4)\n",
- "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (8.1.4)\n",
- "Requirement already satisfied: pyjwt>=1.7.0 in /usr/lib/python3/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.3.0)\n",
- "Requirement already satisfied: oauthlib>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.2.2)\n",
- "Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.10/dist-packages (from databricks-cli->databricks-api->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.8.10)\n",
- "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (67.7.2)\n",
- "Collecting jedi>=0.16 (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs])\n",
- " Downloading jedi-0.18.2-py2.py3-none-any.whl (1.6 MB)\n",
- "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m74.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
- "\u001b[?25hRequirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (4.4.2)\n",
- "Requirement already satisfied: pickleshare in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.7.5)\n",
- "Requirement already satisfied: traitlets>=4.2 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (5.7.1)\n",
- "Requirement already satisfied: prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (3.0.39)\n",
- "Requirement already satisfied: pygments in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (2.14.0)\n",
- "Requirement already satisfied: backcall in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.2.0)\n",
- "Requirement already satisfied: matplotlib-inline in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.1.6)\n",
- "Requirement already satisfied: pexpect>4.3 in /usr/local/lib/python3.10/dist-packages (from ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (4.8.0)\n",
- "Requirement already satisfied: parso<0.9.0,>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from jedi>=0.16->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.8.3)\n",
- "Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.10/dist-packages (from pexpect>4.3->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.7.0)\n",
- "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0->ipython->spark-nlp-display==4.1->johnsnowlabs==4.3.5->langtest[johnsnowlabs]) (0.2.6)\n",
- "Building wheels for collected packages: pyspark, databricks-cli\n",
- " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880756 sha256=a525fa77974ef428d0f855d41353c331052adfb594a997d7598044e12271fd11\n",
- " Stored in directory: /root/.cache/pip/wheels/ef/70/50/7882e1bcb5693225f7cc86698f10953201b48b3f36317c2d18\n",
- " Building wheel for databricks-cli (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
- " Created wheel for databricks-cli: filename=databricks_cli-0.17.7-py3-none-any.whl size=143860 sha256=e78be081f408125550e40f4f19107f95f0b21497ad4f0570ed34acd736ebfe3c\n",
- " Stored in directory: /root/.cache/pip/wheels/ae/63/93/5402c1a09c1868a59d0b05013484e07af97a9d7b3dbd5bd39a\n",
- "Successfully built pyspark databricks-cli\n",
- "Installing collected packages: spark-nlp, py4j, dataclasses, tzdata, typing-extensions, svgwrite, pyspark, jsonlines, jedi, colorama, pydantic, pandas, databricks-cli, spark-nlp-display, nlu, langtest, databricks-api, johnsnowlabs\n",
- " Attempting uninstall: py4j\n",
- " Found existing installation: py4j 0.10.9.7\n",
- " Uninstalling py4j-0.10.9.7:\n",
- " Successfully uninstalled py4j-0.10.9.7\n",
- " Attempting uninstall: typing-extensions\n",
- " Found existing installation: typing_extensions 4.7.1\n",
- " Uninstalling typing_extensions-4.7.1:\n",
- " Successfully uninstalled typing_extensions-4.7.1\n",
- " Attempting uninstall: pydantic\n",
- " Found existing installation: pydantic 1.10.11\n",
- " Uninstalling pydantic-1.10.11:\n",
- " Successfully uninstalled pydantic-1.10.11\n",
- " Attempting uninstall: pandas\n",
- " Found existing installation: pandas 1.5.3\n",
- " Uninstalling pandas-1.5.3:\n",
- " Successfully uninstalled pandas-1.5.3\n",
- "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
- "google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.0.3 which is incompatible.\u001b[0m\u001b[31m\n",
- "\u001b[0mSuccessfully installed colorama-0.4.6 databricks-api-0.9.0 databricks-cli-0.17.7 dataclasses-0.6 jedi-0.18.2 johnsnowlabs-4.3.5 jsonlines-3.1.0 langtest-1.1.0 nlu-4.2.0 pandas-2.0.3 py4j-0.10.9 pydantic-1.10.6 pyspark-3.1.2 spark-nlp-4.3.2 spark-nlp-display-4.1 svgwrite-1.4 typing-extensions-4.5.0 tzdata-2023.3\n"
- ]
- },
- {
- "output_type": "display_data",
- "data": {
- "application/vnd.colab-display-data+json": {
- "pip_warning": {
- "packages": [
- "dataclasses"
- ]
- }
- }
- },
- "metadata": {}
- }
- ],
+ "outputs": [],
"source": [
"!pip install langtest[johnsnowlabs]"
]
@@ -197,7 +62,7 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 5,
"metadata": {
"id": "lTzSJpMlhgq5"
},
@@ -277,40 +142,40 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "6uW22VqJje8E",
- "outputId": "04e3b0ed-6113-4fe6-d316-f7db576fd28e"
+ "outputId": "a06dccd7-59ca-48b0-f657-811cc0a7ad22"
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
- "--2023-07-20 11:31:59-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/langtest/data/conll/sample.conll\n",
- "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
- "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+ "--2023-08-02 07:26:24-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/langtest/data/conll/sample.conll\n",
+ "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
+ "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 50519 (49K) [text/plain]\n",
"Saving to: ‘sample.conll’\n",
"\n",
- "\rsample.conll 0%[ ] 0 --.-KB/s \rsample.conll 100%[===================>] 49.33K --.-KB/s in 0.004s \n",
+ "\rsample.conll 0%[ ] 0 --.-KB/s \rsample.conll 100%[===================>] 49.33K --.-KB/s in 0.001s \n",
"\n",
- "2023-07-20 11:32:00 (13.6 MB/s) - ‘sample.conll’ saved [50519/50519]\n",
+ "2023-08-02 07:26:24 (45.7 MB/s) - ‘sample.conll’ saved [50519/50519]\n",
"\n",
- "--2023-07-20 11:32:00-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/demo/data/conll03.conll\n",
+ "--2023-08-02 07:26:24-- https://raw.githubusercontent.com/JohnSnowLabs/langtest/main/demo/data/conll03.conll\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 827443 (808K) [text/plain]\n",
"Saving to: ‘conll03.conll’\n",
"\n",
- "conll03.conll 100%[===================>] 808.05K --.-KB/s in 0.02s \n",
+ "conll03.conll 100%[===================>] 808.05K --.-KB/s in 0.05s \n",
"\n",
- "2023-07-20 11:32:00 (46.7 MB/s) - ‘conll03.conll’ saved [827443/827443]\n",
+ "2023-08-02 07:26:24 (14.4 MB/s) - ‘conll03.conll’ saved [827443/827443]\n",
"\n"
]
}
@@ -334,7 +199,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 7,
"metadata": {
"id": "jRnEmCfPhsZs"
},
@@ -345,18 +210,18 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bHXeP18sGp-g",
- "outputId": "6e09335a-7d95-4b6e-b6af-ec2911c13731"
+ "outputId": "7cc37e0b-c80e-4d8d-f6e5-fee115404ee9"
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Warning::Spark Session already created, some configs may not take.\n",
"small_bert_L2_128 download started this may take some time.\n",
@@ -380,18 +245,18 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RVk9NWn7u-Lm",
- "outputId": "00146078-e7ba-4787-b3ab-b764aa709ad5"
+ "outputId": "0b61c376-36df-47e6-fb8f-68dc019bc2fc"
},
"outputs": [
{
- "output_type": "stream",
"name": "stdout",
+ "output_type": "stream",
"text": [
"Test Configuration : \n",
" {\n",
@@ -441,17 +306,16 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 10,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mynkAUwZyuFN",
- "outputId": "378c66c5-b2e6-4d5a-fc31-bf655366d74a"
+ "outputId": "13035b12-4f98-483b-dc53-8a9cc59a6e80"
},
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"{'tests': {'defaults': {'min_pass_rate': 0.65},\n",
@@ -459,8 +323,9 @@
" 'lowercase': {'min_pass_rate': 0.65}}}}"
]
},
+ "execution_count": 10,
"metadata": {},
- "execution_count": 6
+ "output_type": "execute_result"
}
],
"source": [
@@ -499,29 +364,29 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UiUNzTwF89ye",
- "outputId": "25ee4b2f-56bb-4822-be59-f1aa82ce2d1c"
+ "outputId": "533592a1-02a7-4c2b-a75f-4e37c3acb053"
},
"outputs": [
{
- "output_type": "stream",
"name": "stderr",
+ "output_type": "stream",
"text": [
- "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 4156.89it/s]\n"
+ "Generating testcases...: 100%|██████████| 1/1 [00:00<00:00, 4911.36it/s]\n"
]
},
{
- "output_type": "execute_result",
"data": {
"text/plain": []
},
+ "execution_count": 11,
"metadata": {},
- "execution_count": 7
+ "output_type": "execute_result"
}
],
"source": [
@@ -539,52 +404,22 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 423
},
"id": "p0tTwFfc891k",
- "outputId": "f9d626b7-af13-4a13-c157-1ebf09da7281"
+ "outputId": "d1257af9-4ea7-4a5a-a88f-bc1c520d4abd"
},
"outputs": [
{
- "output_type": "execute_result",
"data": {
- "text/plain": [
- " category test_type original \\\n",
- "0 robustness add_typo SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRI... \n",
- "1 robustness add_typo Nadim Ladki \n",
- "2 robustness add_typo AL-AIN , United Arab Emirates 1996-12-06 \n",
- "3 robustness add_typo Japan began the defence of their Asian Cup tit... \n",
- "4 robustness add_typo But China saw their luck desert them in the se... \n",
- ".. ... ... ... \n",
- "447 robustness lowercase Portuguesa 1 Atletico Mineiro 0 \n",
- "448 robustness lowercase CRICKET - LARA ENDURES ANOTHER MISERABLE DAY . \n",
- "449 robustness lowercase Robert Galvin \n",
- "450 robustness lowercase MELBOURNE 1996-12-06 \n",
- "451 robustness lowercase Australia gave Brian Lara another reason to be... \n",
- "\n",
- " test_case \n",
- "0 SOCCER - JAPAN GET LUCMY WIN , CHINA IN SURPRI... \n",
- "1 Nadim Ladli \n",
- "2 AL-AIN , United Arab Smirates 1996-12-06 \n",
- "3 Japsn began the defence of their Asian Cup tit... \n",
- "4 But China saw their luck desery them in the se... \n",
- ".. ... \n",
- "447 portuguesa 1 atletico mineiro 0 \n",
- "448 cricket - lara endures another miserable day . \n",
- "449 robert galvin \n",
- "450 melbourne 1996-12-06 \n",
- "451 australia gave brian lara another reason to be... \n",
- "\n",
- "[452 rows x 4 columns]"
- ],
"text/html": [
"\n",
"\n",
- "