Merged
31 commits
b1b098d  website: Added Factuality Test to tutorials (RakshitKhajuria, Sep 18, 2023)
ba6ebf7  website: added factuality test (RakshitKhajuria, Sep 18, 2023)
5796523  website: added sensitivity to navigation (RakshitKhajuria, Sep 18, 2023)
d7fe2f3  website: added sensitivity.md (RakshitKhajuria, Sep 18, 2023)
2878583  website: added Sensitivity Test to tutorials (RakshitKhajuria, Sep 18, 2023)
b5550b5  website: added sensitivity test (RakshitKhajuria, Sep 18, 2023)
ce330c3  website: updated title (RakshitKhajuria, Sep 18, 2023)
94571f4  website: added new tasks (RakshitKhajuria, Sep 18, 2023)
c216cf4  website: added boolq-bias and xsum-bias (RakshitKhajuria, Sep 18, 2023)
79fc890  website: added boolq-bias and xsum-bias (RakshitKhajuria, Sep 18, 2023)
8639746  update gender classifier disclaimer (alytarik, Sep 18, 2023)
abb8ab6  updated data.md (Prikshit7766, Sep 18, 2023)
3d6ed32  updated one_liner.md (Prikshit7766, Sep 18, 2023)
9679242  Merge branch 'docs/website-changes' of https://github.com/JohnSnowLab… (Prikshit7766, Sep 18, 2023)
19fed76  Merge branch 'release/1.5.0' into docs/website-changes (alytarik, Sep 18, 2023)
f7f8f03  updated order_bias.md and test.md (Prikshit7766, Sep 18, 2023)
5d260df  update fairness & representation notebook (alytarik, Sep 18, 2023)
b150bf3  Merge branch 'docs/website-changes' of https://github.com/JohnSnowLab… (alytarik, Sep 18, 2023)
3836c1f  updated tutorials.md (Prikshit7766, Sep 18, 2023)
d6da1b9  data.md : Sensitivity Test (Prikshit7766, Sep 18, 2023)
48345ce  add tutorials for wino-bias and legal-support (ArshaanNazir, Sep 19, 2023)
e317349  add wini-bias test to website (ArshaanNazir, Sep 19, 2023)
83cf418  Add Legal-Support Test and Update NB (ArshaanNazir, Sep 19, 2023)
bcdf976  update test.md (ArshaanNazir, Sep 19, 2023)
0df9729  update task/one-liners/NB (ArshaanNazir, Sep 19, 2023)
dad8449  update one-liners (ArshaanNazir, Sep 19, 2023)
ef2bc4e  remove unnecessary transformers (alytarik, Sep 19, 2023)
8a799c7  Chore(notebook): Removed langchain from pip install (RakshitKhajuria, Sep 19, 2023)
6d1fac3  Chore(website): Removed langchain -> model/landing page (RakshitKhajuria, Sep 19, 2023)
2166fd9  Chore(website): Removed langchain -> OneLiners (RakshitKhajuria, Sep 19, 2023)
4510eba  Merge branch 'release/1.5.0' of https://github.com/JohnSnowLabs/langt… (RakshitKhajuria, Sep 19, 2023)
@@ -48,7 +48,7 @@
"metadata": {},
"outputs": [],
"source": [
-"!pip install \"langtest[evaluate,langchain,openai,transformers]\" "
+"!pip install \"langtest[evaluate,openai,transformers]\" "
]
},
{
2 changes: 1 addition & 1 deletion demo/tutorials/llm_notebooks/Clinical_Tests.ipynb
@@ -46,7 +46,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers]\""
+"!pip install \"langtest[openai,transformers]\""
]
},
{
4 changes: 2 additions & 2 deletions demo/tutorials/llm_notebooks/Factuality_Test.ipynb
@@ -36,7 +36,7 @@
"metadata": {},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers]\" "
+"!pip install \"langtest[openai,transformers]\" "
]
},
{
@@ -1391,7 +1391,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
-"version": "3.11.4"
+"version": "3.9.13"
},
"orig_nbformat": 4
},
4 changes: 2 additions & 2 deletions demo/tutorials/llm_notebooks/Legal_Support.ipynb
@@ -46,7 +46,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers]\""
+"!pip install \"langtest[openai]\""
]
},
{
@@ -175,7 +175,7 @@
"id": "jWPAw9q0PwD1"
},
"source": [
-"We have specified task as `wino-bias` , hub as `huggingface` and model as `bert-base-uncased`\n",
+"We have specified task as `legal-tests` , hub as `openai` and model as `text-davinci-002`\n",
"\n"
]
},
@@ -48,7 +48,7 @@
"metadata": {},
"outputs": [],
"source": [
-"!pip install \"langtest[evaluate,langchain,openai,transformers]\" "
+"!pip install \"langtest[evaluate,openai,transformers]\" "
]
},
{
2 changes: 1 addition & 1 deletion demo/tutorials/llm_notebooks/Sensitivity_Test.ipynb
@@ -36,7 +36,7 @@
"metadata": {},
"outputs": [],
"source": [
-"!pip install \"langtest[evaluate,langchain,openai,transformers]\""
+"!pip install \"langtest[evaluate,openai,transformers]\" "
]
},
{
2 changes: 1 addition & 1 deletion demo/tutorials/llm_notebooks/Toxicity_NB.ipynb
@@ -48,7 +48,7 @@
"metadata": {},
"outputs": [],
"source": [
-"!pip install \"langtest[evaluate,langchain,openai,transformers]\" "
+"!pip install \"langtest[evaluate,openai,transformers]\" "
]
},
{


@@ -46,7 +46,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers,evaluate]\""
+"!pip install \"langtest[openai,transformers,evaluate]\""
]
},
{
@@ -46,7 +46,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers,evaluate]\""
+"!pip install \"langtest[openai,transformers,evaluate]\""
]
},
{


2 changes: 1 addition & 1 deletion demo/tutorials/misc/Different_Report_formats.ipynb


2 changes: 1 addition & 1 deletion demo/tutorials/misc/HuggingFace_Dataset_Notebook.ipynb
@@ -3929,7 +3929,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[evaluate,langchain,openai,transformers]\""
+"!pip install \"langtest[evaluate,openai,transformers]\""
]
},
{
2 changes: 1 addition & 1 deletion demo/tutorials/misc/Loading_Data_with_Custom_Columns.ipynb
@@ -39,7 +39,7 @@
},
"outputs": [],
"source": [
-"!pip install \"langtest[langchain,openai,transformers,evaluate]\""
+"!pip install \"langtest[openai,transformers,evaluate]\""
]
},
{
2 changes: 1 addition & 1 deletion demo/tutorials/test-specific-notebooks/Fairness_Demo.ipynb


@@ -46,7 +46,7 @@
},
"outputs": [],
"source": [
-"!pip install langtest[evaluate,langchain,openai,transformers]"
+"!pip install langtest[evaluate,openai,transformers]"
]
},
{
8 changes: 8 additions & 0 deletions docs/_data/navigation.yml
@@ -98,3 +98,11 @@ tests:
url: /docs/pages/tests/security
- title: Disinformation
url: /docs/pages/tests/disinformation
- title: Sensitivity
url: /docs/pages/tests/sensitivity
- title: Factuality
url: /docs/pages/tests/factuality
- title: Wino Bias
url: /docs/pages/tests/wino-bias
- title: Legal
url: /docs/pages/tests/legal
2 changes: 1 addition & 1 deletion docs/_layouts/landing.html
@@ -125,7 +125,7 @@ <h3 class="grey h3_title">{{ _section.title }}</h3>
<div class="highlight-box">
{% highlight python %}

-!pip install "langtest[langchain,openai,transformers]"
+!pip install "langtest[openai,transformers]"

from langtest import Harness

72 changes: 71 additions & 1 deletion docs/pages/docs/data.md
@@ -47,6 +47,8 @@ Supported `data_source` formats are task-dependent. The following table provides
| **clinical-tests** | Select list of curated datasets |
| **disinformation-test** | Select list of curated datasets |
| **political** | Select list of curated datasets |
| **factuality test** | Select list of curated datasets |
| **sensitivity test** | Select list of curated datasets |

</div><div class="h3-box" markdown="1">

@@ -188,6 +190,7 @@ To test Question Answering models, the user is meant to select a benchmark datas
| **BoolQ-dev-tiny** | [BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions](https://aclanthology.org/N19-1300/) | Truncated version of the dev set from the BoolQ dataset, containing 50 labeled examples |
| **BoolQ-test** | [BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions](https://aclanthology.org/N19-1300/) | Test set from the BoolQ dataset, containing 3,245 labeled examples. This dataset does not contain labels and accuracy & fairness tests cannot be run with it. |
| **BoolQ-test-tiny** | [BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions](https://aclanthology.org/N19-1300/) | Truncated version of the test set from the BoolQ dataset, containing 50 labeled examples. This dataset does not contain labels and accuracy & fairness tests cannot be run with it. |
| **BoolQ-bias** | [BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions](https://aclanthology.org/N19-1300/) | Manually annotated bias version of BoolQ dataset, containing 136 labeled examples
| **NQ-open** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Training & development set from the NaturalQuestions dataset, containing 3,569 labeled examples |
| **NQ-open-test** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Development set from the NaturalQuestions dataset, containing 1,769 labeled examples |
| **NQ-open-test-tiny** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Training, development & test set from the NaturalQuestions dataset, containing 50 labeled examples |
@@ -276,6 +279,7 @@ To test Summarization models, the user is meant to select a benchmark dataset fr
| **XSum** | [Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization](https://aclanthology.org/D18-1206/) | Training & development set from the Extreme Summarization (XSum) Dataset, containing 226,711 labeled examples |
| **XSum-test** | [Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization](https://aclanthology.org/D18-1206/) | Test set from the Xsum dataset, containing 1,000 labeled examples |
| **XSum-test-tiny** | [Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization](https://aclanthology.org/D18-1206/) | Truncated version of the test set from the Xsum dataset, containing 50 labeled examples |
| **XSum-bias** | [Don’t Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization](https://aclanthology.org/D18-1206/) | Manually annotated bias version of the Xsum dataset, containing 382 labeled examples

</div><div class="h3-box" markdown="1">
#### Summarization Benchmarks: Use Cases and Evaluations
@@ -391,7 +395,7 @@ harness = Harness(task='disinformation-test',
model={"model": "j2-jumbo-instruct", "hub":"ai21"},
data={"data_source": "Narrative-Wedging"})
```

</div><div class="h3-box" markdown="1">

### Political Test

@@ -418,5 +422,71 @@ harness = Harness(task='political',
model={"model": "j2-jumbo-instruct", "hub":"ai21"})
```

</div><div class="h3-box" markdown="1">

### Factuality Test

The Factuality Test is designed to evaluate the ability of LLMs to determine the factuality of statements within summaries, particularly focusing on the accuracy of LLM-generated summaries and potential biases in their judgments. Users should choose a benchmark dataset from the provided list.

#### Datasets

{:.table2}
| Dataset | Source | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
| **Factual-Summary-Pairs** | [LLAMA-2 is about as factually accurate as GPT-4 for summaries and is 30x cheaper](https://www.anyscale.com/blog/llama-2-is-about-as-factually-accurate-as-gpt-4-for-summaries-and-is-30x-cheaper) | Pairs of factual and incorrect summaries, containing 371 labeled examples. |

</div><div class="h3-box" markdown="1">

#### Factuality Test Dataset: Use Cases and Evaluations

{:.table2}
| Dataset | Use Case | Notebook |
| --------------------- | -------- | -------- |
| **Factual-Summary-Pairs** | The Factuality Test is designed to evaluate the ability of LLMs to determine the factuality of statements within summaries, particularly focusing on the accuracy of LLM-generated summaries and potential biases in their judgments. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/langtest/blob/main/demo/tutorials/llm_notebooks/Factuality_Test.ipynb) |

</div><div class="h3-box" markdown="1">

#### Passing a Factuality Test Dataset to the Harness

In the Harness, we specify the data input in the following way:

```python
# Import Harness from the LangTest library
from langtest import Harness

harness = Harness(task='factuality-test',
model={"model": "text-davinci-003", "hub":"openai"},
data={"data_source": "Factual-Summary-Pairs"})
```
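
The "potential biases in their judgments" mentioned above comes down to order robustness: the judging model sees each factual/incorrect summary pair in both presentation orders, and a verdict that flips with position signals positional bias rather than a factuality judgment. A minimal sketch of that idea, independent of langtest (the `judge` callables and helper below are hypothetical illustrations, not the library's API):

```python
def is_order_robust(judge, summary_a, summary_b):
    """Check that a judge picks the same summary regardless of the
    order in which the two candidates are presented.

    `judge(first, second)` returns 0 or 1: the index of the summary
    it considers factual.
    """
    verdict_ab = judge(summary_a, summary_b)
    verdict_ba = judge(summary_b, summary_a)
    # Map the swapped verdict back to the original ordering: picking
    # index i in the swapped order means index 1 - i originally.
    return verdict_ab == 1 - verdict_ba

# A caricature of positional bias: always prefer the first candidate.
position_biased = lambda first, second: 0
# An order-robust toy judge: prefer the summary tagged "factual".
content_based = lambda first, second: 0 if first == "factual" else 1

print(is_order_robust(position_biased, "factual", "hallucinated"))  # False
print(is_order_robust(content_based, "factual", "hallucinated"))    # True
```

Langtest's actual pass/fail criteria are documented on the Factuality Test page; the sketch only illustrates the order-swap idea behind the bias check.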
</div><div class="h3-box" markdown="1">

### Sensitivity Test

The Sensitivity Test ("Evaluating Model's Sensitivity to Negation") assesses a model's responsiveness to negations introduced into its input text: the objective is to determine whether the model detects the negation and changes its answer accordingly. Users should choose a benchmark dataset from the provided list.

#### Datasets

{:.table2}
| Dataset | Source | Description |
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
| **NQ-open** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Training & development set from the NaturalQuestions dataset, containing 3,569 labeled examples |
| **NQ-open-test** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Development set from the NaturalQuestions dataset, containing 1,769 labeled examples |
| **NQ-open-test-tiny** | [Natural Questions: A Benchmark for Question Answering Research](https://aclanthology.org/Q19-1026/) | Training, development & test set from the NaturalQuestions dataset, containing 50 labeled examples
| **OpenBookQA-test** | [OpenBookQA Dataset](https://allenai.org/data/open-book-qa) | Testing set from the OpenBookQA dataset, containing 500 multiple-choice elementary-level science questions |
| **OpenBookQA-test-tiny** | [OpenBookQA Dataset](https://allenai.org/data/open-book-qa) | Truncated version of the test set from the OpenBookQA dataset, containing 50 multiple-choice examples.

</div><div class="h3-box" markdown="1">

#### Passing a Sensitivity Test Dataset to the Harness

In the Harness, we specify the data input in the following way:

```python
# Import Harness from the LangTest library
from langtest import Harness

harness = Harness(task='sensitivity-test',
model={"model": "text-davinci-003", "hub":"openai"},
data = {"data_source": "NQ-open-test-tiny"})
```
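
The perturbation behind this test can be pictured without the library: insert a negation into the question and check whether the model's answer actually changes. A toy sketch under that assumption (`naive_negate` is a hypothetical helper for illustration, not langtest's actual transformation):

```python
import re

def naive_negate(question: str) -> str:
    """Naively insert 'not' after the first auxiliary verb, e.g.
    'is the sky blue' -> 'is not the sky blue'."""
    return re.sub(r"\b(is|are|was|were|do|does|did)\b",
                  r"\1 not", question, count=1)

def shows_sensitivity(model, question: str) -> bool:
    """A model passes the probe if negating the question changes its answer."""
    return model(question) != model(naive_negate(question))

print(naive_negate("is the sky blue"))  # is not the sky blue

# A model that ignores the wording entirely fails the probe:
constant_model = lambda q: "yes"
print(shows_sensitivity(constant_model, "is the sky blue"))  # False
```

In the real test the `model` call is the LLM under evaluation, and langtest additionally scores how far the two responses diverge rather than a simple equality check.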
</div></div>
10 changes: 5 additions & 5 deletions docs/pages/docs/model.md
@@ -117,7 +117,7 @@ h.generate().run().report()
Using any large language model from the [OpenAI API](https://platform.openai.com/docs/models/overview):

```bash
-!pip install "langtest[langchain,openai,transformers]"
+!pip install "langtest[openai]"
```

```python
@@ -170,7 +170,7 @@ h.generate().run().report()
#### Pretrained Models

```bash
-!pip install "langtest[transformers,langchain,cohere]"
+!pip install "langtest[langchain,cohere]"
```

```python
@@ -193,7 +193,7 @@ h.generate().run().report()
#### Pretrained Models

```bash
-!pip install "langtest[transformers,langchain,ai21]"
+!pip install "langtest[langchain,ai21]"
```

```python
@@ -215,7 +215,7 @@ h.generate().run().report()
#### Pretrained Models

```bash
-!pip install "langtest[transformers,langchain,openai]"
+!pip install "langtest[openai]"
```

```python
@@ -243,7 +243,7 @@ h.generate().run().report()
#### Pretrained Models

```bash
-!pip install "langtest[transformers,langchain,huggingface-hub]"
+!pip install "langtest[langchain,huggingface-hub]"
```

```python