From 8d7b586ee6de77c71af6785820182c5fc7a162f1 Mon Sep 17 00:00:00 2001
From: Farhan Ahmed
Date: Wed, 13 Dec 2023 20:59:25 -0800
Subject: [PATCH] add hugging face language model demo notebook

Signed-off-by: Farhan Ahmed
---
 .../language_modeling/hugging_face.py       |   5 +-
 notebooks/hugging_face_language_model.ipynb | 478 ++++++++++++++++++
 2 files changed, 481 insertions(+), 2 deletions(-)
 create mode 100644 notebooks/hugging_face_language_model.ipynb

diff --git a/art/estimators/language_modeling/hugging_face.py b/art/estimators/language_modeling/hugging_face.py
index b0c9d8c109..00c87fb970 100644
--- a/art/estimators/language_modeling/hugging_face.py
+++ b/art/estimators/language_modeling/hugging_face.py
@@ -87,7 +87,6 @@ def __init__(
             preprocessing=preprocessing,
         )
 
-        self._model = model
         self._tokenizer = tokenizer
         self._loss = loss
         self._optimizer = optimizer
@@ -276,8 +275,10 @@ def predict(self, x: Optional[Union[str, List[str]]] = None, **kwargs) -> Dict[s
                     inputs[key] = [v_i.to(self._device) for v_i in value]
                 elif isinstance(value[0], np.ndarray):
                     inputs[key] = [torch.from_numpy(v_i).to(self._device) for v_i in value]
-                elif isinstance(value[0], (float, int)):
+                elif isinstance(value[0], list):
                     inputs[key] = torch.tensor(value).to(self._device)
+                elif isinstance(value[0], (float, int)):
+                    inputs[key] = torch.tensor([value]).to(self._device)
                 else:
                     inputs[key] = value
             else:
diff --git a/notebooks/hugging_face_language_model.ipynb b/notebooks/hugging_face_language_model.ipynb
new file mode 100644
index 0000000000..2b3d20ee80
--- /dev/null
+++ b/notebooks/hugging_face_language_model.ipynb
@@ -0,0 +1,478 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Hugging Face Language Models with ART\n",
+ "\n",
+ "In this notebook we will go over how to use Hugging Face language models with ART. This is currently a developing feature, so not all ART tools are supported yet; further tools and development are planned. As of ART 1.17 we support:\n",
+ "* Tokenization\n",
+ "* Inference\n",
+ "* Text Generation\n",
+ "\n",
+ "If you have a use case that is not supported (or you find a bug in this new feature), please raise an issue on ART.\n",
+ "\n",
+ "Let's look at how we can use ART to run Hugging Face language models!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import torch\n",
+ "\n",
+ "from art.estimators.language_modeling import HuggingFaceLanguageModel"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Tokenization\n",
+ "\n",
+ "Using the ART wrapper for the Hugging Face language model, we can easily tokenize text. The wrapper accepts a string (or a list of strings) and outputs the tokens in the same way as a Hugging Face tokenizer.\n",
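+ "\n",
+ "Under the hood, `tokenize` forwards the text and any keyword arguments to the wrapped Hugging Face tokenizer and returns the resulting fields (`input_ids`, `attention_mask`, ...) as plain Python lists. Roughly, it behaves like this sketch (a simplification using a hypothetical `tokenize_sketch` helper, not the exact ART implementation):\n",
+ "\n",
+ "```python\n",
+ "def tokenize_sketch(text, **kwargs):\n",
+ "    # delegate to the wrapped Hugging Face tokenizer\n",
+ "    return tokenizer(text, **kwargs)\n",
+ "```"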
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModel\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = AutoModel.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(\n",
+ "    model=model,\n",
+ "    tokenizer=tokenizer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102],\n",
+ " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0],\n",
+ " 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We can tokenize a string like a normal Hugging Face tokenizer\n",
+ "\n",
+ "output = language_model.tokenize('this is a sample sentence')\n",
+ "output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'input_ids': [[101, 2023, 2003, 1037, 7099, 6251, 102],\n",
+ " [101, 2178, 5164, 102, 0, 0, 0]],\n",
+ " 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]],\n",
+ " 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 0, 0, 0]]}"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We can also tokenize multiple strings and pass any additional keyword arguments\n",
+ "\n",
+ "output = language_model.tokenize(['this is a sample sentence', 'another string'], padding=True, truncation=True)\n",
+ "output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[101, 2023, 2003, 1037, 6251, 102]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We can encode strings into tokens and decode them back into strings\n",
+ "\n",
+ "token_ids = language_model.encode('this is a sentence')\n",
+ "token_ids"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'this is a sentence'"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "strings = language_model.decode(token_ids, skip_special_tokens=True)\n",
+ "strings"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Model Inference\n",
+ "\n",
+ "We can use the ART wrapper to perform inference using the Hugging Face language model. Input strings will be automatically tokenized. Additional keyword arguments can be provided, or the tokenized inputs can be passed in directly. The output will be a dictionary that contains the same fields as the output of the language model.\n",
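+ "\n",
+ "Conceptually, `predict` tokenizes string inputs, moves the resulting tensors to the model's device, runs a forward pass, and converts the outputs back to NumPy arrays. A simplified sketch (a hypothetical `predict_sketch` helper, not the exact ART implementation):\n",
+ "\n",
+ "```python\n",
+ "def predict_sketch(text, **kwargs):\n",
+ "    # tokenize and move the input tensors to the model's device\n",
+ "    inputs = tokenizer(text, return_tensors='pt', padding=True).to(model.device)\n",
+ "    with torch.no_grad():\n",
+ "        outputs = model(**inputs, **kwargs)\n",
+ "    # return each output field as a NumPy array\n",
+ "    return {key: value.cpu().numpy() for key, value in outputs.items()}\n",
+ "```"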
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['last_hidden_state', 'pooler_output'])"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Automatic tokenization\n",
+ "\n",
+ "output = language_model.predict('this is a sentence')\n",
+ "output.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['last_hidden_state', 'pooler_output'])"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Manual tokenization\n",
+ "\n",
+ "tokens = language_model.tokenize('this is a sentence')\n",
+ "output = language_model.predict(**tokens)\n",
+ "output.keys()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dict_keys(['last_hidden_state', 'pooler_output'])"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Additional keyword arguments\n",
+ "\n",
+ "output = language_model.predict('this is a sentence', attention_mask=[1, 1, 1, 1, 0, 0])\n",
+ "output.keys()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Generation\n",
+ "\n",
+ "We can use the ART wrapper to generate text using sequence-to-sequence or decoder models through the Hugging Face API."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('t5-small')\n",
+ "model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(\n",
+ "    model=model,\n",
+ "    tokenizer=tokenizer,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"C'est une belle maison.\""
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We can generate text from a single prompt\n",
+ "\n",
+ "output = language_model.generate('translate English to French: This is a nice house.')\n",
+ "output"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[\"C'est une belle maison.\", 'Das ist ein schönes Haus.']"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# We can generate text for multiple prompts at once\n",
+ "\n",
+ "output = language_model.generate(['translate English to French: This is a nice house.', 'translate English to German: This is a nice house.'])\n",
+ "output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Downstream Tasks\n",
+ "\n",
+ "We can use the ART wrapper to perform downstream tasks using various language models.\n",
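+ "\n",
+ "Each subsection below follows the same pattern: load a task-specific `AutoModelFor*` class together with its tokenizer, wrap both in `HuggingFaceLanguageModel`, and call `predict`. As a generic sketch of the pattern (the concrete classes used are shown in each subsection):\n",
+ "\n",
+ "```python\n",
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(model=model, tokenizer=tokenizer)\n",
+ "output = language_model.predict('a sentence to classify')\n",
+ "```"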
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### BERT Embeddings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(3, 768)"
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModel\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = AutoModel.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(\n",
+ "    model=model,\n",
+ "    tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "sentences = [\n",
+ "    'this is a sample sentence',\n",
+ "    'here is another sentence',\n",
+ "    'and yet another sentence',\n",
+ "]\n",
+ "\n",
+ "output = language_model.predict(sentences)\n",
+ "# use the hidden state at each sequence's final token position as a simple sentence embedding\n",
+ "sentence_embeddings = output['last_hidden_state'][:, -1]\n",
+ "sentence_embeddings.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Sentiment Analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "array([1, 1])"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(\n",
+ "    model=model,\n",
+ "    tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "sentences = [\n",
+ "    'I like apples',\n",
+ "    'I like oranges',\n",
+ "]\n",
+ "\n",
+ "output = language_model.predict(sentences)\n",
+ "# note: the classification head is newly initialized here (see the warning above),\n",
+ "# so the predicted labels demonstrate the API rather than meaningful sentiment\n",
+ "np.argmax(output['logits'], axis=-1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Masked Language Modeling"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']\n",
+ "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'paris'"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from transformers import AutoTokenizer, AutoModelForMaskedLM\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+ "model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')\n",
+ "\n",
+ "language_model = HuggingFaceLanguageModel(\n",
+ "    model=model,\n",
+ "    tokenizer=tokenizer,\n",
+ ")\n",
+ "\n",
+ "output = language_model.predict('The capital of France is [MASK].')\n",
+ "# index -3 selects the [MASK] position, since the tokenized input ends with [MASK], '.', [SEP]\n",
+ "predicted_token_id = output['logits'][0, -3].argmax(axis=-1)\n",
+ "language_model.decode(predicted_token_id)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "art",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.18"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}