In [1]:
! pip install datasets
! pip install transformers

Collecting datasets
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[?25l[K     |█▏                              | 10 kB 19.6 MB/s eta 0:00:01[K     |██▎                             | 20 kB 10.1 MB/s eta 0:00:01[K     |███▍                            | 30 kB 6.9 MB/s eta 0:00:01[K     |████▌                           | 40 kB 7.7 MB/s eta 0:00:01[K     |█████▋                          | 51 kB 5.3 MB/s eta 0:00:01[K     |██████▊                         | 61 kB 5.7 MB/s eta 0:00:01[K     |████████                        | 71 kB 5.6 MB/s eta 0:00:01[K     |█████████                       | 81 kB 6.3 MB/s eta 0:00:01[K     |██████████▏                     | 92 kB 4.9 MB/s eta 0:00:01[K     |███████████▎                    | 102 kB 5.4 MB/s eta 0:00:01[K     |████████████▍                   | 112 kB 5.4 MB/s eta 0:00:01[K     |█████████████▌                  | 122 kB 5.4 MB/s eta 0:00:01[K     |██████████████▊                 | 133 kB 5.4 MB/s eta 0:00:01

# Loading data - IMDB Dataset for sentiment prediction

In [2]:
from datasets import load_dataset

# Download (or reuse from the local cache) the "imdb" dataset.
# The result is indexed by split name; later cells use the "train" and "test" splits.
raw_datasets = load_dataset("imdb")

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenizing the dataset using bert-base-cased

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for the "bert-base-cased" checkpoint -- the same checkpoint name
# the model is loaded from further down, so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [4]:
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to the tokenizer's maximum length,
    so all encoded rows come out the same size.

    Args:
        examples: a batch (mapping) that provides a "text" entry.

    Returns:
        The tokenizer's encoding for that batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Apply tokenize_function to every split; batched=True hands it many examples per call.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

# Sampling data to create train & eval dataset

In [5]:
# Keep handles on the complete tokenized splits ...
full_train_dataset = tokenized_datasets["train"]
full_eval_dataset = tokenized_datasets["test"]

# ... and draw a reproducible (seed=42) 1000-example sample from each for quick experiments.
small_train_dataset = full_train_dataset.shuffle(seed=42).select(range(1000))
small_eval_dataset = full_eval_dataset.shuffle(seed=42).select(range(1000))

In [6]:
# Inspect the sampled training set (columns and row count) via its repr.
small_train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
    num_rows: 1000
})

# Defining Model with Pretrained Model & Classifier

In [7]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained "bert-base-cased" weights with a 2-label classification head.
# As the load-time message in this cell's output states, the 'classifier' layer is newly
# initialized, so the model must be trained before its predictions are meaningful.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Print the per-layer parameter counts of the loaded model.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


# Converting the datasets to TensorFlow format

In [9]:
# Drop the raw "text" column from both samples and switch them to the "tensorflow" format.
tf_train_dataset, tf_eval_dataset = (
    split.remove_columns(["text"]).with_format("tensorflow")
    for split in (small_train_dataset, small_eval_dataset)
)

In [10]:
def _features_and_slices(formatted_split):
    """Collect the tokenizer's model inputs from a split and pair them with its labels.

    Returns a tuple ``(features, dataset)`` where ``features`` maps each name in
    ``tokenizer.model_input_names`` to the corresponding column of ``formatted_split``
    and ``dataset`` is an unbatched ``tf.data.Dataset`` of ``(features, label)`` slices.
    """
    inputs = {name: formatted_split[name] for name in tokenizer.model_input_names}
    slices = tf.data.Dataset.from_tensor_slices((inputs, formatted_split["label"]))
    return inputs, slices


# Training data: shuffle with a buffer covering the whole sample, then batch by 8.
train_features, train_tf_dataset = _features_and_slices(tf_train_dataset)
train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

# Evaluation data: same construction, batched by 8 but left in order.
eval_features, eval_tf_dataset = _features_and_slices(tf_eval_dataset)
eval_tf_dataset = eval_tf_dataset.batch(8)

In [15]:
# Show which model inputs were collected into the training feature dict.
# NOTE(review): this cell's execution count (15) is out of sequence with its neighbours;
# it relies on train_features from the cell above -- re-run top to bottom to confirm.
train_features.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# Training

In [11]:
# Write training logs to ./logs so they can be viewed in TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs")

# from_logits=True: the loss treats the model output as unnormalized scores.
# `metrics` is passed as a list, the form documented for Model.compile.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.metrics.SparseCategoricalAccuracy()],
)

# Keep the History object instead of discarding it: the per-epoch loss/accuracy values
# stay available in `history.history`, and the cell no longer ends on an opaque
# `<keras.callbacks.History ...>` repr.
history = model.fit(
    train_tf_dataset,
    validation_data=eval_tf_dataset,
    epochs=3,
    callbacks=[tensorboard_callback],
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd29fcc71d0>

In [12]:
from tensorboard import notebook

# notebook.display() only attaches to a TensorBoard that is already running, and no
# visible cell in this notebook starts one. notebook.start() launches an instance on
# the training log directory (reusing a matching one if it is already up) and shows it
# inline, so this cell also works after Restart & Run All.
notebook.start("--logdir ./logs --port 6006")

<IPython.core.display.Javascript object>

# Simple component built with gradio

In [13]:
! pip install gradio

Collecting gradio
  Downloading gradio-2.4.0-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.3 MB/s 
Collecting flask-cachebuster
  Downloading Flask-CacheBuster-1.0.0.tar.gz (3.1 kB)
Collecting markdown2
  Downloading markdown2-2.4.1-py2.py3-none-any.whl (34 kB)
Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting ffmpy
  Downloading ffmpy-0.3.0.tar.gz (4.8 kB)
Collecting pycryptodome
  Downloading pycryptodome-3.11.0-cp35-abi3-manylinux2010_x86_64.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 16.4 MB/s 
Collecting paramiko
  Downloading paramiko-2.8.0-py2.py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 46.6 MB/s 
[?25hCollecting Flask-Login
  Downloading Flask_Login-0.5.0-py2.py3-none-any.whl (16 kB)
Collecting Flask-Cors>=3.0.8
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting analytics-python
  Downloading analytics_python-1.4.0-py2.py3-none-any.whl (15 k

In [14]:
import gradio
def get_prediction(text):
  """Classify a single piece of text with the fine-tuned model.

  Args:
    text: the raw input string to classify.

  Returns:
    "Positive" when the probability at class index 1 is strictly greater than the
    probability at class index 0, otherwise "Negative" (ties resolve to "Negative").
  """
  # Encode the single input as a batch of one; truncation keeps long inputs within
  # the tokenizer's length limit.
  tokenized_text = tokenizer.encode(text, truncation=True, padding=True, return_tensors="tf")
  prediction = model(tokenized_text)
  prediction_logits = prediction[0]
  prediction_probs = tf.nn.softmax(prediction_logits, axis=1).numpy()

  # The previous nested helper accepted an argument but ignored it and read
  # `prediction_probs` from the enclosing scope; the comparison is now done directly
  # on the value computed above.
  negative_prob, positive_prob = prediction_probs[0][0], prediction_probs[0][1]
  # assumes label 1 means positive sentiment in the training data -- TODO confirm
  return "Positive" if positive_prob > negative_prob else "Negative"

# Wrap get_prediction in a minimal Gradio UI: one text input, one text output.
iface = gradio.Interface(fn=get_prediction, inputs="text", outputs=["text"])
# debug=True keeps this cell running so errors and logs stay visible (see Gradio's
# message in the output); interrupt the kernel to stop it.
iface.launch(debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://58070.gradio.app


KeyboardInterrupt: ignored