In [1]:
import tensorflow as tf
import os
!pip install gradio
!pip install transformers
!pip install tensorflow_datasets

Collecting gradio
  Downloading gradio-4.44.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from g

In [2]:
# Restrict the process to the first GPU. This must be set before TensorFlow
# first touches the GPU: the variable is read when the CUDA driver is
# initialised, so assigning it after the device query/session has no effect.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

# Fail fast when the runtime has no GPU attached
num_gpus_available = len(tf.config.experimental.list_physical_devices('GPU'))
print("Num GPUs Available: ", num_gpus_available)
assert num_gpus_available > 0, "No GPU detected - switch to a GPU runtime before running this notebook"

# Creating a session with log_device_placement=True makes TensorFlow print the device mapping
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

Num GPUs Available:  1
Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5



In [None]:
import pandas as pd
import numpy as np
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
# Fetch the train split of the Amazon mobile-electronics reviews via TFDS
dataset = tfds.load(
    'amazon_us_reviews/Mobile_Electronics_v1_00',
    split='train',
    shuffle_files=True,
)

In [None]:
# Materialise the TFDS dataset as a pandas DataFrame and preview the first rows
dataset = tfds.as_dataframe(dataset)
dataset.head(5)

In [None]:
# Keep only the review text and the star rating
columns_of_interest = ['data/review_body', 'data/star_rating']
data = dataset.loc[:, columns_of_interest]
data.head(5)

In [None]:
# Report how many rows the selection contains
print("Total number of datapoints:", len(data))

In [None]:
# Count the distinct star ratings (the ratings are converted to a sentiment label in a later step)
print(
    "Number of classes:",
    data['data/star_rating'].nunique(dropna=True),
)

In [None]:
# Show how many reviews fall under each star rating
print(
    "Number of samples per class: \n",
    data['data/star_rating'].value_counts(),
)

In [None]:
# Average number of whitespace-separated words per review
print(
    "Number of words per sample: ",
    data['data/review_body'].map(lambda review: len(review.split())).mean(),
)

In [None]:
# Get the distribution of number of words per sample.
# Each review is split on whitespace so that words are counted; plain len(s)
# would count bytes/characters, which is not what the label below announces.
print("Distribution of number of words per sample:")
pd.DataFrame([len(s.split()) for s in data['data/review_body']]).value_counts()

In [None]:
# Plot the distribution of sample lengths
def plot_sample_length_distribution(sample_texts):
    """Draws a 50-bin histogram of the length of each sample.

    # Arguments
        sample_texts: list, sample texts.
    """
    lengths = list(map(len, sample_texts))

    plt.hist(lengths, 50)
    plt.title('Sample length distribution')
    plt.xlabel('Length of a sample')
    plt.ylabel('Number of samples')
    plt.show()

plot_sample_length_distribution(data['data/review_body'].to_list())

In [None]:
# Take a sample of 30% of the dataset for the purpose of demonstration.
# The sample is drawn from the original column selection instead of from
# `data` itself, so re-running this cell does not shrink the sample again.
data = dataset[columns_of_interest].sample(frac=0.3, random_state=42)
data.head()

In [None]:
print("Number of datapoints after sampling:", data.shape)

In [None]:
# Derive a sentiment label from the star rating: 3 stars and above count as positive
is_positive = data['data/star_rating'] >= 3
data['sentiment'] = is_positive.map({True: "positive", False: "negative"})
data.head()

In [None]:
# Encode the target label as an integer (positive -> 1, negative -> 0)
label_ids = {'positive': 1, 'negative': 0}
data['sentiment'] = data['sentiment'].map(label_ids)

# The rating column is no longer needed once the label exists
data = data.drop(columns=['data/star_rating'])
data.head()

In [None]:
# Preprocess the reviews: give the text column a short name, decode it from
# UTF-8 bytes and lowercase it.
data = data.rename(columns={'data/review_body': 'review'})
# The vectorised .str accessor leaves missing values as NaN, whereas
# .apply(str.lower) raises a TypeError on anything that is not a str.
data['review'] = data['review'].str.decode('utf-8').str.lower()

data.head()

In [None]:
# Count how many reviews carry each sentiment label
data.loc[:, 'sentiment'].value_counts()

In [None]:
# Balance out the classes for the purpose of demonstration by downsampling the
# positive reviews to the number of negative reviews.
negative = data[data['sentiment'] == 0]
positive_pool = data[data['sentiment'] == 1]

# The sample size is derived from the data (it used to be a hard-coded 7425),
# so the classes stay balanced if the upstream sample changes, and sample()
# cannot fail by asking for more rows than are available.
positive = positive_pool.sample(n=min(len(negative), len(positive_pool)), random_state=42)

# Shuffle with a fixed seed so the row order is reproducible across runs
train_data = pd.concat([positive, negative]).sample(frac=1, random_state=42).reset_index(drop=True)
train_data['sentiment'].value_counts()

In [None]:
# Pull the reviews and their labels out as plain Python lists
reviews, labels = (train_data[column].to_list() for column in ('review', 'sentiment'))

# Show a handful of reviews next to their labels
preview = slice(10, 16)
print("Sample reviews:")
print(reviews[preview])
print("Corresponding labels:")
print(labels[preview])

In [None]:
# Hold out 20% of the reviews as a test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=.2, random_state=42
)

# Report the size of every split
for description, split in (
    ("Number of train data points:", X_train),
    ("Number of train labels:", y_train),
    ("Number of test data points:", X_test),
    ("Number of test labels:", y_test),
):
    print(description, len(split))

In [None]:
# Load the fast DistilBERT tokenizer for the MODEL_NAME checkpoint
from transformers import DistilBertTokenizerFast

MODEL_NAME = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Tokenize the first training review as an example
first_review = X_train[0]
print("Sentence:" , first_review)
tokenizer(first_review, return_tensors="tf", max_length=256, padding=True, truncation=True)

In [None]:
# Tokenize the train and test reviews with identical settings
MAX_LENGTH = 256
train_encodings, test_encodings = (
    tokenizer(texts,
              truncation=True,
              padding=True,
              return_tensors="tf",
              max_length=MAX_LENGTH)
    for texts in (X_train, X_test)
)

# Pair each set of encodings with its labels in a tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test))

In [None]:
# Build the classifier from the pretrained DistilBERT checkpoint with two output labels
from transformers import TFDistilBertForSequenceClassification

model = TFDistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

In [None]:
# Loss: sparse categorical cross-entropy computed on the raw logits
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Optimizer: Adam with a learning rate of 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Compile with accuracy as the tracked metric, then print the architecture
model.compile(
    optimizer=optimizer,
    loss=loss_func,
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Define hyperparameters
BATCH_SIZE = 16
NUM_EPOCHS = 2

# Train the model.
# Batching is done on the tf.data pipeline, so `batch_size` is not passed to
# fit() as well: Keras documents that it must not be specified for dataset
# inputs. The validation data is not shuffled because row order does not
# affect the validation metrics.
model.fit(train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE),
          epochs=NUM_EPOCHS,
          validation_data=test_dataset.batch(BATCH_SIZE)
          )

In [None]:
# Save the model
model.save_pretrained("/tmp/sentiment_model")

# Preprocess the evaluation data.
# NOTE(review): these rows are sampled from `data`, the same frame that
# `train_data` was built from, so part of this "unseen" set was probably used
# for training and the metrics below are likely optimistic - confirm, and
# carve out a real hold-out set if an unbiased estimate is needed.
unseen_data = data.sample(frac=0.05, random_state=42)
unseen_reviews = unseen_data['review'].tolist()
y_unseen = unseen_data['sentiment'].tolist()

unseen_encodings = tokenizer(unseen_reviews,
                             padding=True,
                             truncation=True,
                             max_length=MAX_LENGTH,
                             return_tensors="tf")

# Keep the tf.data.Dataset under its own name instead of overwriting the encodings
unseen_dataset = tf.data.Dataset.from_tensor_slices((dict(unseen_encodings),
                                                     y_unseen))

# Evaluate the model.
# The dataset is already batched, so `batch_size` is not passed to evaluate()
# as well; shuffling is skipped because it does not change the metrics.
model.evaluate(unseen_dataset.batch(BATCH_SIZE),
               return_dict=True)

In [None]:
import gradio as gr

_SENTIMENT_MODEL_DIR = "/tmp/sentiment_model"
_sentiment_model_cache = {}


def _get_sentiment_model(model_dir=_SENTIMENT_MODEL_DIR):
    """Return the model saved at `model_dir`, loading it from disk on first use only."""
    if model_dir not in _sentiment_model_cache:
        _sentiment_model_cache[model_dir] = (
            TFDistilBertForSequenceClassification.from_pretrained(model_dir)
        )
    return _sentiment_model_cache[model_dir]


def predict(sentence):
    """Classify the sentiment of a sentence with the saved model.

    # Arguments
        sentence: str, the text to classify.

    # Returns
        'positive' or 'negative'; an empty string when the input is empty
        or contains only whitespace.
    """
    # The interface runs with live=True, so this is called on every edit,
    # including when the text box is empty - there is nothing to classify then.
    if not sentence or not sentence.strip():
        return ""

    # Tokenize with the same maximum length that was used for training
    predict_input = tokenizer.encode(sentence,
                                     truncation=True,
                                     padding=True,
                                     max_length=MAX_LENGTH,
                                     return_tensors="tf")

    # Reuse the cached model instead of reloading it from disk on every call
    tf_output = _get_sentiment_model().predict(predict_input)[0]
    tf_prediction = tf.nn.softmax(tf_output, axis=1).numpy()[0]
    # Index 0 is the negative class, index 1 the positive class
    return ['negative', 'positive'][np.argmax(tf_prediction)]

# Expose `predict` as a live text-in / text-out Gradio demo and start it
demo = gr.Interface(
    fn=predict,
    inputs="text",
    outputs="text",
    live=True,
)

demo.launch()
Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://17805.gradio.app
This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)