# Sentiment Analysis

## Sentiment 140 Dataset

- Developing a sentiment analysis model using Tensorflow that assigns a sentiment by looking at the review of the product
- will be using pre-built embeddings for data dictionary
- will be training a transformer based model
- will be aiming to achieve a higher Precision
- Use TF Serving to deploy the model as an API
- Build a TF Serving client to interact with the API. The client should also be able to continuously accept data entered by the user and provide the sentiment for the review entered by the user.

# Installing Necessary Libraries

In [None]:
! pip install Sentencepiece
! pip install transformers


# Importing Libraries

In [None]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import tensorflow as tf

import transformers
from transformers import (
    AutoTokenizer,
    TFDistilBertModel,
    TFBertModel,
    TFRobertaModel
)
from tensorflow.keras.optimizers import (
    Adam,
    SGD
)

from tensorflow.keras.utils import plot_model

import re

import pickle

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources used by the preprocessing step.
# 'corpus' is not a downloadable NLTK resource id, so it is replaced by
# 'omw-1.4', which some NLTK releases require for WordNetLemmatizer.
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
  nltk.download(resource)

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
# Load the raw tweets. Column names are supplied explicitly through `names`,
# so pandas treats every row of the file as data.
# NOTE(review): the path is blank in this notebook — point it at the dataset
# CSV before running.
data = pd.read_csv(
    "",
    encoding='latin-1',
    names=(
        'target',
        'id',
        'date',
        'flag',
        'user',
        'text'
    )
)
# Displayed as the cell output: (rows, columns).
data.shape

NameError: ignored

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data.isnull().sum()

In [None]:
data['target'].value_counts()

In [None]:
# Keep only the label and the tweet text. The explicit copy makes `data` an
# independent frame, so the assignment below cannot hit pandas'
# chained-assignment (SettingWithCopy) path.
data = data[['target', 'text']].copy()

# Re-map label 4 to 1 so the target is a 0/1 value.
data['target'] = data['target'].replace(4, 1)

In [None]:
ax = data.groupby('target').count().plot(
    kind='bar',
    title='Distribution of Data',
    legend=False
)

ax.set_xticklabels(
    ['Negative', 'Positive'],
    rotation=0
)

In [None]:
data.head()

In [None]:
text, sentiment = list(data['text']), list(data['target'])

# Preprocessing of text

1. LowerCase: Each text is converted into lowercase.
2. Replacing URLs: Links starting with "http" or "https" or "www" are replaced by URL.
3. Replacing Emojis: Replace emojis by using a pre-defined dictionary containing emojis along with their meaning Ex: EMOJIsmile.
4. Replacing Usernames: Replace @Username with the word "USER".
5. Removing Non-Alphabets: Replacing characters except Digits and Alphabets with a space.
6. Removing Consecutive letters: 3 or more consecutive letters are replaced by 2 letters. Ex: "Heyyy" to "Heyy"
7. Removing Short Words: Words with length less than 2 are removed
8. Removing Stopwords: Stopwords are common English words that do not add much meaning to a sentence, so they can be dropped without changing what the sentence says. Ex: the, he, have.
9. Lemmatizing: Lemmatization is the process of converting a word to its base form. Ex: "cats" to "cat".

In [None]:
emojis = {
    ':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad',
    ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
    ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed',
    ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
    '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
    '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink',
    ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'
}

In [None]:
def preprocess(text):
  """Clean an iterable of raw tweets and return one cleaned string per tweet.

  Steps applied to every tweet, in order:
    1. lower-case the text;
    2. replace links (http://, https://, www.) with the token ``URL``;
    3. replace emoticons from the module-level ``emojis`` table with
       ``EMOJI<meaning>``;
    4. replace ``@handle`` mentions with the token ``USER``;
    5. replace every non-alphanumeric character with a space;
    6. collapse runs of 3+ identical characters to 2;
    7. drop one-character words and words found in ``stop_words``;
    8. lemmatize the remaining words.

  Args:
    text: iterable of str (for example a list of tweets).

  Returns:
    list of str, same length and order as ``text``; words are separated by a
    single space with no trailing space.
  """
  lemmatizer = WordNetLemmatizer()

  # Compiled once because every pattern is applied to every tweet.
  # `\bwww\.` also catches a link at the very start of a tweet while still
  # ignoring words that merely end in "www." (e.g. "awww.").
  url_pattern = re.compile(r"(?:https?://|\bwww\.)[^ ]*")
  # `+` so the whole handle is replaced, not just its first character.
  user_pattern = re.compile(r"@[^\s]+")
  alpha_pattern = re.compile(r"[^a-zA-Z0-9]")
  sequence_pattern = re.compile(r"(.)\1\1+")
  seq_replace_pattern = r"\1\1"

  # The tweet is lower-cased before matching, so the keys must be lower-cased
  # too; otherwise entries such as ':P' or ':-D' can never match.
  # Longest keys go first so that 'o:-)' is not consumed by the shorter ':-)'.
  emoji_table = sorted(
      ((symbol.lower(), "EMOJI" + meaning) for symbol, meaning in emojis.items()),
      key=lambda pair: len(pair[0]),
      reverse=True
  )

  processed_text = []

  for tweet in text:
    tweet = tweet.lower()
    tweet = url_pattern.sub(' URL', tweet)
    for symbol, token in emoji_table:
      tweet = tweet.replace(symbol, token)

    tweet = user_pattern.sub(' USER', tweet)
    tweet = alpha_pattern.sub(" ", tweet)
    tweet = sequence_pattern.sub(seq_replace_pattern, tweet)

    # At this point the tweet only holds alphanumerics and spaces, so a plain
    # split is a sufficient tokenizer. The stop-word filter is applied to the
    # words that are actually kept (previously its result was discarded).
    kept_words = [
        lemmatizer.lemmatize(word)
        for word in tweet.split()
        if len(word) > 1 and word not in stop_words
    ]

    processed_text.append(" ".join(kept_words))

  return processed_text

In [None]:
%%time

processed_text = preprocess(text)

In [None]:
data['text'] = processed_text
data['text'].shape

In [None]:
# Fixed seed (the same one used for the train/test split) so the 800k-row
# subsample is identical on every run.
data = data.sample(800000, random_state=42)
data.shape

In [None]:
data.target.value_counts()

# Plotting Word Cloud

## Word Cloud For Negative Tweets

In [None]:
# Pick the negative tweets by their label rather than by row position, so the
# cloud stays correct even if the input file is not sorted by sentiment.
data_neg = [tweet for tweet, label in zip(processed_text, sentiment) if label == 0]
plt.figure(figsize=(20, 20))

wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False
).generate(" ".join(data_neg))

plt.imshow(wc)

## Word Cloud for Positive Tweets

In [None]:
# Pick the positive tweets by their label rather than by row position, so the
# cloud stays correct even if the input file is not sorted by sentiment.
data_pos = [tweet for tweet, label in zip(processed_text, sentiment) if label == 1]
plt.figure(figsize=(20, 20))

wc = WordCloud(
    max_words=1000,
    width=1600,
    height=800,
    collocations=False
).generate(" ".join(data_pos))

plt.imshow(wc)

# Creating Train and Test Data

In [None]:
# First split: hold out 20% as the test set. Features and labels are passed
# as two separate arrays so train_test_split returns the four values that are
# unpacked here (passing the whole frame alone yields only two).
X_train, x_test, Y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    stratify=data['target'],
    test_size=0.2,
    random_state=42
)

# Second split: carve a validation set (20%) out of the remaining training
# data, again stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    stratify=Y_train,
    test_size=0.2,
    random_state=42
)

In [None]:
x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape

# Encoding of Data

1. Tokenizer
2. Batch Encode Plus
3. Encode Plus
4. Encode
5. Tokenize and then get token ids

In [None]:
from transformers import AutoTokenizer

db_model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(db_model_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [3]:
max_seq_len = 512

In [None]:
def encode_text_to_dataset(tokenizer, x_train, x_val, x_test, y_train, y_val, y_test, max_len=None):
  """Tokenize the three text splits and wrap each one in a tf.data.Dataset.

  Every text is padded or truncated to exactly ``max_len`` tokens so the
  encoded arrays are rectangular (required for ``return_tensors='np'``) and
  match the fixed-length ``Input`` layers of the models.

  Args:
    tokenizer: callable tokenizer (as returned by
      ``AutoTokenizer.from_pretrained``).
    x_train, x_val, x_test: text splits exposing ``.to_list()``.
    y_train, y_val, y_test: labels aligned with the matching text split.
    max_len: target sequence length; defaults to the module-level
      ``max_seq_len``.

  Returns:
    Tuple ``(train_dataset, val_dataset, test_dataset)``. Each dataset is
    unbatched and yields ``(features, label)`` where ``features`` is the dict
    of arrays produced by the tokenizer.
  """
  if max_len is None:
    max_len = max_seq_len

  def _to_dataset(texts, labels):
    # Encode one split and pair it with its labels.
    encoded = tokenizer(
        texts.to_list(),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        add_special_tokens=True,
        return_tensors='np'
    )
    return tf.data.Dataset.from_tensor_slices((dict(encoded), labels))

  return (
      _to_dataset(x_train, y_train),
      _to_dataset(x_val, y_val),
      _to_dataset(x_test, y_test)
  )

In [None]:
train_dataset, val_dataset, test_dataset = encode_text_to_dataset(tokenizer, x_train, x_val, x_test, y_train, y_val, y_test)

# Model Building

In [None]:
distilbert_model = TFDistilBertModel.from_pretrained(db_model_name)

In [None]:
distilbert_model.trainable = False

In [None]:
from tensorflow.keras.layers import (
    Dense,
    Input,
    Dropout,
    Conv1D,
    BatchNormalization
)

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [None]:
def db_model(train_dataset, val_dataset, model, max_len, epochs):
  """Build and compile a binary classifier on top of a transformer encoder.

  The first position of the encoder's sequence output is passed through
  BatchNormalization -> Dense(512, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

  Args:
    train_dataset: unused; kept so existing callers keep working.
    val_dataset: unused; kept so existing callers keep working.
    model: encoder called with ``input_ids`` and ``attention_mask``; element 0
      of its output is used as the sequence output.
    max_len: fixed token length of both inputs.
    epochs: unused; training happens outside this function.

  Returns:
    The compiled (untrained) Keras model with inputs ``input_ids`` and
    ``attention_mask`` and a single sigmoid output.
  """
  print("------------------Building the model -------------------------")
  input_ids = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='input_ids'
  )

  attention_mask = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='attention_mask'
  )

  # The mask is passed by keyword so it cannot land on a different positional
  # parameter of the encoder's call signature.
  sequence_output = model(
      input_ids,
      attention_mask=attention_mask
  )[0]

  # Representation at the first sequence position feeds the classifier head.
  cls_token = sequence_output[:, 0, :]
  x = BatchNormalization()(cls_token)
  x = Dense(512, activation='relu')(x)
  x = Dropout(0.5)(x)
  y = Dense(1, activation='sigmoid')(x)

  transformer_model = Model(
      inputs=[input_ids, attention_mask],
      outputs=y,
  )

  transformer_model.summary()
  transformer_model.compile(
      # `learning_rate` replaces the deprecated `lr` keyword.
      Adam(learning_rate=2e-5),
      loss='binary_crossentropy',
      metrics=['accuracy' ,'Precision']
  )

  return transformer_model

In [None]:
epochs = 2
dbert_model = db_model(
    train_dataset,
    val_dataset,
    distilbert_model,
    max_seq_len,
    epochs
)

In [None]:
dbert_image_path = ""
plot_model(
    dbert_model,
    to_file=dbert_image_path,
    expand_nested=True,
    show_shapes=True
)

In [None]:
BATCH_SIZE = 128
# The datasets are batched here, so `batch_size` must not be passed to fit()
# as well: Keras does not accept it for tf.data.Dataset inputs.
dbert_model_history = dbert_model.fit(
    train_dataset.batch(BATCH_SIZE),
    validation_data=val_dataset.batch(BATCH_SIZE),
    epochs=epochs
)

In [None]:
dbert_model_history.history.keys()

# Plotting Accuracy Graph

In [None]:
# fit() returns a History object; the per-epoch values live in `.history`.
history = dbert_model_history.history

# The logged precision key depends on how Keras names the metric (usually
# lower-cased, possibly with a numeric suffix), so it is looked up rather
# than hard-coded.
precision_key = next(
    key for key in history
    if key.lower().startswith('precision')
)

plt.figure(figsize=(8, 12))

# Subplot indices are 1-based; one row per metric.
panels = [('accuracy', 'Accuracy'), (precision_key, 'Precision'), ('loss', 'Loss')]
for position, (metric, title) in enumerate(panels, start=1):
  plt.subplot(len(panels), 1, position)
  plt.plot(history[metric], label='Train')
  plt.plot(history[f'val_{metric}'], label='Validation')
  plt.title(title)
  plt.legend()

plt.tight_layout()


In [None]:
# Predict with the trained DistilBERT classifier; the dataset is unbatched,
# so it is batched here to give the model a leading batch dimension.
dbert_y_pred_logits = dbert_model.predict(test_dataset.batch(BATCH_SIZE))
# The sigmoid head yields one probability per row; flatten before thresholding.
dbert_y_pred = [1 if prob > 0.5 else 0 for prob in dbert_y_pred_logits.ravel()]

In [None]:
def metrics(y_test, y_pred):
    """Print accuracy, precision, recall and F1, then the classification
    report and the confusion matrix, for predictions against true labels.

    Args:
        y_test: true labels.
        y_pred: predicted labels, aligned with ``y_test``.
    """
    # Single-value scores: caption and value are printed on one line.
    scalar_scores = (
        ('Accuracy: ', accuracy_score),
        ('Precision: ', precision_score),
        ('Recall: ', recall_score),
        ('F_Score: ', f1_score),
    )
    for caption, scorer in scalar_scores:
        print(caption, scorer(y_test, y_pred))

    # Multi-line summaries: heading first, then the summary itself.
    summaries = (
        ('Classification Report', classification_report),
        ('Confusion Matrix', confusion_matrix),
    )
    for heading, summarise in summaries:
        print(heading)
        print(summarise(y_test, y_pred))

In [None]:
db_model_save_path = ""
dbert_model.save(db_model_save_path)

## Loading Model

In [None]:
dbert_model = tf.keras.models.load_model(
    db_model_save_path,
    custom_objects={
        'TFDistilBertModel': TFDistilBertModel
    }
)

In [None]:
def make_prediction(model, processed_data):
  """Run ``model.predict`` and turn each score into a sentiment label.

  Args:
    model: object exposing ``predict(processed_data)``.
    processed_data: input forwarded unchanged to ``model.predict``.

  Returns:
    list of str: "Negative" for every score <= 0.5, otherwise "Positive",
    in the order returned by the model.
  """
  scores = model.predict(processed_data)
  return ["Negative" if score <= 0.5 else "Positive" for score in scores]

In [None]:
def bert_encode_input(data, max_len):
  """Encode texts with the module-level ``tokenizer`` into fixed-length arrays.

  Each text is padded or truncated to exactly ``max_len`` tokens, so both
  returned arrays are rectangular.

  Args:
    data: iterable of str.
    max_len: target token length for every encoded text.

  Returns:
    Tuple ``(input_ids, attention_masks)`` of numpy arrays, one row per text.
  """
  input_ids = []
  attention_masks = []

  # Iterate over the values directly instead of indexing by position, which
  # also works for containers whose index is not 0..n-1.
  for sentence in data:
    encoded = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length`;
        # truncation keeps over-long inputs from exceeding max_len.
        padding='max_length',
        truncation=True,
        return_attention_mask=True
    )

    input_ids.append(encoded['input_ids'])
    attention_masks.append(encoded['attention_mask'])

  return np.array(input_ids), np.array(attention_masks)

In [None]:
# First line: how many reviews follow; then one review per line.
n = int(input())

input_text = [input() for _ in range(n)]

input_text

In [None]:
# Clean the user-supplied reviews with the same pipeline used for training.
# One variable name is used throughout (the cell previously assigned
# `preprocess_input` but read `processed_input`).
processed_input = preprocess(input_text)
print(processed_input)

input_ids, attention_masks = bert_encode_input(
    processed_input,
    max_seq_len
)

print(input_ids)

# The model takes its two inputs in the order [input_ids, attention_mask].
result = make_prediction(
    dbert_model,
    [input_ids, attention_masks]
)

print(f"Predicted Sentiment: {result}")

# Deployment

In [None]:
import tempfile
import os

MODEL_DIR = ""
version = 1
export_path = os.path.join(MODEL_DIR, str(version))

print(f"Export Path: {export_path}")


tf.keras.models.save_model(
    dbert_model,
    export_path,
    overwrite=True,
    include_optimizer=True,
    save_format=None,
    signatures=None,
    options=None
)

### Check the model stats using saved model cli

In [None]:
! saved_model_cli show --dir f"{MODEL_DIR}" --all

### Installation of TF Serving on Server

In [None]:
! echo "deb http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | tee /etc/apt/sources.list.d/tensorflow-serving.list && \
curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | apt-key add -
!apt update

In [None]:
! apt-get install tensorflow-model-server

In [None]:
! apt-get upgrade tensorflow-model-server

In [None]:
os.environ['MODEL_DIR'] = MODEL_DIR

### Start the server in BG

In [None]:
%%bash --bg

# Start TF Serving's REST endpoint; stdout and stderr both go to server.log
# (`2>&1`, not `2>%1`, which would write stderr to a file named "%1").
nohup tensorflow_model_server \
  --rest_api_port=8501 \
  --model_name=bert_model \
  --model_base_path="${MODEL_DIR}" >server.log 2>&1

In [None]:
!tail server.log

## Client Side Implementation

In [None]:
import json

# Request body for the TF Serving REST predict API. The keys under "inputs"
# must match the model's input layer names: 'input_ids' and 'attention_mask'.
data = json.dumps({
    "signature_name": "serving_default",
    "inputs": {
        'input_ids': input_ids.tolist(),
        'attention_mask': attention_masks.tolist()
    }
})

print(f"Data: {data} ... {len(data)}")

In [None]:
! pip install -q requests

In [None]:
import requests

headers = {'content-type': "application/json"}

json_response = requests.post(
    "http://localhost:8501/v1/models/bert_model:predict",
    data=data,
    headers=headers
)

predictions = json.loads(json_response.text)

print(predictions)

class_names = {
    0: 'Negative',
    1: 'Positive'
}

print(round(predictions['outputs'][0][0]))

print(f"Sentiment Class of given Sentence is {class_names[round(predictions['outputs'][0][0])]}")


In [None]:
import requests

headers = {"content-type": "application/json"}

json_response = requests.post(
    "http://localhost:8501/v1/models/bert_model:predict",
    data=data,
    headers=headers
)
# Fail loudly on an HTTP error instead of indexing into an error payload.
json_response.raise_for_status()

predictions = json.loads(json_response.text)

print(predictions)

class_names = {
    0: "Negatives",
    1: 'Positives'
}

print(round(predictions['outputs'][0][0]))

# One output row per submitted sentence, in the same order.
for sentence, output in zip(input_text, predictions['outputs']):
  print(f"Sentiment Class of {sentence} Sentence is: {class_names[round(output[0])]}")

# BERT Model

In [None]:
bert_model_name = 'bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

## Encoding BERT

In [None]:
bert_model = TFBertModel.from_pretrained(bert_model_name)
bert_model.trainable = False

In [None]:
def bert_model_init(train_dataset, val_dataset, model, max_len, epochs):
  """Build and compile a binary classifier on top of a BERT encoder.

  The first position of the encoder's sequence output is passed through
  BatchNormalization -> Dense(512, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

  Args:
    train_dataset: unused; kept so existing callers keep working.
    val_dataset: unused; kept so existing callers keep working.
    model: encoder called with ``input_ids`` and ``attention_mask``; element 0
      of its output is used as the sequence output.
    max_len: fixed token length of both inputs.
    epochs: unused; training happens outside this function.

  Returns:
    The compiled (untrained) Keras model with inputs ``input_ids`` and
    ``attention_mask`` and a single sigmoid output.
  """
  print("------------------Building the model -------------------------")
  input_ids = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='input_ids'
  )

  attention_mask = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='attention_mask'
  )

  # The mask is passed by keyword so it cannot land on a different positional
  # parameter of the encoder's call signature.
  sequence_output = model(
      input_ids,
      attention_mask=attention_mask
  )[0]

  # Representation at the first sequence position feeds the classifier head.
  cls_token = sequence_output[:, 0, :]
  x = BatchNormalization()(cls_token)
  x = Dense(512, activation='relu')(x)
  x = Dropout(0.5)(x)
  y = Dense(1, activation='sigmoid')(x)

  transformer_model = Model(
      inputs=[input_ids, attention_mask],
      outputs=y,
  )

  transformer_model.summary()
  transformer_model.compile(
      # `learning_rate` replaces the deprecated `lr` keyword.
      Adam(learning_rate=1e-3),
      loss='binary_crossentropy',
      metrics=['accuracy' ,'Precision']
  )

  return transformer_model

In [None]:
train_bert_dataset, val_bert_dataset, test_bert_dataset = encode_text_to_dataset(bert_tokenizer, x_train, x_val, x_test, y_train, y_val, y_test)

In [None]:
epochs = 2
# bert_model_init takes five positional arguments; `epochs` was missing.
model_bert = bert_model_init(
    train_bert_dataset,
    val_bert_dataset,
    bert_model,
    max_seq_len,
    epochs
)

In [None]:
bert_img_path = ""
plot_model(
    model_bert,
    to_file=bert_img_path,
    expand_nested=True,
    show_shapes=True
)

In [None]:
BATCH_SIZE = 128
# The datasets are batched here, so `batch_size` must not be passed to fit()
# as well: Keras does not accept it for tf.data.Dataset inputs.
model_bert_history = model_bert.fit(
    train_bert_dataset.batch(BATCH_SIZE),
    validation_data=val_bert_dataset.batch(BATCH_SIZE),
    epochs=epochs
)

In [None]:
# fit() returns a History object; the per-epoch values live in `.history`.
history = model_bert_history.history

# The logged precision key depends on how Keras names the metric (usually
# lower-cased, possibly with a numeric suffix), so it is looked up rather
# than hard-coded.
precision_key = next(
    key for key in history
    if key.lower().startswith('precision')
)

plt.figure(figsize=(8, 12))

# Subplot indices are 1-based; one row per metric.
panels = [('accuracy', 'Accuracy'), (precision_key, 'Precision'), ('loss', 'Loss')]
for position, (metric, title) in enumerate(panels, start=1):
  plt.subplot(len(panels), 1, position)
  plt.plot(history[metric], label='Train')
  plt.plot(history[f'val_{metric}'], label='Validation')
  plt.title(title)
  plt.legend()

plt.tight_layout()

In [None]:
# Use the test set encoded with the BERT tokenizer (not the DistilBERT one),
# batched so the model receives a leading batch dimension.
bert_y_pred_logits = model_bert.predict(test_bert_dataset.batch(BATCH_SIZE))
# The sigmoid head yields one probability per row; flatten before thresholding.
bert_y_pred = [1 if prob > 0.5 else 0 for prob in bert_y_pred_logits.ravel()]

# Roberta

In [None]:
roberta_model_name = 'roberta-base'
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)

In [None]:
roberta_model = TFRobertaModel.from_pretrained(roberta_model_name)
# Freeze the encoder, as is done for the DistilBERT and BERT runs, so only
# the classification head is trained.
roberta_model.trainable = False

In [None]:
train_roberta_dataset, val_roberta_dataset, test_roberta_dataset = encode_text_to_dataset(roberta_tokenizer, x_train, x_val, x_test, y_train, y_val, y_test)

In [None]:
def roberta_model_init(train_dataset, val_dataset, model, max_len, epochs):
  """Build and compile a binary classifier on top of a RoBERTa encoder.

  The first position of the encoder's sequence output is passed through
  BatchNormalization -> Dense(512, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

  Args:
    train_dataset: unused; kept so existing callers keep working.
    val_dataset: unused; kept so existing callers keep working.
    model: encoder called with ``input_ids`` and ``attention_mask``; element 0
      of its output is used as the sequence output.
    max_len: fixed token length of both inputs.
    epochs: unused; training happens outside this function.

  Returns:
    The compiled (untrained) Keras model with inputs ``input_ids`` and
    ``attention_mask`` and a single sigmoid output.
  """
  print("------------------Building the model -------------------------")
  input_ids = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='input_ids'
  )

  attention_mask = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='attention_mask'
  )

  # The mask is passed by keyword so it cannot land on a different positional
  # parameter of the encoder's call signature.
  sequence_output = model(
      input_ids,
      attention_mask=attention_mask
  )[0]

  # Representation at the first sequence position feeds the classifier head.
  cls_token = sequence_output[:, 0, :]
  x = BatchNormalization()(cls_token)
  x = Dense(512, activation='relu')(x)
  x = Dropout(0.5)(x)
  y = Dense(1, activation='sigmoid')(x)

  transformer_model = Model(
      inputs=[input_ids, attention_mask],
      outputs=y,
  )

  transformer_model.summary()
  transformer_model.compile(
      # `learning_rate` replaces the deprecated `lr` keyword.
      Adam(learning_rate=1e-3),
      loss='binary_crossentropy',
      metrics=['accuracy' ,'Precision']
  )

  return transformer_model

In [None]:
epochs = 2
# Arguments follow roberta_model_init's signature:
# (train_dataset, val_dataset, model, max_len, epochs). The test dataset is
# not a parameter and previously pushed the encoder out of the `model` slot.
model_roberta = roberta_model_init(
    train_roberta_dataset,
    val_roberta_dataset,
    roberta_model,
    max_seq_len,
    epochs
)

In [None]:
roberta_img_path = ""
plot_model(
    model_roberta,
    to_file=roberta_img_path,
    expand_nested=True,
    show_shapes=True
)

In [None]:
BATCH_SIZE = 128
# The datasets are batched here, so `batch_size` must not be passed to fit()
# as well: Keras does not accept it for tf.data.Dataset inputs.
model_roberta_history = model_roberta.fit(
    train_roberta_dataset.batch(BATCH_SIZE),
    validation_data=val_roberta_dataset.batch(BATCH_SIZE),
    epochs=epochs
)

In [None]:
# fit() returns a History object; the per-epoch values live in `.history`.
history = model_roberta_history.history

# The logged precision key depends on how Keras names the metric (usually
# lower-cased, possibly with a numeric suffix), so it is looked up rather
# than hard-coded.
precision_key = next(
    key for key in history
    if key.lower().startswith('precision')
)

plt.figure(figsize=(8, 12))

# Subplot indices are 1-based; one row per metric.
panels = [('accuracy', 'Accuracy'), (precision_key, 'Precision'), ('loss', 'Loss')]
for position, (metric, title) in enumerate(panels, start=1):
  plt.subplot(len(panels), 1, position)
  plt.plot(history[metric], label='Train')
  plt.plot(history[f'val_{metric}'], label='Validation')
  plt.title(title)
  plt.legend()

plt.tight_layout()

In [None]:
# Use the test set encoded with the RoBERTa tokenizer (not the DistilBERT
# one), batched so the model receives a leading batch dimension.
roberta_y_pred_logits = model_roberta.predict(test_roberta_dataset.batch(BATCH_SIZE))
# The sigmoid head yields one probability per row; flatten before thresholding.
roberta_y_pred = [1 if prob > 0.5 else 0 for prob in roberta_y_pred_logits.ravel()]

# XLNET

In [None]:
xlnet_model_name = "xlnet-base-cased"
xlnet_tokenizer = AutoTokenizer.from_pretrained(xlnet_model_name)

In [None]:
# TFXLNetModel is not part of the notebook's top-level transformers import,
# so it is imported here before use.
from transformers import TFXLNetModel

xlnet_model = TFXLNetModel.from_pretrained(xlnet_model_name)
# Freeze the encoder, as is done for the DistilBERT and BERT runs, so only
# the classification head is trained.
xlnet_model.trainable = False

In [None]:
train_xlnet_dataset, val_xlnet_dataset, test_xlnet_dataset = encode_text_to_dataset(xlnet_tokenizer, x_train, x_val, x_test, y_train, y_val, y_test)

In [None]:
def xlnet_model_init(train_dataset, val_dataset, model, max_len, epochs):
  """Build and compile a binary classifier on top of an XLNet encoder.

  The last position of the encoder's sequence output is passed through
  BatchNormalization -> Dense(512, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

  Args:
    train_dataset: unused; kept so existing callers keep working.
    val_dataset: unused; kept so existing callers keep working.
    model: encoder called with ``input_ids`` and ``attention_mask``; element 0
      of its output is used as the sequence output.
    max_len: fixed token length of both inputs.
    epochs: unused; training happens outside this function.

  Returns:
    The compiled (untrained) Keras model with inputs ``input_ids`` and
    ``attention_mask`` and a single sigmoid output.
  """
  print("------------------Building the model -------------------------")
  input_ids = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='input_ids'
  )

  attention_mask = Input(
      shape=(max_len, ),
      dtype=tf.int32,
      name='attention_mask'
  )

  # The mask is passed by keyword so it cannot land on a different positional
  # parameter of the encoder's call signature.
  sequence_output = model(
      input_ids,
      attention_mask=attention_mask
  )[0]

  # XLNet appends its classification token at the END of the sequence and its
  # tokenizer pads on the left, so the summary vector is the last position,
  # not the first. NOTE(review): assumes inputs were encoded with the XLNet
  # tokenizer's default padding side — confirm if that is overridden.
  cls_token = sequence_output[:, -1, :]
  x = BatchNormalization()(cls_token)
  x = Dense(512, activation='relu')(x)
  x = Dropout(0.5)(x)
  y = Dense(1, activation='sigmoid')(x)

  transformer_model = Model(
      inputs=[input_ids, attention_mask],
      outputs=y,
  )

  transformer_model.summary()
  transformer_model.compile(
      # `learning_rate` replaces the deprecated `lr` keyword.
      Adam(learning_rate=1e-3),
      loss='binary_crossentropy',
      metrics=['accuracy' ,'Precision']
  )

  return transformer_model

In [None]:
epochs = 2
# Arguments follow xlnet_model_init's signature:
# (train_dataset, val_dataset, model, max_len, epochs). The encoder and
# `epochs` were missing and the test dataset was passed in the `model` slot.
model_xlnet = xlnet_model_init(
    train_xlnet_dataset,
    val_xlnet_dataset,
    xlnet_model,
    max_seq_len,
    epochs
)

In [None]:
xlnet_img_path = ""
plot_model(
    model_xlnet,
    to_file=xlnet_img_path,
    expand_nested=True,
    show_shapes=True
)

In [None]:
BATCH_SIZE = 128
# The datasets are batched here, so `batch_size` must not be passed to fit()
# as well: Keras does not accept it for tf.data.Dataset inputs.
model_xlnet_history = model_xlnet.fit(
    train_xlnet_dataset.batch(BATCH_SIZE),
    validation_data=val_xlnet_dataset.batch(BATCH_SIZE),
    epochs=epochs
)

In [None]:
# fit() returns a History object; the per-epoch values live in `.history`.
history = model_xlnet_history.history

# The logged precision key depends on how Keras names the metric (usually
# lower-cased, possibly with a numeric suffix), so it is looked up rather
# than hard-coded.
precision_key = next(
    key for key in history
    if key.lower().startswith('precision')
)

plt.figure(figsize=(8, 12))

# Subplot indices are 1-based; one row per metric.
panels = [('accuracy', 'Accuracy'), (precision_key, 'Precision'), ('loss', 'Loss')]
for position, (metric, title) in enumerate(panels, start=1):
  plt.subplot(len(panels), 1, position)
  plt.plot(history[metric], label='Train')
  plt.plot(history[f'val_{metric}'], label='Validation')
  plt.title(title)
  plt.legend()

plt.tight_layout()

In [None]:
# Use the test set encoded with the XLNet tokenizer (not the DistilBERT one),
# batched so the model receives a leading batch dimension.
xlnet_y_pred_logits = model_xlnet.predict(test_xlnet_dataset.batch(BATCH_SIZE))
# The sigmoid head yields one probability per row; flatten before thresholding.
xlnet_y_pred = [1 if prob > 0.5 else 0 for prob in xlnet_y_pred_logits.ravel()]