<a href="https://colab.research.google.com/github/Rishu-N/TENSORFLOW/blob/main/NLP_TWEET_SENTIMENT_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive

In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
cols = ["sentiment","id","data","query","user","text"]
train_Data = pd.read_csv("/content/drive/MyDrive/TENSORFLOW/NLP/trainingandtestdata/training.1600000.processed.noemoticon.csv",header=None,names=cols,
                         engine="python",encoding="latin1")
test_Data = pd.read_csv("/content/drive/MyDrive/TENSORFLOW/NLP/trainingandtestdata/testdata.manual.2009.06.14.csv",header=None,names=cols,
                         engine="python",encoding="latin1")

In [5]:
train_Data.head()

Unnamed: 0,sentiment,id,data,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
test_Data.head()

Unnamed: 0,sentiment,id,data,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...


In [7]:
data = train_Data
data.drop(['id','data','query','user'],axis=1,inplace=True)

In [8]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [9]:
def clean_tweet(tweet):
  tweet = BeautifulSoup(tweet, "lxml").get_text()
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) # replace all mentions with blankspace
  tweet = re.sub(r"https?://[A-Za-z0-9./]"," ",tweet) # replace urls
  tweet = re.sub(r"[^a-zA-Z.!?]"," ",tweet) #anything inside a square bracket that is not
  tweet = re.sub(r" +"," ",tweet) #removing multiple whitespaces
  return tweet

In [10]:
data_clean = [clean_tweet(tweet) for tweet in data.text]


  tweet = BeautifulSoup(tweet, "lxml").get_text()


In [11]:
data_labels = data.sentiment.values
set(data_labels)

{0, 4}

In [12]:
data_labels[data_labels == 4] = 1
set(data_labels)

{0, 1}

In [13]:
# TOKENIZATION

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    data_clean,target_vocab_size=2**16
)

data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [14]:
len(data_inputs),len(data_inputs[0]),

(1600000, 25)

In [15]:
# PADDING
MAX_LEN = max([len(sentences) for sentences in data_inputs])
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs, value=0,padding="post",maxlen=MAX_LEN)
MAX_LEN

74

In [16]:
# splitting into test and train
test_idx = np.random.randint(0,800000, 8000)
test_idx = np.concatenate((test_idx,test_idx+800000))

In [17]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]
train_inputs = np.delete(data_inputs, test_idx, axis=0)
train_labels = np.delete(data_labels, test_idx)

In [18]:
# BUILD MODEL

class DCNN(tf.keras.Model):

  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training = False,
               name="dcnn"):
    super(DCNN,self).__init__(name=name)

    self.embedding = layers.Embedding(vocab_size,emb_dim)
    self.bigram = layers.Conv1D(filters = nb_filters, kernel_size=2,padding="valid",activation="relu")
    self.pool_1 = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters = nb_filters, kernel_size=3,padding="valid",activation="relu")
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters = nb_filters, kernel_size=4,padding="valid",activation="relu")
    self.pool_3 = layers.GlobalMaxPool1D()

    self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
    self.dropout = layers.Dropout(rate=dropout_rate)

    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1,activation = "sigmoid")
    else:
      self.last_dense = layers.Dense(units = nb_classes, activation="softmax")

  def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

In [19]:
# GLOBAL VARIABLE
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(train_labels))

DROPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 5

In [20]:
Dcnn = DCNN(vocab_size = VOCAB_SIZE,
            emb_dim = EMB_DIM,
            nb_filters = NB_FILTERS,
            FFN_units = FFN_UNITS,
            nb_classes = NB_CLASSES,
            dropout_rate = DROPOUT_RATE)

In [21]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [22]:
checkpoint_path = "/content/drive/MyDrive/TENSORFLOW/NLP/trainingandtestdata/ckpt"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 1)

if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest checkpoint restored")

In [23]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS,
         validation_data = (test_inputs,test_labels))
#ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fbf0895e680>

In [24]:
results = Dcnn.evaluate(test_inputs, test_labels, batch_size=BATCH_SIZE)
print(results)

[0.5138245224952698, 0.8123124837875366]


In [28]:
def op(x):
  print(Dcnn(np.array([tokenizer.encode(x)]), training=False).numpy())

In [30]:
Dcnn(np.array([tokenizer.encode("You are so nice")]), training=False).numpy()

array([[0.6413028]], dtype=float32)

In [31]:
op("You are so nice")

[[0.6413028]]


In [32]:
op("I love you")

[[nan]]


In [33]:
op("i HATE YOU")

[[nan]]


In [34]:
op("I wish I never have to do this again")

[[0.02236004]]


In [37]:
op("I c Good morning")

[[0.41684636]]
