#### Author: Satwik Ram K

## Twitter Sentiment Analysis

### Connecting to Kaggle to download the dataset

In [None]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json


Saving kaggle.json to kaggle.json


### Downloading the dataset

In [None]:
! kaggle datasets download -d imrandude/twitter-sentiment-analysis

Downloading twitter-sentiment-analysis.zip to /content
  0% 0.00/3.67M [00:00<?, ?B/s]
100% 3.67M/3.67M [00:00<00:00, 123MB/s]


#### Unzipping the dataset

In [None]:
! unzip /content/twitter-sentiment-analysis.zip

Archive:  /content/twitter-sentiment-analysis.zip
  inflating: train.csv               


### Importing Dependencies

In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
from tensorflow import keras
import math 
from bs4 import BeautifulSoup

In [None]:
tf.__version__

'2.2.0'

## Loading Dataset

In [None]:
dataset = pd.read_csv('/content/train.csv', engine = 'python', encoding = 'latin1')

#### Dataset info
SentimentText - text of the tweet

0 - negative

1 - positive

In [None]:
dataset.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [None]:
dataset['SentimentText'][3]

"          .. Omgaga. Im sooo  im gunna CRy. I've been at this dentist since 11.. I was suposed 2 just get a crown put on (30mins)..."

In [None]:
# Reassign instead of mutating in place, and tolerate an already-removed column,
# so re-running this cell does not raise KeyError once 'ItemID' is gone.
dataset = dataset.drop(columns = 'ItemID', errors = 'ignore')

In [None]:
dataset.tail()

Unnamed: 0,Sentiment,SentimentText
99984,0,@Cupcake seems like a repeating problem hop...
99985,1,@cupcake__ arrrr we both replied to each other...
99986,0,@CuPcAkE_2120 ya i thought so
99987,1,@Cupcake_Dollie Yes. Yes. I'm glad you had mor...
99988,1,@cupcake_kayla haha yes you do


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99989 entries, 0 to 99988
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Sentiment      99989 non-null  int64 
 1   SentimentText  99989 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.5+ MB


In [None]:
dataset.isna().sum()

Sentiment        0
SentimentText    0
dtype: int64

In [None]:
dataset['Sentiment'].unique()

array([0, 1])

### Cleaning the Tweets using Regular Expression

In [None]:
def clean_tweet(tweet):
  """Normalise a raw tweet into space-separated alphabetic words.

  Steps, in order:
    1. Strip HTML markup / decode entities with BeautifulSoup.
    2. Remove @mentions (letters, digits and underscores after the '@').
    3. Remove http/https URLs up to the next whitespace.
    4. Replace every remaining non-letter character with a space.
    5. Collapse runs of spaces and trim both ends.

  Every removal substitutes a single space rather than the empty string so
  neighbouring words are never glued together.

  Args:
    tweet: raw tweet text (str).

  Returns:
    The cleaned text; words are separated by exactly one space and the
    result has no leading or trailing space.
  """
  tweet = BeautifulSoup(tweet, 'lxml').get_text()
  # Include '_' so handles such as @Cupcake_Dollie are removed in full.
  tweet = re.sub(r"@[A-Za-z0-9_]+", " ", tweet)
  # \S+ consumes the whole URL; without a quantifier only one character
  # after "://" would be removed.
  tweet = re.sub(r"https?://\S+", " ", tweet)
  tweet = re.sub(r"[^A-Za-z]", " ", tweet)
  tweet = re.sub(r" +", " ", tweet)
  return tweet.strip()


In [None]:
data_clean = [clean_tweet(tweet) for tweet in dataset.SentimentText]

In [None]:
data_clean[0]

'issosadformyAPLfriend'

In [None]:
data_labels = dataset['Sentiment'].values

In [None]:
set(data_labels)

{0, 1}

## Tokenization

In [None]:
import tensorflow_datasets as tfds
# Build a subword vocabulary from the cleaned tweets; target_vocab_size is the
# requested vocabulary size passed to build_from_corpus.
# NOTE(review): newer tensorflow_datasets releases appear to expose this class
# under tfds.deprecated.text - confirm against the installed version.
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    data_clean,
    target_vocab_size = 2**17
)

In [None]:
tokenizer.save_to_file('tokenizer')


## Load the saved tokenizer

In [None]:
# Load
encoder = tfds.features.text.SubwordTextEncoder.load_from_file('/content/tokenizer')

In [None]:
data_input = [encoder.encode(sentence) for sentence in data_clean]


In [None]:
data_input[0]

[26721, 125249, 18702, 207]

## Padding

In [None]:
# Length of the longest encoded tweet; used below as the padding target.
MAX_LEN = max(map(len, data_input))


In [None]:
MAX_LEN

98

In [None]:
data_input = tf.keras.preprocessing.sequence.pad_sequences(
    data_input, value = 0, padding = 'post', maxlen = MAX_LEN 
)

In [None]:
data_input[0]

array([ 26721, 125249,  18702,    207,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0], dtype=int32)

## Splitting into train and test

In [None]:
len(data_input)

99989

In [None]:
len(data_labels)

99989

In [None]:
X = []
y = []

In [None]:
# Rebuild both lists from scratch rather than appending to the existing ones:
# appending made this cell non-idempotent, so re-running it duplicated every
# sample in X and y. Iterating the arrays yields the same per-row elements
# the index loop appended.
X = list(data_input)
y = list(data_labels)


In [None]:
print(X[0],"\n")
print("Sentiment is:",y[0])

[ 26721 125249  18702    207      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0] 

Sentiment is: 0


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3787)

In [None]:
print(X_train[0])
y_train[0]

[ 82287   5090 139207   2772  53018   6410  10393  25181 133846  61231
  16524    340      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0]


1

In [None]:
# Convert each of the four splits to a NumPy array.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

## Building the Model!

In [None]:
class DCNN(tf.keras.Model):
  """CNN text classifier with three parallel convolution widths.

  Token ids are embedded, then fed to three independent Conv1D branches with
  kernel sizes 2, 3 and 4 (bigram, trigram, four-gram). Each branch is reduced
  with global max pooling, the three results are concatenated and passed
  through a dense layer, dropout and the output layer.

  Args:
    vocab_size: number of distinct token ids (input dimension of the embedding).
    dropout_rate: dropout rate applied after the hidden dense layer.
    training: unused; kept only so existing callers that pass it keep working.
      The training/inference mode is taken from the `training` argument of
      `call`.
    nb_classes: number of target classes. 2 gives a single sigmoid unit,
      anything else gives a softmax over `nb_classes` units.
    FFN_units: width of the hidden dense layer.
    nb_filters: number of filters in each convolution branch.
    emb_dim: embedding dimension.
    name: Keras model name.

  Note:
    The convolutions use 'valid' padding, so input sequences must contain at
    least 4 positions (the widest kernel).
  """

  def __init__(self, vocab_size, dropout_rate = 0.1, training = False, nb_classes = 2,
               FFN_units = 512, nb_filters = 50, emb_dim = 128, name = 'dcnn'):

    super(DCNN, self).__init__(name = name)

    self.embedding = tf.keras.layers.Embedding(vocab_size, emb_dim)

    # One branch per n-gram width; kernel_size must match the branch name.
    self.bigram = tf.keras.layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 2, activation = 'relu')

    self.pool1 = tf.keras.layers.GlobalMaxPool1D()

    self.trigram = tf.keras.layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 3, activation = 'relu')

    self.pool2 = tf.keras.layers.GlobalMaxPool1D()

    self.fourgram = tf.keras.layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 4, activation = 'relu')

    self.pool3 = tf.keras.layers.GlobalMaxPool1D()

    self.dense = tf.keras.layers.Dense(units = FFN_units, activation = 'relu')

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

    if nb_classes == 2:
      # Binary case: one probability, paired with binary cross-entropy.
      self.last_dense = tf.keras.layers.Dense(1, activation = 'sigmoid')

    else:
      self.last_dense = tf.keras.layers.Dense(nb_classes, activation = 'softmax')



  def call(self, inputs, training = None):
    """Run the forward pass.

    Args:
      inputs: batch of token-id sequences accepted by the embedding layer.
      training: forwarded to the dropout layer to select training or
        inference behaviour; None lets Keras decide.

    Returns:
      The output of the final dense layer for each item in the batch.
    """
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool2(x_2)
    x_3 = self.fourgram(x)
    x_3 = self.pool3(x_3)

    merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
    merged = self.dense(merged)
    merged = self.dropout(merged, training = training)
    output = self.last_dense(merged)

    return output






## Config


In [None]:
# Vocabulary size reported by the loaded subword encoder.
VOCAB_SIZE = encoder.vocab_size

# Model hyper-parameters passed to the DCNN constructor.
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = 2 # number of distinct sentiment labels, i.e. len(set(data_labels))

DROPOUT_RATE = 0.2

# Training-loop settings passed to model.fit.
BATCH_SIZE = 32
NB_EPOCHS = 5

## Defining and Compiling

> Indented block



In [None]:
model = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [None]:
# Single compile call; the loss and metric follow the output head
# (binary for two classes, sparse categorical otherwise).
model.compile(
    loss = "binary_crossentropy" if NB_CLASSES == 2 else "sparse_categorical_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy" if NB_CLASSES == 2 else "sparse_categorical_accuracy"],
)

## Creating Checkpoints

In [None]:
# Directory (relative to the working directory) where checkpoints are written.
checkpoint_path = "ckpt/"

# Track the model object so its variables are saved and restored.
ckpt = tf.train.Checkpoint(model = model)

# Keep only the most recent checkpoint on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep = 1 )

# If a checkpoint from an earlier run exists, load it into the model.
# NOTE(review): this cell runs before training, so a stale ckpt/ directory
# changes the starting weights of the next fit - confirm that is intended.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")


## Training

In [None]:
model.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs= NB_EPOCHS, validation_split = 0.1)
ckpt_manager.save()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


'ckpt/ckpt-1'

In [None]:
loss, acc = model.evaluate(X_test, y_test)



In [None]:
acc * 100

65.94659686088562

In [None]:
sentence = "the service of the hotel was worst"


In [None]:
sentence = np.array([sentence])

In [None]:
sentence[0]

'the service of the hotel was worst'

In [None]:
# Apply the same cleaning used for the training tweets before encoding, so the
# text is tokenised the same way the model saw it during training.
sentence = encoder.encode(clean_tweet(sentence[0]))

In [None]:
# Pad the single encoded sentence to MAX_LEN with the same settings used for
# the training data, giving a (1, MAX_LEN) integer array instead of a ragged
# nested list.
padded_sentence = tf.keras.preprocessing.sequence.pad_sequences(
    [sentence], value = 0, padding = 'post', maxlen = MAX_LEN
)
pred = model.predict(padded_sentence)

In [None]:
pred = pred >= 0.5

In [None]:
if pred:
  print("Its a postive sentiment")

else:
  print("oops its a negative sentiment")

oops its a negative sentiment


In [None]:
type(model)


__main__.DCNN

In [None]:
model.save('sentiment')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: sentiment/assets


In [None]:
from google.colab import files

In [None]:
import shutil

# model.save('sentiment') wrote a directory (see the "Assets written to:
# sentiment/assets" log), and files.download serves a single file, so pack the
# directory into a zip archive first and download that.
model_archive = shutil.make_archive('/content/sentiment_model', 'zip',
                                    root_dir = '/content', base_dir = 'sentiment')
files.download(model_archive)

In [None]:
reconstructed_model = tf.keras.models.load_model("/content/sentiment")

In [None]:
! zip -r 'sentiment.zip' '/content/sentiment'

  adding: content/sentiment/ (stored 0%)
  adding: content/sentiment/variables/ (stored 0%)
  adding: content/sentiment/variables/variables.data-00000-of-00001 (deflated 23%)
  adding: content/sentiment/variables/variables.index (deflated 67%)
  adding: content/sentiment/saved_model.pb (deflated 89%)
  adding: content/sentiment/assets/ (stored 0%)


In [None]:
! zip -r 'ckpt.zip' '/content/ckpt'

  adding: content/ckpt/ (stored 0%)
  adding: content/ckpt/ckpt-1.index (deflated 67%)
  adding: content/ckpt/ckpt-1.data-00000-of-00001 (deflated 23%)
  adding: content/ckpt/checkpoint (deflated 40%)
