# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import re

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
from transformers import BertTokenizer

from bs4 import BeautifulSoup

import random

## 0.1 Loading Data

In [2]:
# The CSV has no header row, so it is read with header=None and the six
# column names are assigned explicitly afterwards.
df = pd.read_csv(
    'D:\\My Drive\\Pessoal\\Projetos\\sentiment_analysis\\training.1600000.processed.noemoticon.csv',
    encoding='latin1',
    header=None,
)

df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'text']

# Preview the first rows.
df.head()

Unnamed: 0,sentiment,id,date,query,user,text
0,0,2057279074,Sat Jun 06 12:42:20 PDT 2009,NO_QUERY,PolloLoco66,Ukraine 2 Croatia 2 after 68 minutes.
1,0,2204150887,Wed Jun 17 01:27:26 PDT 2009,NO_QUERY,kaylaSTACK,"@courtney_xxx out of shower , picked up phone,..."
2,4,2047790284,Fri Jun 05 14:11:06 PDT 2009,NO_QUERY,smacula,@S4BI i am a tweeting god... hahahah
3,4,1972065032,Sat May 30 08:29:19 PDT 2009,NO_QUERY,Lorena182,@charmingsharo 17 again ..... again ;)
4,0,1823113262,Sat May 16 21:08:49 PDT 2009,NO_QUERY,leleana,"@sarahactually OMG, your cats name is Pippin? ..."


## 0.2 Helper Functions

In [3]:
# data clean
def clean_tweet(tweet):
    """Normalize one raw tweet into plain text ready for tokenization.

    Steps, in order: strip HTML markup via BeautifulSoup, drop @mentions,
    drop http(s) URLs, replace every character other than ASCII letters and
    ``. ! ? '`` with a space, collapse runs of spaces, and trim both ends.

    Args:
        tweet: Raw tweet text (str).

    Returns:
        The cleaned tweet as a str (may be empty).
    """
    tweet = BeautifulSoup(tweet, 'lxml').get_text()
    # Handles may contain underscores (e.g. "@courtney_xxx"); without "_" in
    # the class the tail of the handle would survive as a stray word.
    tweet = re.sub(r'@[A-Za-z0-9_]+', ' ', tweet)
    # A URL runs up to the next whitespace; a narrow character class would
    # leave fragments of paths and query strings ("-", "_", "?", "=") behind.
    tweet = re.sub(r'https?://\S+', ' ', tweet)
    # Keep letters and basic sentence punctuation only.
    tweet = re.sub(r"[^A-Za-z.!?']", ' ', tweet)
    tweet = re.sub(r' +', ' ', tweet)
    # Removed mentions/URLs at either end leave a dangling space.
    return tweet.strip()

# encoding 
def encode_sentence(sent):
    """Turn a sentence into a list of vocabulary ids using the global ``token_model``.

    The sentence is split into tokens with ``token_model.tokenize`` and the
    tokens are then mapped with ``token_model.convert_tokens_to_ids``.
    """
    tokens = token_model.tokenize(sent)
    token_ids = token_model.convert_tokens_to_ids(tokens)
    return token_ids

# 1.0 Data Description

## 1.1 Data Dimensions

In [4]:
# Report the number of rows and columns, one message per line.
print(
    f'Há no total: {df.shape[0]} linhas',
    f'Há no total: {df.shape[1]} columnas',
    sep='\n',
)

Há no total: 400000 linhas
Há no total: 6 columnas


## 1.2 Data Types

In [5]:
# Show the dtype pandas inferred for each column (no dtypes were passed to read_csv).
df.dtypes

sentiment     int64
id            int64
date         object
query        object
user         object
text         object
dtype: object

## 1.3 Check NA

In [6]:
# Count missing values per column.
df.isna().sum()

sentiment    0
id           0
date         0
query        0
user         0
text         0
dtype: int64

# 2.0 Data Cleaning

###### 2.1.1 Changing positive sentiment values from 4 to 1

In [7]:
# Recode the positive label 4 as 1; every other label value is left as is.
df['sentiment'] = df['sentiment'].replace(4, 1)

###### 2.1.2 Cleaning tweets with regular expressions

In [8]:
# Run every tweet through clean_tweet and overwrite the text column with the result.
df['text'] = df['text'].map(clean_tweet)

# 3.0 Data Preparation

## 3.1 Tokenization

In [9]:
# Pretrained uncased English BERT (L-12, H-768, A-12) loaded from TF Hub;
# trainable=False keeps its weights frozen.
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4', trainable=False)

In [10]:
# Path of the vocabulary asset shipped with the hub module
# (.numpy() extracts the value from the tensor).
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()

In [11]:
# Tokenizer built from the hub module's vocabulary file; do_lower_case=True
# matches the "uncased" BERT variant loaded above.
token_model = BertTokenizer(vocab_file, do_lower_case=True)

In [12]:
# Replace each cleaned tweet with the list of token ids returned by encode_sentence.
df['text'] = df['text'].map(encode_sentence)

## 3.3 Data Filtering

###### 5.1 Columns

In [13]:
# Keep only the label and the encoded text. Rebinding df (instead of
# inplace=True) together with errors='ignore' lets this cell be re-run
# without raising KeyError once the columns are already gone.
df = df.drop(columns=['id', 'date', 'query', 'user'], errors='ignore')

## 3.4 Preprocessing

In [14]:
# Pair each encoded tweet with its label.
sentence = df['text'].to_list()
sentiment = df['sentiment'].to_list()

# Fixed seed so the train/test split is the same on every run; training is
# resumed from a checkpoint, so a changing split would let the restored
# model see examples that now sit in the test set.
SPLIT_SEED = 42

dataset = list(zip(sentence, sentiment))
# A local RNG is used so the global `random` state is not touched.
random.Random(SPLIT_SEED).shuffle(dataset)
# Keep only tweets with more than 7 tokens.
dataset = [(tokens, label) for tokens, label in dataset if len(tokens) > 7]

# parameters
BATCH_SIZE = 32
# Count batches from the *filtered* list (rounded up for the last partial
# batch); using len(df) over-counts and inflates the test share.
NB_BATCHES = (len(dataset) + BATCH_SIZE - 1) // BATCH_SIZE
NB_BATCHES_TEST = NB_BATCHES // 10

# Wrap the Python list as a tf.data pipeline of (token_ids, label) pairs.
tf_dataset = tf.data.Dataset.from_generator(lambda: dataset, output_types=(tf.int32, tf.int32))

# Pad every sequence to the longest one in its batch; labels are scalars.
all_batched = tf_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None,), ()))

# Dataset.shuffle returns a NEW dataset, so the result must be kept.
# reshuffle_each_iteration=False freezes the batch order; otherwise take()
# and skip() below would select different batches on every epoch and test
# batches would leak into training.
all_batched = all_batched.shuffle(NB_BATCHES,
                                  seed=SPLIT_SEED,
                                  reshuffle_each_iteration=False)

# First 10% of the batches for testing, the remainder for training.
test = all_batched.take(NB_BATCHES_TEST)
train = all_batched.skip(NB_BATCHES_TEST)

# 4 Embedding

# 6.0 Building Model

## 6.1 Model Definition

In [15]:
class DCNN(tf.keras.Model):
    """Convolutional text classifier over token ids.

    Pipeline: embedding -> three parallel Conv1D branches (kernel sizes 2, 3
    and 4) -> global average pooling of each branch -> concatenation ->
    dense layer -> dropout -> output layer.

    The output layer is a single sigmoid unit when ``nb_classes == 2`` and a
    softmax over ``nb_classes`` units otherwise.

    Args:
        vocab_size: Number of rows of the embedding table.
        emb_dim: Size of each embedding vector.
        nb_filters: Number of filters in each convolution branch.
        FFN_units: Units of the hidden dense layer.
        nb_classes: Number of target classes.
        dropout_rate: Dropout rate applied after the hidden dense layer.
        training: Unused; kept so existing callers that pass it still work.
        name: Keras model name.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim = 128,
                 nb_filters = 50,
                 FFN_units = 512,
                 nb_classes = 2,
                 dropout_rate = 0.1,
                 training = False,
                 name = 'dcnn'
                ):
        super(DCNN, self).__init__(name=name)

        # Every layer is built from the constructor arguments, not from
        # module-level constants, so the class works with any configuration
        # and does not depend on notebook globals being defined.
        # Attribute names are kept unchanged so existing checkpoints of this
        # model still match.
        self.embedding = layers.Embedding(vocab_size, emb_dim)

        self.bigram = layers.Conv1D(filters = nb_filters,
                                    kernel_size = 2,
                                    padding = 'valid',
                                    activation = 'relu')

        self.trigram = layers.Conv1D(filters = nb_filters,
                                     kernel_size = 3,
                                     padding = 'valid',
                                     activation = 'relu')

        self.fourgram = layers.Conv1D(filters = nb_filters,
                                      kernel_size = 4,
                                      padding = 'valid',
                                      activation = 'relu')

        # One pooling layer is shared by the three branches.
        self.pool = layers.GlobalAvgPool1D()

        self.dense_1 = layers.Dense(units = FFN_units, activation = 'relu')

        self.dropout = layers.Dropout(rate = dropout_rate)

        if nb_classes == 2:
            self.last_dense = layers.Dense(units = 1, activation = 'sigmoid')
        else:
            self.last_dense = layers.Dense(units = nb_classes, activation = 'softmax')

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token ids (assumed shape (batch, seq_len) -- TODO
                confirm; seq_len must be at least 4 for the 'valid' 4-gram
                convolution).
            training: Forwarded to the dropout layer. Defaults to None so the
                model can be called without it.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)

        # Each branch: n-gram convolution followed by global average pooling.
        x_1 = self.pool(self.bigram(x))
        x_2 = self.pool(self.trigram(x))
        x_3 = self.pool(self.fourgram(x))

        merged = tf.concat([x_1, x_2, x_3], axis=-1)
        merged = self.dense_1(merged)
        # Pass `training` by keyword so it cannot be mistaken for another
        # positional argument of the layer.
        merged = self.dropout(merged, training=training)
        output = self.last_dense(merged)

        return output

# hyperparameters
VOCAB_SIZE = len(token_model)  # len() of the tokenizer
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
DROPOUT_RATE = 0.2
NB_CLASSES = 2
NB_EPOCHS = 5

# build the model from the hyperparameters above
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    emb_dim=EMB_DIM,
    nb_filters=NB_FILTERS,
    FFN_units=FFN_UNITS,
    nb_classes=NB_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

## 6.2 Compiling the Model and Creating Checkpoints

In [18]:
# Two classes -> binary cross-entropy; otherwise sparse categorical
# cross-entropy. Optimizer and metric are the same in both cases.
Dcnn.compile(
    loss='binary_crossentropy' if NB_CLASSES == 2 else 'sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Directory where checkpoints are written; only the most recent one is kept.
checkpoint_patch = 'D:\\My Drive\\Pessoal\\Projetos\\sentiment_analysis\\exports'
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_patch, max_to_keep=1)

# Resume from the latest checkpoint when one exists in the directory.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!')


class MyCustomCallBack(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint through ``ckpt_manager`` after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint and print the checkpoint directory."""
        ckpt_manager.save()
        print(f'Checkpoint saved at: {checkpoint_patch}')

Latest checkpoint restored!


## 6.3 Fitting

In [None]:
# Train on the training batches; the callback saves a checkpoint after each epoch.
history = Dcnn.fit(
    x=train,
    callbacks=[MyCustomCallBack()],
    epochs=NB_EPOCHS,
)

Epoch 1/5
   5368/Unknown - 458s 85ms/step - loss: 0.3483 - accuracy: 0.8481