<a href="https://colab.research.google.com/github/Satwikram/Deep-Learning-Notebooks/blob/master/CNN/CNN%20for%20NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import re
import math
import time
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Display the installed TensorFlow version so the environment this notebook
# ran against is on record.
tf.__version__


'2.2.0'

In [None]:
import tensorflow_datasets as tfds

Loading Data


In [None]:
# Column names for the raw CSV files, which are read without a header row.
cols = [
    "sentiment",
    "id",
    "date",
    "query",
    "user",
    "text",
]

In [None]:
# Load the training tweets. The file has no header row, so column names are
# supplied via `cols`; it is decoded as latin1 with the Python parser engine.
train = pd.read_csv(
    '/content/drive/My Drive/Tweets/training.csv',
    names = cols,
    header = None,
    encoding = 'latin1',
    engine = 'python',
)

In [None]:
# Preview the first three raw training rows.
train.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


In [None]:
# Load the separate test file with the same header-less layout and decoding
# options as the training file.
test = pd.read_csv(
    '/content/drive/My Drive/Tweets/test.csv',
    names = cols,
    header = None,
    encoding = 'latin1',
    engine = 'python',
)

In [None]:
# Preview the first three raw test rows.
test.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."


Data Preprocessing

In [None]:
# Keep only `sentiment` and `text`; the metadata columns are dropped in place.
train.drop(columns = ["id", "user", "query", "date"], inplace = True)

In [None]:
# Confirm that only the sentiment and text columns remain (first five rows).
train.head(5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Same column pruning as for the training frame, applied in place.
test.drop(columns = ["id", "user", "query", "date"], inplace = True)

In [None]:
# Confirm the pruned layout of the test frame (first five rows).
test.head(5)

Unnamed: 0,sentiment,text
0,4,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,Reading my kindle2... Love it... Lee childs i...
2,4,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,@kenburbary You'll love your Kindle2. I've had...
4,4,@mikefish Fair enough. But i have the Kindle2...


In [None]:
# Cleaning the tweets

def clean_tweet(tweet):
    """Normalise a raw tweet into plain, space-separated text.

    Steps, in order:
      1. Strip HTML markup / decode entities via BeautifulSoup (lxml parser).
      2. Remove @mentions.
      3. Remove http/https URLs.
      4. Replace every character that is not an ASCII letter or one of
         ``.``, ``?``, ``!`` with a space.
      5. Collapse runs of spaces into a single space.

    Args:
        tweet: The raw tweet text.

    Returns:
        The cleaned tweet as a string. It may start or end with a single
        space where removed content used to be.
    """
    tweet = BeautifulSoup(tweet, 'lxml').get_text()
    # Substitute a space (not the empty string) so the words on either side
    # of removed content do not get glued together.
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # The character class excludes the space itself, so replacing matches
    # with '' would delete all whitespace and fuse the tweet into one token.
    tweet = re.sub(r"[^A-Za-z.?!]", ' ', tweet)
    # Collapse the space runs produced above down to a single separator.
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
# Run every training tweet through the cleaner, preserving row order.
data_clean = list(map(clean_tweet, train.text))

In [None]:
# Labels as a NumPy array, in the same row order as `data_clean`.
data_labels = train["sentiment"].values

In [None]:
# Remap label 4 to 1 in place; the next cell shows the resulting label set.
is_label_four = data_labels == 4
data_labels[is_label_four] = 1

In [None]:
# Distinct label values remaining after the remap.
{*data_labels}

{0, 1}

In [None]:
# Spot-check the first cleaned tweet.
data_clean[0]

'Awwwthatsabummer.YoushouldagotDavidCarrofThirdDaytodoit.D'

Tokenization

In [None]:
# Newer tensorflow_datasets releases expose the subword encoder under
# `tfds.deprecated.text`; older ones (matching the TF 2.2.0 recorded above)
# only have `tfds.features.text`. Try the new location first and fall back,
# so the cell runs on either.
try:
    _subword_encoder_cls = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
    _subword_encoder_cls = tfds.features.text.SubwordTextEncoder

# Build a subword vocabulary from the cleaned tweets, targeting 2**16 entries.
tokenizer = _subword_encoder_cls.build_from_corpus(
    data_clean,
    target_vocab_size = 2**16,
)

In [None]:
# Encode each cleaned tweet into its list of subword ids.
data_input = list(map(tokenizer.encode, data_clean))

Padding

In [None]:
# Length of the longest encoded tweet; every sequence is padded to this size.
MAX_LEN = max(map(len, data_input))

In [None]:
# Right-pad every encoded tweet with zeros up to MAX_LEN.
data_input = tf.keras.preprocessing.sequence.pad_sequences(
    data_input,
    maxlen = MAX_LEN,
    padding = 'post',
    value = 0,
)

Splitting into Train and Test

In [None]:
# Draw 8000 random row indices from the first 800000 rows, then mirror them
# into the next 800000 rows by adding an offset.
# NOTE(review): this presumes the data holds 1.6M rows with one class in each
# half, so the held-out set is balanced - confirm against the CSV.
# `randint` samples with replacement, so some indices may repeat.
test_indx = np.random.randint(0, 800000, 8000)
# `np.concatenate` is the portable spelling; `np.concat` does not exist in
# NumPy 1.x and raises AttributeError there.
test_indx = np.concatenate((test_indx, test_indx + 800000))

In [None]:
# Pull out the held-out rows first, then remove those same rows from the full
# arrays to form the training split.
test_input, test_labels = data_input[test_indx], data_labels[test_indx]
train_input = np.delete(data_input, test_indx, axis = 0)
train_labels = np.delete(data_labels, test_indx)



Building Model

In [None]:
class DCNN(tf.keras.Model):
  """CNN text classifier built from parallel n-gram convolutions.

  Token ids are embedded, then fed to three parallel 1-D convolutions with
  kernel sizes 2, 3 and 4 (bigram / trigram / fourgram). Each branch is
  globally max-pooled, the three pooled vectors are concatenated, and the
  result goes through a ReLU dense layer, dropout, and an output layer.

  Args:
    vocab_size: Number of distinct token ids accepted by the embedding.
    dropout_rate: Rate of the dropout layer placed after the hidden dense layer.
    training: Unused. Kept only so existing constructor calls stay valid;
      the training flag is supplied to `call` instead.
    nb_classes: Number of target classes. With 2, the output layer is a
      single sigmoid unit; otherwise it is a softmax over `nb_classes`.
    FFN_units: Width of the hidden dense layer.
    nb_filters: Number of filters in each convolution branch.
    emb_dim: Embedding dimension.
    name: Name passed to the Keras model.
  """

  def __init__(self, vocab_size, dropout_rate = 0.1, training = False, nb_classes = 2, 
               FFN_units = 512, nb_filters = 50, emb_dim = 128, name = 'dcnn'):
    
    super(DCNN, self).__init__(name = name)

    self.embedding = layers.Embedding(vocab_size, emb_dim)

    # One convolution branch per n-gram width; the kernel size is what makes
    # each branch look at 2, 3 or 4 consecutive tokens.
    self.bigram = layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 2, activation = 'relu')
    self.pool_1 = layers.GlobalMaxPool1D()

    self.trigram = layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 3, activation = 'relu')
    self.pool_2 = layers.GlobalMaxPool1D()

    self.fourgram = layers.Conv1D(filters = nb_filters, padding = 'valid', kernel_size = 4, activation = 'relu')
    self.pool_3 = layers.GlobalMaxPool1D()

    self.dense = layers.Dense(units = FFN_units, activation = 'relu')
    self.dropout = layers.Dropout(rate = dropout_rate)

    # Binary problems use a single sigmoid unit; multi-class uses softmax.
    if nb_classes == 2:
      self.last_dense = layers.Dense(units = 1, activation = 'sigmoid')
    else:
      self.last_dense = layers.Dense(units = nb_classes, activation = 'softmax')

  def call(self, inputs, training = None):
    """Run the forward pass.

    Args:
      inputs: Batch of token-id sequences fed to the embedding layer.
      training: Forwarded to the dropout layer so that dropout is only
        active while training.

    Returns:
      The output of the final dense layer: one sigmoid value per example
      when the model was built with `nb_classes == 2`, otherwise a softmax
      distribution over the classes.
    """
    x = self.embedding(inputs)

    x_1 = self.pool_1(self.bigram(x))
    x_2 = self.pool_2(self.trigram(x))
    x_3 = self.pool_3(self.fourgram(x))

    merged = tf.concat([x_1, x_2, x_3], axis = -1) # (batch_size, 3 * nb_filters)
    merged = self.dense(merged)
    merged = self.dropout(merged, training = training)
    output = self.last_dense(merged)

    return output


  
  


In [None]:
# --- Data-dependent sizes ---
VOCAB_SIZE = tokenizer.vocab_size
NB_CLASSES = 2  # alternatively: len(set(train_labels))

# --- Architecture ---
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
DROPOUT_RATE = 0.2

# --- Training schedule ---
BATCH_SIZE = 32
NB_EPOCHS = 5

In [None]:
# Instantiate the model from the hyper-parameters defined above.
Dcnn = DCNN(
    vocab_size=VOCAB_SIZE,
    dropout_rate=DROPOUT_RATE,
    nb_classes=NB_CLASSES,
    FFN_units=FFN_UNITS,
    nb_filters=NB_FILTERS,
    emb_dim=EMB_DIM,
)

In [None]:
# Compile with Adam. The loss and the metric depend on whether the task is
# binary (NB_CLASSES == 2) or multi-class.
Dcnn.compile(
    loss="binary_crossentropy" if NB_CLASSES == 2 else "sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"] if NB_CLASSES == 2 else ["sparse_categorical_accuracy"],
)

In [None]:
# NOTE: this entire cell is a bare triple-quoted string, so none of the
# checkpoint code inside it executes; the text is only echoed as cell output.
# As written it would create a checkpoint manager that keeps at most 5
# checkpoints and restore the latest one if any exists.
"""
checkpoint_path = "./drive/My Drive/projects/CNN_for_NLP/ckpt/"

ckpt = tf.train.Checkpoint(Dcnn=Dcnn)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!!")

"""

In [None]:
# NOTE: this cell is also a bare triple-quoted string, so training is
# disabled and nothing below runs.
# NOTE(review): before re-enabling, fix two names. The call passes
# `train_inputs`, but the split cell defines `train_input`. `ckpt_manager`
# is only created inside the disabled checkpoint cell.
"""
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         epochs=NB_EPOCHS)
ckpt_manager.save()

"""