<a href="https://colab.research.google.com/github/Statistically-Inclined/NLP-Tutorial/blob/main/LSTM_RNN_word2vec_GloVe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('twitter_training.csv', header=None)
df_test = pd.read_csv('twitter_validation.csv', header=None)

In [3]:
df = df_train.append(df_test)
df.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
df.drop([0,1], axis=1, inplace=True)
df.rename(columns={2:'label', 3:'text'}, inplace=True)
df.head()

Unnamed: 0,label,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
df.tail()

Unnamed: 0,label,text
995,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,Positive,Today sucked so it’s time to drink wine n play...
998,Positive,Bought a fraction of Microsoft today. Small wins.
999,Neutral,Johnson & Johnson to stop selling talc baby po...


In [6]:
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)
df.tail()

Unnamed: 0,label,text
75677,Irrelevant,⭐️ Toronto is the arts and culture capital of ...
75678,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
75679,Positive,Today sucked so it’s time to drink wine n play...
75680,Positive,Bought a fraction of Microsoft today. Small wins.
75681,Neutral,Johnson & Johnson to stop selling talc baby po...


In [7]:
df.head()

Unnamed: 0,label,text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


### NLP Text Cleaning

In [8]:
!pip install bs4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

string.punctuation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [10]:
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer 

In [11]:
lm = WordNetLemmatizer()

In [12]:
def clean_content(text):

  # change integers to string format
  text = str(text)

  # change the text into lower case.(Note: in case of social media text, it is good to leave them as it is)
  text = text.lower()

  # removing xml tags from tweets
  text = BeautifulSoup(text, 'lxml').get_text()

  # removing URLS 
  text = re.sub('https?://[A-Za-z0-9./]+','',text)

  # removing Numbers 
  text = re.sub('https?://[A-Za-z0-9./]+','',text)

  # removing words with "@"
  text =re.sub(r'@[A-Za-z0-9]+','',text)

  # removing special characters
  text = re.sub(r"\W+|_", ' ', text)

  # tokenization of sentences
  text = word_tokenize(text)

  # lemmatize the text using WordNetn
  words = [lm.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]   

  return " ".join(words)

In [13]:
df['clean_text'] = df['text'].apply(clean_content)

In [14]:
df.head()

Unnamed: 0,label,text,clean_text
0,Positive,im getting on borderlands and i will murder yo...,im getting borderland murder
1,Positive,I am coming to the borders and I will kill you...,coming border kill
2,Positive,im getting on borderlands and i will kill you ...,im getting borderland kill
3,Positive,im coming on borderlands and i will murder you...,im coming borderland murder
4,Positive,im getting on borderlands 2 and i will murder ...,im getting borderland 2 murder


In [15]:
df.tail()

Unnamed: 0,label,text,clean_text
75677,Irrelevant,⭐️ Toronto is the arts and culture capital of ...,toronto art culture capital canada wonder want...
75678,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...,actually good move tot bring viewer one people...
75679,Positive,Today sucked so it’s time to drink wine n play...,today sucked time drink wine n play borderland...
75680,Positive,Bought a fraction of Microsoft today. Small wins.,bought fraction microsoft today small win
75681,Neutral,Johnson & Johnson to stop selling talc baby po...,johnson johnson stop selling talc baby powder ...


### Text Vectorization

In [16]:
from keras import layers
from keras import losses
from keras import preprocessing
from keras import utils
from keras.layers import TextVectorization

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [17]:
max_features = 2000
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(df['clean_text'].values)

In [18]:
tokenizer.num_words

2000

In [19]:
X = tokenizer.texts_to_sequences(df['clean_text'].values)
X[0]

[204, 97, 81, 1459]

In [20]:
df['clean_text'][0]

'im getting borderland murder'

In [21]:
print("100th sentence in text: ", df['clean_text'][120])
print("100th sentence in tokenize: ", X[120])

100th sentence in text:  finally played borderland 3 actual game play great almost gun feel fun never felt like needed grind story hand say much looking forward eventual cutscene skip
100th sentence in tokenize:  [153, 109, 81, 18, 839, 1, 9, 31, 251, 778, 116, 61, 80, 895, 3, 1046, 1319, 216, 491, 64, 48, 201, 370]


In [22]:
X = pad_sequences(X, padding = 'post' ,maxlen=300)
X

array([[ 204,   97,   81, ...,    0,    0,    0],
       [ 297, 1873,  214, ...,    0,    0,    0],
       [ 204,   97,   81, ...,    0,    0,    0],
       ...,
       [  59, 1943,   11, ...,    0,    0,    0],
       [ 357,   44,   59, ...,    0,    0,    0],
       [  16,   16,  108, ...,    0,    0,    0]], dtype=int32)

In [23]:
y = pd.get_dummies(df['label']).values

In [24]:
y

array([[0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0]], dtype=uint8)

In [25]:
print(tokenizer.word_index)



In [26]:
vocab_size = len(tokenizer.word_index) + 1

In [27]:
vocab_size

28393

### Data Splitting

In [28]:
X

array([[ 204,   97,   81, ...,    0,    0,    0],
       [ 297, 1873,  214, ...,    0,    0,    0],
       [ 204,   97,   81, ...,    0,    0,    0],
       ...,
       [  59, 1943,   11, ...,    0,    0,    0],
       [ 357,   44,   59, ...,    0,    0,    0],
       [  16,   16,  108, ...,    0,    0,    0]], dtype=int32)

In [29]:
y

array([[0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0]], dtype=uint8)

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,  random_state=1234, shuffle=True)

In [31]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((60545, 300), (15137, 300), (60545, 4), (15137, 4))

### LSTM RNN Implementation

In [32]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

In [33]:
X.shape

(75682, 300)

In [34]:
X.shape[1]

300

In [35]:
embid_dim = 300
lstm_out = 128

model = Sequential()
model.add(Embedding(max_features, embid_dim, input_length = X.shape[1]))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(4, activation = 'softmax'))

In [36]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 300)          600000    
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 4)                 260       
                                                        

In [37]:
history = model.fit(X_train, y_train, epochs=5, batch_size=128, verbose=True, validation_data = (X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### GloVe Implementation with LSTM RNN

In [38]:
!unzip glove.6B.100d.txt.zip

Archive:  glove.6B.100d.txt.zip
  inflating: glove.6B.100d.txt       


In [59]:
from tqdm import tqdm

embedding_vector = {}
f = open('glove.6B.100d.txt')
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

400000it [00:07, 56949.76it/s]


In [60]:
embedding_matrix = np.zeros((vocab_size, 100))
for word,i in tqdm(tokenizer.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

100%|██████████| 28392/28392 [00:00<00:00, 635901.72it/s]


In [61]:
embedding_matrix.shape

(28393, 100)

In [62]:
embid_dim = 100
lstm_out = 128

model = Sequential()
model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1], weights = [embedding_matrix] , trainable = False))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(4, activation = 'softmax'))

In [63]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 300, 100)          2839300   
                                                                 
 bidirectional_3 (Bidirectio  (None, 256)              234496    
 nal)                                                            
                                                                 
 dense_9 (Dense)             (None, 128)               32896     
                                                                 
 dropout_3 (Dropout)         (None, 128)               0         
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 4)                 260       
                                                      

In [64]:
history = model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=True, validation_data = (X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Word2Vec with LSTM RNN

In [65]:
from tqdm import tqdm

In [66]:
sentences =[]
for t in  tqdm(range(len(df['clean_text']))):
    text = nltk.word_tokenize(df['clean_text'][t])
    sentences.append(text)

100%|██████████| 75682/75682 [00:07<00:00, 9944.40it/s]


In [67]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(sentences, size=300, min_count=2, sg = 0 )

In [68]:
words = list(w2v_model.wv.vocab)
print('Vocabulary size: %d' % len(words))

# save model 
filename = 'embedding_word2vec.txt'
w2v_model.wv.save_word2vec_format(filename, binary=False)

Vocabulary size: 21576


In [69]:
embedding_vector = {}
f = open('./embedding_word2vec.txt')
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

21577it [00:01, 16440.13it/s]


In [70]:
embedding_matrix = np.zeros((vocab_size,300))
for word,i in tqdm(tokenizer.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value  

100%|██████████| 28392/28392 [00:00<00:00, 747883.11it/s]


In [72]:
embid_dim = 300
lstm_out = 128

model = Sequential()
model.add(Embedding(vocab_size, embid_dim, input_length=X.shape[1], weights=[embedding_matrix] , trainable = False))
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(4, activation = 'softmax'))

In [73]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 300, 300)          8517900   
                                                                 
 bidirectional_5 (Bidirectio  (None, 256)              439296    
 nal)                                                            
                                                                 
 dense_15 (Dense)            (None, 128)               32896     
                                                                 
 dropout_5 (Dropout)         (None, 128)               0         
                                                                 
 dense_16 (Dense)            (None, 64)                8256      
                                                                 
 dense_17 (Dense)            (None, 4)                 260       
                                                      

In [74]:
history = model.fit(X_train, y_train, epochs=20, batch_size=64, verbose=True, validation_data = (X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Pretrained Word2Vec Embedding

In [None]:
from gensim.models import KeyedVectors
filename = '../input/nlpword2vecembeddingspretrained/GoogleNews-vectors-negative300.bin'
w2v_pretrained_model = KeyedVectors.load_word2vec_format(filename, binary=True)

In [None]:
embedding_matrix = np.zeros((vocab_size,300))
for word,i in tqdm(tokenizer.word_index.items()):
    try:
        embedding_value = w2v_pretrained_model[word]
        if embedding_value is not None:
            embedding_matrix[i] = embedding_value         
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),300) 

In [None]:
embid_dim = 300
lstm_out = 128


model = Sequential()
model.add(Embedding(vocab_size, 300, input_length =300, weights = [embedding_matrix ] , trainable = False)) # trainable = False
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(4, activation = 'softmax'))

In [None]:
model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train, epochs=5, batch_size=128, verbose=True, validation_data = (X_test, y_test))

In [None]:
embid_dim = 300
lstm_out = 128

model = Sequential()
model.add(Embedding(vocab_size, 300, input_length =300, weights = [embedding_matrix ], trainable = True))  # trainable = True
model.add(Bidirectional(LSTM(lstm_out, dropout=0.2)))
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation = 'relu'))
model.add(Dense(4, activation = 'softmax'))
model.summary()