In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Save Tokenizer

In [None]:
# Read only columns 0 and 5 of the CSV and name them "label" and "tweet".
# Passing `names` without `header` makes pandas treat the first line as data.
tweet_dataset = pd.read_csv(
    '/content/drive/dataset.csv',
    names=["label", "tweet"],
    usecols=[0, 5],
    encoding="ISO-8859-1",
)

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print(tweet_dataset.shape) 

(1600000, 2)


In [None]:
# Recode the label column: 4 becomes 1, 0 stays 0.
# The result is assigned back to the frame rather than using `inplace=True`
# on `tweet_dataset['label']`: that form is chained assignment, which pandas
# warns about and which does not update the parent frame under copy-on-write.
tweet_dataset['label'] = tweet_dataset['label'].replace([4, 0], [1, 0])

In [None]:
# Preview the first rows of the frame (head() defaults to five rows).
tweet_dataset.head()

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Count how many rows carry each label value.
tweet_dataset['label'].value_counts()

1    800000
0    800000
Name: label, dtype: int64

In [None]:
import random

# Fixed seed so the same rows are shown on every run.
random.seed(41)

# Show 10 distinct rows picked at random. The positions are drawn from the
# frame's actual length so that row 0 is eligible and the cell keeps working
# if the dataset size changes.
sampled_positions = random.sample(range(len(tweet_dataset)), 10)
tweet_dataset.iloc[sampled_positions, :]

Unnamed: 0,label,tweet
799059,0,awww RIP Farrah Fawcett bummer @takianballard...
696726,0,Morning Tweet Peeps &amp; FB Folks. Had plans ...
483854,0,back still aching . hmmm.. it's been a while ...
348205,0,@vrikis I just noticed that you have in- follo...
808725,1,wow just got how it was great a bunch of new c...
1212026,1,loves his PraiseTEAM family I will never forg...
1447314,1,Sprinkers need some work #fb
594454,0,@garpods22 I know! I can't believe we haven't ...
1159850,1,@mosessaur U sure know that !!!
580463,0,wishes that it would just rain already. My ki...


In [None]:
import re
import nltk
from nltk.corpus import stopwords
# Download the NLTK resources used in this cell: WordNet first, then stop words.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

def cleaning_URLs(data):
    """Replace every URL in ``data`` with a single space.

    A URL is any run of non-whitespace characters that starts with ``www.``,
    ``http://`` or ``https://``.

    Args:
        data: The text to scan.

    Returns:
        A copy of ``data`` in which each URL has been replaced by ``' '``.
    """
    # `\s` (whitespace) is required here: a character class of `[^s]` would
    # mean "anything but the letter s" and swallow the words after the URL.
    # The dot after `www` is escaped so that words such as "awww" followed by
    # a space are not mistaken for a URL.
    # NOTE(review): text cleaned with a looser pattern will differ from text
    # cleaned with this one; refit any tokenizer built on the old output.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', data)

# Stop-word set and lemmatizer shared by every clean_train_data call.
# Filled on first use so they are not rebuilt for each tweet.
_CLEANING_CACHE = {}


def _cleaning_resources():
  """Return the (stop-word set, lemmatizer) pair, building it on first use."""
  if not _CLEANING_CACHE:
    _CLEANING_CACHE['stop_words'] = set(stopwords.words('english'))
    _CLEANING_CACHE['lemmatizer'] = nltk.wordnet.WordNetLemmatizer()
  return _CLEANING_CACHE['stop_words'], _CLEANING_CACHE['lemmatizer']


def clean_train_data(x):
  """Normalise one piece of text for tokenisation.

  Steps, in order: lower-case; drop ``[...]`` spans; drop every character that
  is neither a word character nor whitespace; drop runs of word characters
  that contain a digit; delete newlines; apply ``cleaning_URLs``; split on
  whitespace; remove English stop words; lemmatize each remaining word.

  Args:
    x: The text to clean (a ``str``).

  Returns:
    The cleaned words joined by single spaces.
  """
  text = x.lower()
  # Raw strings keep the regex escapes (\[, \w, \d) out of Python's own
  # string-escape processing.
  text = re.sub(r'\[.*?\]', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'\w*\d\w*', '', text)
  text = re.sub('\n', '', text)
  # NOTE(review): punctuation has already been stripped at this point, so
  # "://" and "." can no longer appear in the text handed to cleaning_URLs.
  # The order is kept as-is so the cleaned output stays the same; confirm
  # against the pipeline that trained the model before moving this earlier.
  text = cleaning_URLs(text)
  stop_words, lemma = _cleaning_resources()
  kept_words = [w for w in text.split() if w not in stop_words]
  return ' '.join(lemma.lemmatize(word) for word in kept_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Replace each tweet with its cleaned form, then preview the first rows.
tweet_dataset['tweet'] = tweet_dataset['tweet'].apply(clean_train_data)
tweet_dataset.head()

Unnamed: 0,label,tweet
0,0,switchfoot bummer shoulda got david carr third...
1,0,upset cant update facebook texting might cry r...
2,0,kenichan dived many time ball managed save res...
3,0,whole body feel itchy like fire
4,0,nationwideclass behaving im mad cant see


In [None]:
# Value passed to the tokenizer as `num_words`.
max_features = 6000

# Build a space-splitting tokenizer and fit it on the whole tweet column.
tweet_texts = tweet_dataset['tweet'].values
token = Tokenizer(split=' ', num_words=max_features)
token.fit_on_texts(tweet_texts)


##### At this stage, the Tokenizer has been fitted on the cleaned tweets. Let's load the saved model to check that the Tokenizer works as expected

In [None]:
import keras

# Load the saved model stored under the LSTM_Model path.
lstm_model_path = "/content/drive/LSTM_Model"
reconstructed_model = keras.models.load_model(lstm_model_path)

In [None]:
# Hand-written sentences used to spot-check the model's predictions.
sentences = [
    'Have a bad day',
    'Just feel like killing myself',
    'Nobody cares',
    'I hate #covid19',
    'Bad day',
    'Love programming',
    'Hooray! I finally make it~~~~',
    'Why does everyone hate me?',
]

In [None]:
# Turn each sentence into a list of word indices, then pad the lists into a
# single rectangular array.
# NOTE(review): the sentences are not passed through clean_train_data here,
# unlike the tweets the tokenizer was fitted on — confirm this is intended.
pred_sentences = pad_sequences(token.texts_to_sequences(sentences))

In [None]:
# Class names by index; not read by this cell.
labels = ['Negative','Positive']

# One forward pass scores every sentence, so there is no need to call the
# model again for each sentence.
tf_outputs = reconstructed_model(tf.convert_to_tensor(pred_sentences))

# Softmax over the last axis, then take the index of the largest value in
# each row: one predicted class index per sentence.
tf_predictions = tf.nn.softmax(tf_outputs, axis=-1)
predicted_indices = tf.argmax(tf_predictions, axis=-1).numpy()

# Stored as 0-d NumPy arrays so the list prints as [array(0), array(1), ...]
# and each entry can be compared directly with 0 or 1.
true_label = [np.array(index) for index in predicted_indices]

In [None]:
# Show the predicted class index collected for each sentence.
print(true_label)

[array(0), array(0), array(0), array(0), array(0), array(1), array(1), array(0)]


In [None]:
# Print each sentence followed by the name of its predicted class.
label_names = {0: "negative", 1: "positive"}

for sentence, predicted in zip(sentences, true_label):
  # int() unwraps the 0-d array. A class index outside {0, 1} raises KeyError
  # instead of silently printing the previous sentence's result.
  result = label_names[int(predicted)]
  print(sentence, ": \n", result)

Have a bad day : 
 negative
Just feel like killing myself : 
 negative
Nobody cares : 
 negative
I hate #covid19 : 
 negative
Bad day : 
 negative
Love programming : 
 positive
Hooray! I finally make it~~~~ : 
 positive
Why does everyone hate me? : 
 negative


##### Perfect. Let's pickle the Tokenizer as tokenizer.pkl

In [None]:
import pickle

# Serialise the fitted tokenizer to Drive using the newest pickle protocol.
tokenizer_path = '/content/drive/tokenizer.pkl'
with open(tokenizer_path, mode='wb') as tokenizer_file:
  pickle.dump(token, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Load the saved model without tokenizer.pkl

In [None]:
import keras

# Load the saved model stored under the LSTM_Model path.
lstm_model_path = "/content/drive/LSTM_Model"
reconstructed_model = keras.models.load_model(lstm_model_path)

In [None]:
# Hand-written sentences used to spot-check the model's predictions.
sentences = [
    'Have a bad day',
    'Just feel like killing myself',
    'Nobody cares',
    'I hate #covid19',
    'Bad day',
    'Love programming',
    'Hooray! I finally make it~~~~',
    'Why does everyone hate me?',
]

In [None]:
# Value passed to the tokenizer as `num_words`.
max_features = 6000

# A brand-new tokenizer fitted only on the sample sentences, so its word
# indices come from those sentences alone and not from the tweet dataset.
token = Tokenizer(split=' ', num_words=max_features)
token.fit_on_texts(sentences)

# Encode the same sentences with that tokenizer and pad them into one array.
pred_sentences = pad_sequences(token.texts_to_sequences(sentences))

In [None]:
# Class names by index; not read by this cell.
labels = ['Negative','Positive']

# One forward pass scores every sentence, so there is no need to call the
# model again for each sentence.
tf_outputs = reconstructed_model(tf.convert_to_tensor(pred_sentences))

# Softmax over the last axis, then take the index of the largest value in
# each row: one predicted class index per sentence.
tf_predictions = tf.nn.softmax(tf_outputs, axis=-1)
predicted_indices = tf.argmax(tf_predictions, axis=-1).numpy()

# Stored as 0-d NumPy arrays so the list prints as [array(0), array(1), ...]
# and each entry can be compared directly with 0 or 1.
true_label = [np.array(index) for index in predicted_indices]

In [None]:
# Show the predicted class index collected for each sentence.
print(true_label)

[array(1), array(0), array(1), array(0), array(1), array(1), array(1), array(1)]


In [None]:
# Print each sentence followed by the name of its predicted class.
label_names = {0: "negative", 1: "positive"}

for sentence, predicted in zip(sentences, true_label):
  # int() unwraps the 0-d array. A class index outside {0, 1} raises KeyError
  # instead of silently printing the previous sentence's result.
  result = label_names[int(predicted)]
  print(sentence, ": \n", result)

Have a bad day : 
 positive
Just feel like killing myself : 
 negative
Nobody cares : 
 positive
I hate #covid19 : 
 negative
Bad day : 
 positive
Love programming : 
 positive
Hooray! I finally make it~~~~ : 
 positive
Why does everyone hate me? : 
 positive


# Load the saved model with tokenizer.pkl

In [None]:
import keras

# Load the saved model stored under the LSTM_Model path.
lstm_model_path = "/content/drive/LSTM_Model"
reconstructed_model = keras.models.load_model(lstm_model_path)

In [None]:
# Restore the tokenizer from the pickle file written earlier in this notebook.
# NOTE: pickle.load can run arbitrary code, so only load files you trust.
import pickle

tokenizer_path = '/content/drive/tokenizer.pkl'
with open(tokenizer_path, mode='rb') as tokenizer_file:
  token = pickle.load(tokenizer_file)

In [None]:
# Hand-written sentences used to spot-check the model's predictions.
sentences = [
    'Have a bad day',
    'Just feel like killing myself',
    'Nobody cares',
    'I hate #covid19',
    'Bad day',
    'Love programming',
    'Hooray! I finally make it~~~~',
    'Why does everyone hate me?',
]

In [None]:
# Turn each sentence into a list of word indices with the restored tokenizer,
# then pad the lists into a single rectangular array.
# NOTE(review): the sentences are not passed through clean_train_data here,
# unlike the tweets the tokenizer was fitted on — confirm this is intended.
pred_sentences = pad_sequences(token.texts_to_sequences(sentences))

In [None]:
# Class names by index; not read by this cell.
labels = ['Negative','Positive']

# One forward pass scores every sentence, so there is no need to call the
# model again for each sentence.
tf_outputs = reconstructed_model(tf.convert_to_tensor(pred_sentences))

# Softmax over the last axis, then take the index of the largest value in
# each row: one predicted class index per sentence.
tf_predictions = tf.nn.softmax(tf_outputs, axis=-1)
predicted_indices = tf.argmax(tf_predictions, axis=-1).numpy()

# Stored as 0-d NumPy arrays so the list prints as [array(0), array(1), ...]
# and each entry can be compared directly with 0 or 1.
true_label = [np.array(index) for index in predicted_indices]

In [None]:
# Show the predicted class index collected for each sentence.
print(true_label)

[array(0), array(0), array(0), array(0), array(0), array(1), array(1), array(0)]


In [None]:
# Print each sentence followed by the name of its predicted class.
label_names = {0: "negative", 1: "positive"}

for sentence, predicted in zip(sentences, true_label):
  # int() unwraps the 0-d array. A class index outside {0, 1} raises KeyError
  # instead of silently printing the previous sentence's result.
  result = label_names[int(predicted)]
  print(sentence, ": \n", result)

Have a bad day : 
 negative
Just feel like killing myself : 
 negative
Nobody cares : 
 negative
I hate #covid19 : 
 negative
Bad day : 
 negative
Love programming : 
 positive
Hooray! I finally make it~~~~ : 
 positive
Why does everyone hate me? : 
 negative
