**Importing the required modules**

In [None]:

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GlobalMaxPooling1D
from tensorflow.keras.models import Model
from google.colab import drive
import torch
from torchvision import datasets, models, transforms

# Mount Google Drive at /content/gdrive so files stored there can be read by path
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Load the sentiment dataset CSV from the mounted Google Drive into a DataFrame
df = pd.read_csv('/content/gdrive/My Drive/sentimentDataset.csv')

In [None]:
# Remove HTML line-break tags from the reviews

# Matches <br>, <br/> and <br /> with any internal spacing and any letter case.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
_BREAK_TAG_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)


def remove_breaktags(text):
  """Replace every HTML line-break tag in ``text`` with a single space.

  Args:
    text: the review text (str).

  Returns:
    The text with each ``<br>`` / ``<br/>`` / ``<br />`` tag replaced by " ".
  """
  return _BREAK_TAG_RE.sub(" ", text)

# Strip the break tags from every review (overwrites the 'review' column),
# then preview the first 10 rows
df['review'] = df['review'].apply(remove_breaktags)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming t...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
# Convert the sentiment column from words to integers:
# "positive" -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: without it, a second run
# would compare the already-converted integers with "positive" and silently
# reset every label to 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
  df['sentiment'] = (df['sentiment'] == "positive").astype(int)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. The filming t...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


**Splitting the data into training and test data**



In [None]:
# train_test_split is already imported in the first cell of the notebook.

# Fixed seed so the split (and every metric computed from it) is reproducible.
SPLIT_SEED = 42

# Hold out 20% of the reviews for testing. Stratifying on the label keeps the
# positive/negative ratio the same in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    df['review'].values,
    df['sentiment'].values,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['sentiment'].values,
)

# Show one example from each split as a sanity check
print("Train")
print("Review: ", x_train[0])
print('sentiment: ', y_train[0])
print("Test")
print("Review: ", x_test[0])
print('sentiment: ', y_test[0])

Train
Review:  After reading a biography on the last Russian Tzar (Nicholas II), and his failure to secure the army's support, I decided to give this film a try.  I watched it with a completely open mind, not knowing anything about it (except its reputation).  These are the things that impressed me the most.  1) The shots of battleships, and the soldiers used as extras. More than once I stopped to think "if this was done in this time and place, 80% of this would have been computer-generated".  2) The Realism in it. From the maggot-infested meat to the shot of the sailor with his candle and the legend "Killed for a bowl of soup", this movie makes no concessions to the PC cause (which, thankfully, hadn't been invented yet).  3) The slow descent into madness of the Odessa Steps sequence. From the first shot, when the limbless man appears, you get the idea something might be wrong; since the overall shots are composed, though, you end up feeling comfortable in your surroundings. Then an am

In [None]:
# Convert the review strings to integer sequences with a Keras Tokenizer.
# maximum_vocab is the num_words cap passed to the Tokenizer; it is far larger
# than the vocabulary size printed below, so in practice no word is dropped.
maximum_vocab = 5000000
tokenizer = Tokenizer(num_words=maximum_vocab)
# Fit on the training texts only, so the test set does not influence the vocabulary
tokenizer.fit_on_texts(x_train)

# Inspect the word index and the vocabulary size of the training data
wordIndex = tokenizer.word_index
vocab_size = len(wordIndex)
print("Size of dataset vocabulary: ", vocab_size)

# Convert the training and test texts into integer sequences
train_sequence = tokenizer.texts_to_sequences(x_train)
test_sequence = tokenizer.texts_to_sequences(x_test)
print("Training sequence: ", train_sequence[0])
print("Testing sequence: ", test_sequence[0])

# Pad the sequences so they all have the same length.
# Padding the training sequences: no maxlen is given, and T records the
# resulting width of the padded training matrix.
pad_train = pad_sequences(train_sequence)
T = pad_train.shape[1]
print('The length of training sequence is: ', T)

# Padding the test sequences to the same width T as the training data
pad_test = pad_sequences(test_sequence, maxlen=T)
print('The length of testing sequence is: ', pad_test.shape[1])

Size of dataset vocabulary:  112343
Training sequence:  [98, 880, 3, 4784, 19, 1, 234, 1489, 50953, 4839, 1517, 2, 23, 2097, 5, 8073, 1, 29647, 1445, 9, 879, 5, 196, 10, 18, 3, 354, 9, 284, 8, 14, 3, 337, 834, 324, 20, 1353, 230, 41, 8, 555, 91, 2613, 131, 22, 1, 176, 11, 1490, 67, 1, 87, 303, 1, 638, 4, 50954, 2, 1, 1265, 333, 13, 2302, 50, 71, 281, 9, 2286, 5, 100, 42, 10, 12, 223, 7, 10, 54, 2, 273, 3169, 4, 10, 57, 24, 75, 1281, 4522, 231, 1, 1820, 7, 8, 35, 1, 19390, 11751, 3417, 5, 1, 318, 4, 1, 6023, 14, 23, 7751, 2, 1, 1877, 525, 15, 3, 6884, 4, 6254, 10, 16, 163, 53, 21782, 5, 1, 5905, 1129, 59, 2501, 1844, 75, 5301, 243, 331, 1, 548, 5198, 82, 2901, 4, 1, 13144, 2954, 709, 35, 1, 84, 318, 49, 1, 38495, 127, 743, 21, 73, 1, 321, 136, 235, 25, 358, 233, 1, 441, 638, 22, 4120, 151, 21, 126, 52, 558, 3883, 7, 125, 5742, 92, 31, 27815, 743, 2, 81, 372, 1515, 7, 941, 6614, 129, 1, 638, 22, 4120, 370, 1, 38496, 951, 82, 128, 2, 1, 1066, 318, 27816, 951, 10, 128, 6, 722, 278, 1, 1857

**Defining and building the model**

In [None]:

def get_max_length():
    """Return the mean length of the training reviews, rounded up to an int.

    ``len`` is applied to each raw entry of ``x_train``; those entries are the
    review strings, so the value is a mean number of characters per review
    (not a maximum, and not a token count).
    """
    lengths = [len(review) for review in x_train]
    mean_length = np.mean(lengths)
    return int(np.ceil(mean_length))


max_length = get_max_length()

def create_model(input_length=None):
  """Build and compile the LSTM sentiment classifier.

  Args:
    input_length: number of time steps per input sequence. Defaults to ``T``,
      the width of the padded matrices (``pad_train`` / ``pad_test``), so the
      declared input shape matches the data the model is trained, evaluated
      and queried with.

  Returns:
    A compiled ``Sequential`` model: Embedding -> LSTM -> Dense(1, sigmoid),
    using the adam optimizer, binary cross-entropy loss and accuracy metric.
  """
  # ARCHITECTURE
  EMBED_DIM = 32
  LSTM_OUT = 64

  if input_length is None:
    # The layer used to be given `max_length`, which is the mean number of
    # characters per review and does not match the padded sequence width T.
    input_length = T

  model = Sequential()
  # vocab_size + 1 rows: the tokenizer's word indices start at 1, and index 0
  # is the padding value.
  model.add(Embedding(vocab_size+1, EMBED_DIM, input_length = input_length))
  model.add(LSTM(LSTM_OUT))
  model.add(Dense(1, activation='sigmoid'))
  model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

  return model

  # Create a basic model instance
# Build a fresh, untrained model with the architecture defined in create_model
model = create_model()

# Display the model's architecture (layers, output shapes, parameter counts)
model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1291, 32)          3595008   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 3,619,905
Trainable params: 3,619,905
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Saving of checkpoints
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights (weights only, not the
# full model) to checkpoint_path during training
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)

# Train the model. The History object returned by fit is kept in `r` so the
# per-epoch loss/accuracy can be plotted afterwards; previously it was
# discarded and `r` was never defined.
r = model.fit(pad_train, y_train, batch_size = 128, epochs = 2, callbacks=[cp_callback])

Epoch 1/2









Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/2

Epoch 00002: saving model to training_1/cp.ckpt


<keras.callbacks.History at 0x7f283684d950>

In [None]:
# List the files present in the checkpoint directory
os.listdir(checkpoint_dir)

['cp.ckpt.data-00000-of-00001', 'cp.ckpt.index', 'checkpoint']

**Only run this cell if loading a previously saved model with a checkpoint**

---



In [None]:

# Create a basic model instance (same architecture, freshly initialised weights)
model = create_model()

# Load the weights saved at checkpoint_path into the new instance
model.load_weights(checkpoint_path)

# Re-evaluate the restored model on the padded test set
loss, acc = model.evaluate(pad_test, y_test, verbose=1)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))









Restored model, accuracy: 89.28%


In [None]:
# Save the current weights manually (weights only) under ./checkpoints
model.save_weights('./checkpoints/my_checkpoint')

In [None]:
# Save the entire model as a SavedModel.
# os.makedirs(..., exist_ok=True) is the portable, pure-Python equivalent of
# the shell command `mkdir -p`.
os.makedirs('saved_model', exist_ok=True)
model.save('saved_model/my_model')



INFO:tensorflow:Assets written to: saved_model/my_model/assets


INFO:tensorflow:Assets written to: saved_model/my_model/assets


**Run this cell if a saved model exists; otherwise skip to the next one**

In [None]:
# Load the full model previously written to saved_model/my_model
new_model = tf.keras.models.load_model('saved_model/my_model')

# Check its architecture
new_model.summary()

In [None]:
# Re-evaluate the restored model.
# The previous code evaluated `model` even though the message reports the
# restored model. Use `new_model` when the loading cell was run; fall back to
# the in-memory `model` when that cell was skipped.
restored_model = globals().get('new_model', model)
loss, acc = restored_model.evaluate(pad_test, y_test, verbose=1)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))





Restored model, accuracy: 86.44%


In [None]:
# Evaluating the model
# Plot the training loss per epoch.
# model.history is the History object attached by model.fit; the metric values
# live in its `.history` dict, so the object itself cannot be subscripted
# (that was the TypeError). It is only available after model.fit has run in
# this session.
plt.plot(model.history.history['loss'], label='loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend();

TypeError: ignored

In [None]:
# Plot the training accuracy per epoch.
# Reads the History object attached to the model by model.fit instead of the
# variable `r`, which was never assigned. Only available after model.fit has
# run in this session.
plt.plot(model.history.history['accuracy'], label= 'accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend();

**Install the demoji package, used to convert emojis into their text descriptions**

In [None]:
!pip install demoji


Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[?25l[K     |███████▋                        | 10 kB 20.6 MB/s eta 0:00:01[K     |███████████████▎                | 20 kB 24.7 MB/s eta 0:00:01[K     |███████████████████████         | 30 kB 19.7 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 40 kB 16.7 MB/s eta 0:00:01[K     |████████████████████████████████| 42 kB 1.1 MB/s 
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [None]:
import demoji

# demoji >= 1.0 ships its emoji codes inside the package, so the former
# demoji.download_codes() step is no longer needed; on these versions the call
# is deprecated and only emits a warning.

  This is separate from the ipykernel package so we can avoid doing imports until


**Main program**

In [None]:
import re 
import tweepy 
from tweepy import OAuthHandler 

class TwitterClient(object):
	'''
	Generic Twitter Class for sentiment analysis.

	Fetches tweets through tweepy and labels each one 'positive' or 'negative'
	using the notebook-level objects `tokenizer`, `pad_sequences`, `T` and
	`model`, which must already be defined when predictions are requested.
	'''

	# Environment variables the credentials are read from when they are not
	# passed to the constructor (same order as the constructor arguments).
	_CREDENTIAL_ENV_VARS = (
		'TWITTER_CONSUMER_KEY',
		'TWITTER_CONSUMER_SECRET',
		'TWITTER_ACCESS_TOKEN',
		'TWITTER_ACCESS_TOKEN_SECRET',
	)

	# Matches, in order: @mentions (handles may contain underscores), any
	# character that is not an ASCII letter/digit/space/tab, and URLs.
	_NOISE_RE = re.compile(r"(@\w+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

	def __init__(self, consumer_key=None, consumer_secret=None,
			access_token=None, access_token_secret=None):
		'''
		Class constructor or initialization method.

		Every credential that is not passed explicitly is read from the
		matching environment variable listed in _CREDENTIAL_ENV_VARS.
		Credentials must never be hardcoded in the notebook.
		NOTE(review): the keys that were previously embedded here have been
		exposed and should be revoked and regenerated.

		On failure an error is printed and self.api stays None; the fetch
		methods then return an empty list.
		'''
		self.auth = None
		self.api = None

		supplied = (consumer_key, consumer_secret, access_token, access_token_secret)
		credentials = [
			value if value is not None else os.environ.get(env_name)
			for value, env_name in zip(supplied, self._CREDENTIAL_ENV_VARS)
		]
		missing = [
			env_name
			for env_name, value in zip(self._CREDENTIAL_ENV_VARS, credentials)
			if not value
		]
		if missing:
			print("Error: Authentication Failed (missing credentials: " + ", ".join(missing) + ")")
			return

		consumer_key, consumer_secret, access_token, access_token_secret = credentials

		# attempt authentication
		try:
			# create OAuthHandler object
			self.auth = OAuthHandler(consumer_key, consumer_secret)
			# set access token and secret
			self.auth.set_access_token(access_token, access_token_secret)
			# create tweepy API object to fetch tweets
			self.api = tweepy.API(self.auth)
		except Exception as e:
			# report the cause instead of silently swallowing every error
			self.api = None
			print("Error: Authentication Failed (" + str(e) + ")")

	def clean_tweet(self, tweet):
		'''
		Utility function to clean tweet text by removing mentions, links and
		special characters using simple regex statements.

		Accepts either the tweet text itself or a one-element list/tuple
		holding it, and returns the cleaned text with whitespace collapsed.
		'''
		# Older callers wrap the text in a list while others pass a plain
		# string; blindly indexing [0] on a string kept only its first character.
		if isinstance(tweet, (list, tuple)):
			tweet = tweet[0] if tweet else ''
		return ' '.join(self._NOISE_RE.sub(" ", tweet).split())

	def predict_sentiment(self, text):
		'''
		Classify a tweet with the trained model.

		Returns 'positive' when the predicted value is above 0.5, otherwise
		'negative' (the same decision as rounding the prediction to 0 or 1).
		'''
		text = self.clean_tweet(text)

		# preprocessing the given text exactly like the training data
		text_seq = tokenizer.texts_to_sequences([text])
		text_pad = pad_sequences(text_seq, maxlen=T)

		# predicting the class; a single input yields a single value, which is
		# extracted as a plain float instead of comparing a whole array
		probability = float(np.ravel(model.predict(text_pad))[0])

		if probability > 0.5:
			return 'positive'
		else:
			return 'negative'

	def _fetch(self, query, count):
		'''
		Run the Twitter search and return the matching statuses as a list.
		Returns an empty list (after printing the error) when the client is
		not authenticated or the API call fails.
		'''
		if self.api is None:
			print("Error : Twitter client is not authenticated")
			return []

		# tweepy 4.0 renamed API.search to API.search_tweets and TweepError to
		# TweepyException; support both generations of the library.
		search = getattr(self.api, 'search_tweets', None) or self.api.search
		tweepy_error = getattr(tweepy, 'TweepyException', None) or tweepy.TweepError

		try:
			# call twitter api to fetch tweets
			return list(search(q = query, count = count))
		except tweepy_error as e:
			# print error (if any)
			print("Error : " + str(e))
			return []

	def get_tweets(self, query, count = 10):
		'''
		Main function to fetch tweets and parse them.

		The values returned by demoji.findall for the tweet (presumably the
		emoji descriptions - confirm against the demoji docs) are appended to
		the tweet text before classification.

		Returns a list of dicts with the keys 'text' (str) and 'sentiment'
		('positive' or 'negative'); the list is empty when nothing could be
		fetched.
		'''
		# list of parsed tweets, each a dict with the tweet text and its sentiment
		tweets = []

		for status in self._fetch(query, count):
			# join with spaces so the appended words stay separate tokens
			# instead of being fused to each other and to the last word
			emojis = ' '.join(demoji.findall(status.text).values())
			text = status.text + (' ' + emojis if emojis else '')

			tweets.append({
				'text': text,
				'sentiment': self.predict_sentiment(text),
			})

		# return parsed tweets
		return tweets

	def oldget_tweets(self, query, count = 10):
		'''
		Older variant of get_tweets: classifies the raw tweet text (nothing
		from demoji is appended) and adds a tweet that has retweets only once.

		Returns a list of dicts with the keys 'text' and 'sentiment'; the list
		is empty when nothing could be fetched.
		'''
		# list of parsed tweets
		tweets = []

		for status in self._fetch(query, count):
			parsed_tweet = {
				'text': status.text,
				'sentiment': self.predict_sentiment(status.text),
			}
			# if tweet has retweets, ensure that it is appended only once
			if status.retweet_count > 0 and parsed_tweet in tweets:
				continue
			tweets.append(parsed_tweet)

		# return parsed tweets
		return tweets
	 
#@title Input movie title to perform sentiment analysis on
def main(sample_count = 0): 
	'''
	Fetch tweets about the configured title and print the share of positive
	and negative predictions.

	sample_count: when greater than 0, also print the text of up to that many
	positive and negative tweets. Defaults to 0 (percentages only).
	'''
	# creating object of TwitterClient Class 
	api = TwitterClient() 
	# calling function to get tweets 
	Title = 'the umbrella academy' #@param {type: "string"}
	tweets = api.get_tweets(query = Title, count = 50) 

	# Nothing fetched (failed request or no matching tweets): stop here instead
	# of failing on len(None) or dividing by zero below.
	if not tweets:
		print("No tweets were fetched for: " + Title)
		return

	# picking positive tweets from tweets 
	ptweets = [tweet for tweet in tweets if tweet['sentiment'] == 'positive'] 
	# percentage of positive tweets 
	print("Positive tweets percentage: {} %".format(100*len(ptweets)/len(tweets))) 
	# picking negative tweets from tweets 
	ntweets = [tweet for tweet in tweets if tweet['sentiment'] == 'negative'] 
	# percentage of negative tweets 
	print("Negative tweets percentage: {} %".format(100*len(ntweets)/len(tweets))) 

	if sample_count > 0:
		# printing the first positive tweets 
		print("\n\nPositive tweets:") 
		for tweet in ptweets[:sample_count]: 
			print(tweet['text']) 

		# printing the first negative tweets 
		print("\n\nNegative tweets:") 
		for tweet in ntweets[:sample_count]: 
			print(tweet['text']) 

# Run the analysis when this code is executed as the top-level module
if __name__ == "__main__": 
	# calling main function 
	main() 





Positive tweets percentage: 62.0 %
Negative tweets percentage: 38.0 %
