<a href="https://colab.research.google.com/github/SpectreB/Data_Sci_Tweets_sentiment/blob/main/Tweets_Sentiment_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import shutil
import tensorflow.keras as keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, TimeDistributed, Flatten, Bidirectional
from sklearn.model_selection import train_test_split
from keras.models import Sequential
import matplotlib.pyplot as plt
from keras.layers import BatchNormalization
from keras.regularizers import l2
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('wordnet')
import random
from random_word import RandomWords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load dataset
# NOTE(review): the path points into a Colab Google Drive mount, but no
# drive.mount(...) call is visible in this notebook — confirm the drive is
# mounted before this cell runs.
TWEETS_CSV_PATH = '/content/drive/MyDrive/Tweets.csv'
tweets = pd.read_csv(TWEETS_CSV_PATH)

In [None]:
# Display the frame exactly as it was read from the CSV.
tweets

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


In [None]:
# Handle missing values
# Null count per column before cleaning; displayed as this cell's output.
missing_per_column = tweets.isnull().sum()

# Replace missing text with "" by assigning the result back to the column.
# Calling fillna(..., inplace=True) on the `tweets['text']` selection is a
# chained in-place edit: it warns on pandas 2.x and, with Copy-on-Write,
# no longer updates `tweets` at all (leaving NaN rows in the data).
tweets['text'] = tweets['text'].fillna("")
# Drop rows that have no text (originally missing or empty).
tweets = tweets[tweets['text'] != ""]

missing_per_column

In [None]:
# Drop irrelevant columns and the neutral-sentiment rows
irrelevant_columns = ['textID', 'selected_text']
tweets = tweets.drop(columns=irrelevant_columns)

# Keep only rows whose sentiment label is not "neutral".
is_not_neutral = tweets['sentiment'] != "neutral"
tweets = tweets[is_not_neutral]
tweets

Unnamed: 0,text,sentiment
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
6,2am feedings for the baby are fun when he is a...,positive
...,...,...
27475,enjoy ur night,positive
27476,wish we could come see u on Denver husband l...,negative
27477,I`ve wondered about rake to. The client has ...,negative
27478,Yay good for both of you. Enjoy the break - y...,positive


In [None]:
# Preprocess text data: lower-case, then remove every character that is not
# an ASCII letter, digit or whitespace.
tweets['text'] = tweets['text'].astype(str).str.lower()
# The class must be `A-Z`, not `A-z`: the `A-z` range also covers the
# characters between 'Z' and 'a' ([ \ ] ^ _ `), which let backticks such as
# the one in "couldn`t" survive cleaning. The raw string avoids the invalid
# "\s" escape-sequence warning.
tweets['text'] = tweets['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))
tweets

Unnamed: 0,text,sentiment
1,sooo sad i will miss you here in san diego,negative
2,my boss is bullying me,negative
3,what interview leave me alone,negative
4,sons of why couldn`t they put them on the re...,negative
6,2am feedings for the baby are fun when he is a...,positive
...,...,...
27475,enjoy ur night,positive
27476,wish we could come see u on denver husband l...,negative
27477,i`ve wondered about rake to the client has m...,negative
27478,yay good for both of you enjoy the break you...,positive


In [None]:
# Data augmentation: random word insertion
r = RandomWords()


def get_fixed_random_words_list(n=10):
    """Return a list of `n` words, each obtained from one `r.get_random_word()` call."""
    drawn_words = []
    for _ in range(n):
        drawn_words.append(r.get_random_word())
    return drawn_words

def distribute_fixed_random_words(text, words_list):
    """Insert one randomly chosen entry of `words_list` at a random position in `text`.

    `text` is split with `word_tokenize`, the chosen word is placed among the
    tokens, and the tokens are re-joined with single spaces.
    """
    tokens = word_tokenize(text)
    filler = random.choice(words_list)
    # randint is inclusive on both ends, so position == len(tokens) appends.
    position = random.randint(0, len(tokens))
    return ' '.join(tokens[:position] + [filler] + tokens[position:])

# Seed Python's global RNG so the word picked and the insertion position for
# every tweet are repeatable across Restart & Run All (42 matches the
# random_state used for the train/test splits in this notebook).
# NOTE(review): whether RandomWords itself draws from this global RNG depends
# on the installed random_word version — confirm if the word list must be
# repeatable too.
random.seed(42)

fixed_random_words_list = get_fixed_random_words_list()
tweets['augmented_text'] = tweets['text'].apply(
    lambda x: distribute_fixed_random_words(x, fixed_random_words_list)
)

In [None]:
# Display the frame, now including the `augmented_text` column.
tweets

Unnamed: 0,text,sentiment,augmented_text
1,sooo sad i will miss you here in san diego,negative,sooo sad i branchiform will miss you here in s...
2,my boss is bullying me,negative,my disinterestedness boss is bullying me
3,what interview leave me alone,negative,decimalist what interview leave me alone
4,sons of why couldn`t they put them on the re...,negative,sons of why couldn ` t they put them on the re...
6,2am feedings for the baby are fun when he is a...,positive,2am feedings for the baby are fun scarted when...
...,...,...,...
27475,enjoy ur night,positive,enjoy ur night disinterestedness
27476,wish we could come see u on denver husband l...,negative,wish wreathwork we could come see u on denver ...
27477,i`ve wondered about rake to the client has m...,negative,i ` ve wondered about rake to the client has m...
27478,yay good for both of you enjoy the break you...,positive,yay good plouked for both of you enjoy the bre...


In [None]:
# Stemming text data
stemmer = PorterStemmer()


def _stem_sentence(sentence):
    """Tokenize `sentence`, stem every token with `stemmer`, and re-join with spaces."""
    return ' '.join(stemmer.stem(token) for token in word_tokenize(sentence))


tweets['stemmed_text'] = tweets['augmented_text'].apply(_stem_sentence)

In [None]:
# Display the frame, now including the `stemmed_text` column.
tweets

Unnamed: 0,text,sentiment,augmented_text,stemmed_text
1,sooo sad i will miss you here in san diego,negative,sooo sad i branchiform will miss you here in s...,sooo sad i branchiform will miss you here in s...
2,my boss is bullying me,negative,my disinterestedness boss is bullying me,my disinterested boss is bulli me
3,what interview leave me alone,negative,decimalist what interview leave me alone,decimalist what interview leav me alon
4,sons of why couldn`t they put them on the re...,negative,sons of why couldn ` t they put them on the re...,son of whi couldn ` t they put them on the rel...
6,2am feedings for the baby are fun when he is a...,positive,2am feedings for the baby are fun scarted when...,2am feed for the babi are fun scart when he is...
...,...,...,...,...
27475,enjoy ur night,positive,enjoy ur night disinterestedness,enjoy ur night disinterested
27476,wish we could come see u on denver husband l...,negative,wish wreathwork we could come see u on denver ...,wish wreathwork we could come see u on denver ...
27477,i`ve wondered about rake to the client has m...,negative,i ` ve wondered about rake to the client has m...,i ` ve wonder about rake to the client ha made...
27478,yay good for both of you enjoy the break you...,positive,yay good plouked for both of you enjoy the bre...,yay good plouk for both of you enjoy the break...


In [None]:
# Tokenization and padding for LSTM
max_features = 5000
stemmed_corpus = tweets['stemmed_text'].values

# Fit the tokenizer on the stemmed text, limited to `max_features` words,
# with 'OOV' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=max_features, oov_token='OOV')
tokenizer.fit_on_texts(stemmed_corpus)

# Integer-encode every tweet, then pad at the end of each sequence.
token_sequences = tokenizer.texts_to_sequences(stemmed_corpus)
features = pad_sequences(token_sequences, padding='post')

# One indicator column per sentiment value.
sentiment_dummies = pd.get_dummies(tweets['sentiment'])
labels = sentiment_dummies.values

In [None]:
# Train-test split
# Hold out 10% of the samples; random_state pins the shuffle.
lstm_split = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = lstm_split

In [None]:
# LSTM Model: 4 layers
embedding_dimension = 128
dropout_value = 0.4
regularizer = 0.001

model_lstm = Sequential()
# mask_zero=True: `features` is post-padded, so without a mask the LSTM's
# final state (the only thing the classifier sees) is computed after stepping
# through the trailing pad tokens. Masking the pad index 0 makes the LSTM
# skip those steps.
model_lstm.add(
    Embedding(
        max_features,
        embedding_dimension,
        input_length=features.shape[1],
        mask_zero=True,
    )
)
model_lstm.add(SpatialDropout1D(dropout_value))
model_lstm.add(
    LSTM(
        embedding_dimension,
        kernel_regularizer=l2(regularizer),
        dropout=dropout_value,
        recurrent_dropout=dropout_value,
    )
)
# Two softmax outputs, matching the two one-hot label columns.
model_lstm.add(Dense(2, activation='softmax'))
# One-hot labels with a softmax head call for categorical cross-entropy;
# 'binary_crossentropy' treats the two outputs as independent sigmoid targets.
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Model summary and training
model_lstm.summary()

batch_size = 256
# Train for 5 epochs; the held-out split is passed as validation data.
model_lstm_history = model_lstm.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=batch_size,
    verbose=1,
    shuffle=True,
    validation_data=(X_test, Y_test),
)

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 34, 128)           640000    
                                                                 
 spatial_dropout1d_11 (Spat  (None, 34, 128)           0         
 ialDropout1D)                                                   
                                                                 
 lstm_11 (LSTM)              (None, 128)               131584    
                                                                 
 dense_16 (Dense)            (None, 2)                 258       
                                                                 
Total params: 771842 (2.94 MB)
Trainable params: 771842 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Tokenization and padding for Bidirectional LSTM
# Vocabulary size: number of entries in the first tokenizer's word index, plus one.
vocab_size = len(tokenizer.word_index) + 1
stemmed_corpus_bi = tweets['stemmed_text'].values

tokenizer_bi = Tokenizer(num_words=vocab_size, oov_token='OOV')
tokenizer_bi.fit_on_texts(stemmed_corpus_bi)

# Integer-encode every tweet, then pad at the end of each sequence.
token_sequences_bi = tokenizer_bi.texts_to_sequences(stemmed_corpus_bi)
features_bi = pad_sequences(token_sequences_bi, padding='post')

In [None]:
# Train-test split for Bidirectional LSTM
# Same 10% hold-out and random_state as the LSTM split; this rebinds
# X_train / X_test / Y_train / Y_test to the BiLSTM features.
bilstm_split = train_test_split(
    features_bi,
    labels,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, Y_train, Y_test = bilstm_split

In [None]:
# Bidirectional LSTM Model: 6 layers
BiLSTM = Sequential()
# The input length must come from `features_bi` (what this model is trained
# on), not from `features`, which belongs to the plain LSTM pipeline.
BiLSTM.add(Embedding(vocab_size, embedding_dimension, input_length=features_bi.shape[1]))
BiLSTM.add(
    Bidirectional(
        LSTM(
            embedding_dimension,
            return_sequences=True,  # keep per-timestep outputs for the TimeDistributed layer
            dropout=dropout_value,
            recurrent_dropout=dropout_value,
            kernel_regularizer=l2(regularizer),
        ),
        merge_mode='concat',
    )
)
BiLSTM.add(TimeDistributed(Dense(embedding_dimension, activation='relu')))
BiLSTM.add(SpatialDropout1D(dropout_value))
# Collapse the (timesteps, units) output into one vector per sample.
BiLSTM.add(Flatten())
BiLSTM.add(Dense(2, activation='softmax'))
# One-hot labels with a softmax head call for categorical cross-entropy.
BiLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Model summary and training for Bidirectional LSTM
BiLSTM.summary()

# NOTE(review): the history variable is spelled `BiLTSM_history` (T and S
# swapped); the name is kept as-is because other cells may refer to it.
BiLTSM_history = BiLSTM.fit(
    X_train,
    Y_train,
    epochs=5,
    batch_size=batch_size,
    verbose=1,
    validation_data=(X_test, Y_test),
)

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 34, 128)           2148864   
                                                                 
 bidirectional_5 (Bidirecti  (None, 34, 256)           263168    
 onal)                                                           
                                                                 
 time_distributed_5 (TimeDi  (None, 34, 128)           32896     
 stributed)                                                      
                                                                 
 spatial_dropout1d_12 (Spat  (None, 34, 128)           0         
 ialDropout1D)                                                   
                                                                 
 flatten_5 (Flatten)         (None, 4352)              0         
                                                     