In [1]:
# OS Navigation libraries
import os
import glob

# Computing Libraries
import numpy as np
import pandas as pd
from time import time

In [2]:
path = r"C:\Users\russe\Desktop\LDA_Topic_Modeling\data\Datasets\Emotion_Data"
extension = 'csv'
os.chdir(path)
result = glob.glob(f'*.{extension}')
print(f"Current files in Path Folder: {result}")

Current files in Path Folder: ['#angry.csv', '#fear.csv', '#happy.csv', '#joy.csv', '#love.csv', '#rage.csv', '#sad.csv', '#surprise.csv', 'DNN_Train_Data.csv']


In [3]:
# Load All Dataframes and store into variables in a dictionary
dfs = {os.path.splitext(os.path.basename(f)[1:-4])[0]: pd.read_csv(f,names=['id', 'date', 'user', 'text'],low_memory=False) for f in glob.glob('*.csv')}

# Assign Labels to twitter dataframes based on search queries
for key in dfs.keys():   
    if key == 'rage':
        dfs[key]['label'] = 'anger'
    
    if key == 'angry':
        dfs[key]['label'] = 'anger'
        
    elif key == 'fear' :
        dfs[key]['label'] = 'fear'
        
    elif key == 'happy' :
        dfs[key]['label'] = 'joy'
        
    elif key == 'love' :
        dfs[key]['label'] = 'love'
        
    elif key == 'sad' :
        dfs[key]['label'] = 'sadness'
        
    elif key == 'surprise' :
        dfs[key]['label'] = 'surprise'
    else:
        continue

In [4]:
# Compile and Concatenate labelled twitter dataframes
compiled_csvs = pd.concat(dfs,ignore_index=True,axis=0)
compiled_csvs = compiled_csvs[['label','text']]
compiled_csvs['text'] = compiled_csvs['text'].apply(str)
print(f"Rows of compiled tweets: {len(compiled_csvs)} rows")

Rows of compiled tweets: 4558141 rows


In [5]:
# Remove Null Values and Drop Duplicate tweets
compiled_csvs = compiled_csvs.dropna(subset=['text'])
print(f"Rows before duplicate removal: {compiled_csvs.shape[0]}")
compiled_csvs.drop_duplicates(subset=['text'], keep='first',inplace=True,ignore_index=True)
print(f"Rows after duplicate removal: {compiled_csvs.shape[0]}")

Rows before duplicate removal: 4558141
Rows after duplicate removal: 191709


In [6]:
save_loc = "C:/Users/russe/Desktop/LDA_Topic_Modeling/data/Datasets/Emotion_Data/Compiled_Tweets/"
filename = "Labelled_Tweets.csv"

# Export compiled data
compiled_csvs.to_csv(save_loc+filename,index=None,header=True)
compiled_csvs.head()

Unnamed: 0,label,text
0,anger,https://t.co/IZSFiF8Rw9 #angry #Dont Make Me A...
1,anger,„Furiosa“ is finished… #wut #rage #projekt #an...
2,anger,#Freak TFG is #angry #afraid https://t.co/ql68...
3,anger,Gave up by NIN is such a good song to blast wh...
4,anger,@HDMOVIESOURCE @UniversalPics This seems fair ...


In [7]:
compiled_csvs['label'].value_counts()

sadness     46303
love        41803
anger       31335
fear        30585
surprise    27059
joy         12949
Name: label, dtype: int64

## Set Up Text Preprocessing

In [8]:
# NLP Text Processing Libraries
import re
import nltk
from emoji import demojize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\russe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def preprocess(texts, quiet=False):
    start = time()
    
    # Lowercasing
    texts = texts.str.lower()

    # Remove special chars
    texts = texts.str.replace(r"(http|@)\S+", "",regex=True)
    texts = texts.apply(demojize)
    texts = texts.str.replace(r"::", ": :",regex=True)
    texts = texts.str.replace(r"’", "'",regex=True)
    texts = texts.str.replace(r"[^a-z\':_]", " ",regex=True)

    # Remove repetitions
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = texts.str.replace(pattern, r"\1",regex=True)

    # Transform short negation form
    texts = texts.str.replace(r"(can't|cannot)", 'can not',regex=True)
    texts = texts.str.replace(r"n't", ' not',regex=True)

    # Remove stop words
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.remove('not')
    stopwords.remove('nor')
    stopwords.remove('no')
    texts = texts.apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopwords]))

    if not quiet:
        print("Time to clean up: {:.2f} sec".format(time() - start))

    return texts

## Run Preprocessing

In [10]:
compiled_csvs['text_cleaned'] = preprocess(texts=compiled_csvs.text,quiet=False)
compiled_csvs.head()

Time to clean up: 160.50 sec


Unnamed: 0,label,text,text_cleaned
0,anger,https://t.co/IZSFiF8Rw9 #angry #Dont Make Me A...,angry dont make angry wouldnt like im angry via
1,anger,„Furiosa“ is finished… #wut #rage #projekt #an...,furiosa finished wut rage projekt angry angrya...
2,anger,#Freak TFG is #angry #afraid https://t.co/ql68...,freak tfg angry afraid via assolini feeling ft...
3,anger,Gave up by NIN is such a good song to blast wh...,gave nin good song blast got fired got fired s...
4,anger,@HDMOVIESOURCE @UniversalPics This seems fair ...,seems fair respectful would've gone premium pr...


## Validate compiled tweets with Sentiment Model Ranging

In [11]:
# NLP Libraries
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nlp

# Tensorflow Libraries
import tensorflow as tf
print(f'Tensorflow Version: {tf.version.VERSION}')
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Dropout
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Bidirectional
from keras.callbacks import EarlyStopping
from keras.layers import GlobalAvgPool1D

# Auxilliary Libraries
import pickle
import random

Tensorflow Version: 2.6.0


In [12]:
# Load trained Sentiment Analysis Model with weights and Optimizers
model_file = "C:/Users/russe/Desktop/LDA_Topic_Modeling/data/Models/Sentiment_Analysis/gru_model.h5"
sentiment_model = tf.keras.models.load_model(model_file)

# Load trained Sentiment Model Tokenizer
tokenizer_path = "C:/Users/russe/Desktop/LDA_Topic_Modeling/data/Pickles/Sentiment140/tokenizer.pkl"
with open(tokenizer_path,'rb') as file:
    tokenizer = pickle.load(file)

# Show the model architecture
sentiment_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          253440    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,261,697
Trainable params: 2,261,697
Non-trainable params: 0
______________________________________________

In [13]:
def get_score_range(mean):
    if mean < 0.5:
        return (float("{0:.4f}".format(0.0)), float("{0:.4f}".format(mean)))
    return (float("{0:.4f}".format(mean)), float("{0:.4f}".format(1.0)))

In [14]:
# Process Raw text data & run sentiment analysis scoring
cleaned_texts = compiled_csvs.text_cleaned
predict_sequences = [text.split() for text in cleaned_texts]
list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
x_predict = pad_sequences(list_tokenized_predict, maxlen=100)

# Assign model predictions to each row of data
result = sentiment_model.predict(x_predict)
compiled_csvs['Sentiment Score'] = result

In [15]:
# Group by emotion label & store in dictionary format
label_group = compiled_csvs.groupby('label')
label_group = dict(list(label_group))

# assign emotion ranges with dictionary
data_list = []
emotion_ranges = {}

for key,val in label_group.items():
    #calculate mean, std & sentiment value range for each label cluster
    mean = label_group[key]['Sentiment Score'].mean()
    std = label_group[key]['Sentiment Score'].std()
    low, high = get_score_range(mean)
    # save ranges for each emotion
    emotion_ranges[key] =  [low,high]
    print(f"{key.capitalize()}: Score Range: {low} - {high}")
    # Keep only rows with correct label within range
    validated_data = label_group[key][np.all([(label_group[key]['Sentiment Score'] >= low), (label_group[key]['Sentiment Score'] <= high)], axis=0)]
    data_list.append(validated_data)

# Combine all validated, label-grouped data
validated_data = pd.concat(data_list,ignore_index=True)
print("")
print(f"Number of validated rows of data: {validated_data.shape[0]}")

Anger: Score Range: 0.0 - 0.4441
Fear: Score Range: 0.0 - 0.4861
Joy: Score Range: 0.9027 - 1.0
Love: Score Range: 0.8499 - 1.0
Sadness: Score Range: 0.0 - 0.4655
Surprise: Score Range: 0.5855 - 1.0

Number of validated rows of data: 105872


In [16]:
validated_data

Unnamed: 0,label,text,text_cleaned,Sentiment Score
0,anger,https://t.co/IZSFiF8Rw9 #angry #Dont Make Me A...,angry dont make angry wouldnt like im angry via,0.135011
1,anger,Gave up by NIN is such a good song to blast wh...,gave nin good song blast got fired got fired s...,0.056593
2,anger,@HDMOVIESOURCE @UniversalPics This seems fair ...,seems fair respectful would've gone premium pr...,0.193763
3,anger,Pearls of spirituality for #anger .USE S.O.S\n...,pearls spirituality anger use whenever feel an...,0.333703
4,anger,"FOR THE LOVE OF EVERYTHING LEARN TO REPLY, NOT...",love everything learn reply not reply respondi...,0.180410
...,...,...,...,...
105867,surprise,@nftmomentum @SolatoPotato New NFT coming soon...,new nft coming soon :exploding_head: humanoids...,0.861667
105868,surprise,@Vesperskill @SnoopDogg @CozomoMedici 🤯 congra...,:exploding_head: congrats,0.977566
105869,surprise,@Fcxky_Wowly New NFT coming soon! 🤯 Humanoids ...,new nft coming soon :exploding_head: humanoids...,0.861667
105870,surprise,@mikiowatanabe6 crazy w these ones!! 🤯,crazy w ones :exploding_head:,0.775524


In [17]:
validated_data['label'].value_counts()

love        28864
sadness     22495
anger       15862
fear        14980
surprise    14048
joy          9623
Name: label, dtype: int64

In [18]:
validated_data.shape

(105872, 4)

In [19]:
save_loc = "C:/Users/russe/Desktop/LDA_Topic_Modeling/data/Datasets/Emotion_Data/Compiled_Tweets/"
filename = "validated_data.csv"
validated_data.to_csv(save_loc+filename, header=True, index=False)

## Check currently Validated Data being run on Task Scheduler

In [20]:
read_data = pd.read_csv(save_loc+filename)
read_data['label'].value_counts()

love        28864
sadness     22495
anger       15862
fear        14980
surprise    14048
joy          9623
Name: label, dtype: int64