In [1]:
import os
import glob
import pandas as pd

In [2]:
# Directory containing the exported tweet CSV files.
path = r"C:\Users\russe\Desktop\LDA_Topic_Modeling\data\Datasets\Emotion_Data"
extension = 'csv'

# Make the data directory the working directory, then list the files in it
# whose names end with the chosen extension.
os.chdir(path)
result = glob.glob('*.' + extension)
print(result)

['#angry.csv', '#fear.csv', '#happy.csv', '#love.csv', '#rage.csv', '#sad.csv', '#surprise.csv', 'DNN_Train_Data.csv']


In [3]:
# Read every CSV in the working directory into a dictionary of DataFrames.
# Each key is the file's base name with its first character and its last four
# characters sliced away, e.g. '#angry.csv' is stored under 'angry'.
dfs = {
    os.path.splitext(os.path.basename(f)[1:-4])[0]: pd.read_csv(
        f, names=['id', 'date', 'user', 'text'], low_memory=False
    )
    for f in glob.glob('*.csv')
}

# Emotion label attached to each search query. Both 'rage' and 'angry' map to
# 'anger'; any dataframe whose key is not listed here is left without a label.
label_by_query = {
    'rage': 'anger',
    'angry': 'anger',
    'fear': 'fear',
    'happy': 'joy',
    'love': 'love',
    'sad': 'sad',
    'surprise': 'surprise',
}
for key in dfs:
    if key in label_by_query:
        dfs[key]['label'] = label_by_query[key]

In [4]:
# Compile and Concatenate labelled twitter dataframes
compiled_csvs = pd.concat(dfs, ignore_index=True, axis=0)

# Keep only the two columns used downstream; .copy() makes this an independent
# frame so the column assignment below does not write into a slice.
compiled_csvs = compiled_csvs[['label', 'text']].copy()

# Convert tweet text to str but leave missing values as NaN. A plain
# .apply(str) turns NaN into the literal string 'nan', which a later
# dropna(subset=['text']) can no longer detect.
compiled_csvs['text'] = compiled_csvs['text'].map(str, na_action='ignore')
print(f"Rows of compiled tweets: {len(compiled_csvs)} rows")

Rows of compiled tweets: 4190535 rows


In [5]:
# Remove Null Values and Drop Duplicate tweets
# The frame's shape is printed after each of the two filtering steps.
compiled_csvs = compiled_csvs.dropna(subset=['text'])
print(compiled_csvs.shape)

# For repeated tweet text only the first occurrence is kept, and the index is
# renumbered from 0.
compiled_csvs = compiled_csvs.drop_duplicates(
    subset=['text'],
    keep='first',
    ignore_index=True,
)
print(compiled_csvs.shape)

(4190535, 2)
(176011, 2)


In [6]:
# Destination folder (written with a trailing slash) and file name for the
# compiled, labelled tweet CSV.
save_loc = "C:/Users/russe/Desktop/LDA_Topic_Modeling/data/Datasets/Emotion_Data/Compiled_Tweets/"
filename = "Labelled_Tweets.csv"

In [7]:
# Export compiled data
# Create the destination folder when it is missing so the write does not fail
# with an OSError on a machine where the folder has not been created yet.
os.makedirs(save_loc, exist_ok=True)

# os.path.join adds a separator only when save_loc does not already end with
# one, so the path no longer depends on the trailing slash. The row index is
# not written to the file; the header row is.
compiled_csvs.to_csv(os.path.join(save_loc, filename), index=False, header=True)
compiled_csvs.head()

Unnamed: 0,label,text
0,anger,https://t.co/IZSFiF8Rw9 #angry #Dont Make Me A...
1,anger,„Furiosa“ is finished… #wut #rage #projekt #an...
2,anger,#Freak TFG is #angry #afraid https://t.co/ql68...
3,anger,Gave up by NIN is such a good song to blast wh...
4,anger,@HDMOVIESOURCE @UniversalPics This seems fair ...


In [8]:
# Number of tweets per emotion label (value_counts leaves out rows whose label
# is missing unless dropna=False is passed).
compiled_csvs['label'].value_counts()

sad         46309
love        41890
anger       31336
surprise    27063
fear        18217
joy         11196
Name: label, dtype: int64

In [9]:
# Optional spot check: draw 5 random tweets per label (the column is named 'label').
# compiled_csvs.groupby('label').sample(n=5, random_state=1)

## New Tweet Preprocessing

In [10]:
import re
from time import time
import nltk
from emoji import demojize
# Download the NLTK 'stopwords' corpus that the stopword-removal step reads.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\russe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Time Start
start = time()


# Raw Text Data
texts = compiled_csvs.text

# Lowercase Text Conversion
texts = texts.str.lower()

# Special Character Removal
# 1. Strip URLs and @mentions: any run of non-space characters that starts
#    with "http" or "@".
texts = texts.str.replace(r"(http|@)\S+", "",regex=True)
# 2. Replace emoji with their textual ':name:' codes (emoji.demojize).
texts = texts.apply(demojize)
# 3. Put a space between two adjacent colons so back-to-back emoji codes are
#    split into separate tokens.
texts = texts.str.replace(r"::", ": :",regex=True)
# 4. Normalise the typographic apostrophe to the ASCII apostrophe.
texts = texts.str.replace(r"’", "'",regex=True)
# 5. Replace every character other than a-z, apostrophe, colon and underscore
#    with a space.
texts = texts.str.replace(r"[^a-z\':_]", " ",regex=True)

# Repetition removal
# Any character repeated three or more times in a row is reduced to a single
# occurrence (this also collapses long runs of spaces).
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
texts = texts.str.replace(pattern, r"\1",regex=True)

# Transform short negation form
# "can't"/"cannot" are handled first so that they become "can not"; every
# remaining "n't" suffix is then expanded to " not".
# NOTE(review): the generic rule leaves stems behind, e.g. "won't" becomes
# "wo not" -- confirm this is acceptable for the downstream model.
texts = texts.str.replace(r"(can't|cannot)", 'can not',regex=True)
texts = texts.str.replace(r"n't", ' not',regex=True)

# Stopword Removal
stopwords = nltk.corpus.stopwords.words('english')
## Keep Negation-Relevant wording
stopwords.remove('not')
stopwords.remove('nor')
stopwords.remove('no')
## Apply
# Membership is tested against a set built once: looking a token up in the
# list would scan the whole stopword list for every token of every tweet.
# The `stopwords` list itself is left unchanged.
stopword_set = set(stopwords)
texts = texts.apply(lambda x: ' '.join([word for word in x.split() if word not in stopword_set]))

# Time End
print("Time to clean up: {:.2f} sec".format(time() - start))

Time to clean up: 94.13 sec


In [12]:
# Store the cleaned text in a 'text_cleaned' column next to the raw text, then
# preview the first rows.
compiled_csvs = compiled_csvs.assign(text_cleaned=texts)
compiled_csvs.head()

Unnamed: 0,label,text,text_cleaned
0,anger,https://t.co/IZSFiF8Rw9 #angry #Dont Make Me A...,angry dont make angry wouldnt like im angry via
1,anger,„Furiosa“ is finished… #wut #rage #projekt #an...,furiosa finished wut rage projekt angry angrya...
2,anger,#Freak TFG is #angry #afraid https://t.co/ql68...,freak tfg angry afraid via assolini feeling ft...
3,anger,Gave up by NIN is such a good song to blast wh...,gave nin good song blast got fired got fired s...
4,anger,@HDMOVIESOURCE @UniversalPics This seems fair ...,seems fair respectful would've gone premium pr...
