# This notebook contains the pre-processing steps:

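- clean the data (remove hashtag symbols, @-mentions of accounts, URLs and punctuation; remove stop words) and reduce the different inflected forms of a word to a common stem (each word is stemmed) — TODO: the cleaning cell below does not contain a stemming step; confirm where it happens
- tokenize the tweets and pad the resulting sequences to a fixed length
- embed the word tokens with the GloVe vectors
- set up the embedding layers for the LSTM-CNN neural network 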

### Library imports

In [65]:
import numpy as np
import pandas as pd
import pickle as pk
import re, sys, os, csv
from many_stop_words import get_stop_words
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Concatenate, Embedding

## Cleaning the tweet data

In [66]:
# Combined English stop-word list: the many_stop_words entries (about 900)
# followed by the NLTK entries (about 150). Duplicates are kept.
nltk_words = list(stopwords.words('english'))
stop_words = list(get_stop_words('en')) + nltk_words

def words(text):
    """Return every maximal run of the letters a-z in the lower-cased ``text``."""
    lowered = text.lower()
    return re.findall('[a-z]+', lowered)

# Word-frequency table built from the merged word list.
# The file is read inside a context manager so the handle is closed as soon as
# its content has been consumed, instead of lingering until garbage collection.
with open('dataset/wordlists/merged.txt') as wordlist_file:
    dictionary = Counter(words(wordlist_file.read()))
max_word_length = max(map(len, dictionary))  # length of the longest distinct word
total = float(sum(dictionary.values()))      # total number of word occurrences

def clean_tweet(tweet):
    """Lower-case a tweet and strip mentions, URLs and punctuation.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The lower-cased tweet in which every @-mention, every ``scheme://...``
        URL and every character other than a letter, digit, space or tab has
        been replaced by a space, with whitespace runs collapsed to a single
        space and leading/trailing whitespace removed.
    """
    tweet = tweet.lower()
    # Raw string: "\w", "\/" and "\S" are not valid Python string escapes and
    # trigger a warning in a plain literal.
    # "_" is part of the mention pattern so that "@user_name" is removed as a
    # whole instead of leaving "name" behind in the cleaned text.
    stripped = re.sub(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet)
    return ' '.join(stripped.split())

_ENGLISH_STOPWORDS_CACHE = None  # filled on first use by remove_stopwords


def remove_stopwords(word_list, stopword_set=None):
    """Join the lower-cased words of ``word_list`` that are not stop words.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one tweet.
    stopword_set : container of str, optional
        Words to drop. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    str
        The kept words, lower-cased and separated by single spaces.
    """
    global _ENGLISH_STOPWORDS_CACHE
    if stopword_set is None:
        # Load the NLTK list once and keep it as a set: the original re-read
        # the list and scanned it linearly for every single word.
        if _ENGLISH_STOPWORDS_CACHE is None:
            _ENGLISH_STOPWORDS_CACHE = frozenset(stopwords.words("english"))
        stopword_set = _ENGLISH_STOPWORDS_CACHE

    lowered_words = (word.lower() for word in word_list)
    kept = [word for word in lowered_words if word not in stopword_set]
    # lstrip() keeps the result identical to the former string-concatenation
    # version when the first kept token is empty or starts with whitespace.
    return " ".join(kept).lstrip()
    
def change_label(label):
    """Map a sentiment name to its integer class id.

    Returns 0, 1, 2, 3 or 4 for the thirteen known sentiment names and None
    for any other value.
    """
    class_of = {
        # class 0
        "empty": 0, "neutral": 0, "boredom": 0,
        # class 1
        "enthusiasm": 1, "happiness": 1,
        # class 2
        "surprise": 2, "love": 2, "fun": 2, "relief": 2,
        # class 3
        "worry": 3,
        # class 4
        "sadness": 4, "hate": 4, "anger": 4,
    }
    return class_of.get(label)

In [67]:
# Load the raw tweet dataset (comma is read_csv's default separator) and
# preview the sentiment and text of the first record.
data_train = pd.read_csv('dataset/data/text_emotion.csv')
print("Tweet dataset shape:", data_train.shape)
print(data_train.sentiment[0], data_train.content[0], sep=" : ")

Tweet dataset shape: (40000, 4)
empty : @tiffanylue i know  i was listenin to bad habit earlier and i started freakin at his part =[


In [68]:
def rewrite_clean_data(dirty_data, output_path='data.csv'):
    """Clean every tweet of ``dirty_data`` and write ``tweet,label`` rows to a CSV file.

    Parameters
    ----------
    dirty_data : object with ``content`` and ``sentiment`` columns
        Raw tweets and their sentiment names (e.g. the DataFrame read from
        text_emotion.csv).
    output_path : str, optional
        Destination file; defaults to ``data.csv``.

    Rows whose label maps to class 4 are skipped.
    """
    # The with-block guarantees the file is flushed and closed before it is
    # read back; newline='' is what the csv module expects for files it writes.
    with open(output_path, 'w', newline='') as out_file:
        dataWriter = csv.writer(out_file, delimiter=',', lineterminator="\n")
        # zip over the columns instead of positional [i] lookups, so the
        # function does not depend on the frame having a default 0..n-1 index.
        for content, sentiment in zip(dirty_data.content, dirty_data.sentiment):
            label = change_label(sentiment)
            if label == 4:      #removal of anger, hate and sadness as unlikely to be found in marketing e-mails
                continue
            tweet = remove_stopwords(clean_tweet(content).split())
            dataWriter.writerow([tweet, str(label)])
    print("Cleaning process is completed! Clean data stored in " + output_path)

# Regenerate data.csv only when it is missing, so that a fresh
# "Restart & Run All" works without paying the cleaning cost on every run.
# Delete data.csv to force a rebuild.
if not os.path.exists('data.csv'):
    rewrite_clean_data(data_train)

Cleaning process is completed! Clean data stored in data.csv


### Initialisation of parameters

In [69]:
MAX_NB_WORDS = 40000 # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 30 # max length of text (words) including padding
testing_split = 0.3
EMBEDDING_DIM = 200 # embedding dimensions for word vectors (word2vec/GloVe)
# The GloVe file name is derived from EMBEDDING_DIM so that the vector file and
# the embedding dimension cannot drift apart when the constant is changed.
GLOVE_DIR = "dataset/glove/glove.twitter.27B."+str(EMBEDDING_DIM)+"d.txt"
# NOTE(review): the value printed for the sequence length is MAX_SEQUENCE_LENGTH+5,
# not MAX_SEQUENCE_LENGTH itself - confirm which one is intended.
print("Parameters for the embedding and pre-processing:\n",
      MAX_NB_WORDS,MAX_SEQUENCE_LENGTH+5,
      testing_split,EMBEDDING_DIM,"\n",
      GLOVE_DIR)

Parameters for the embedding and pre-processing:
 40000 35 0.3 200 
 dataset/glove/glove.twitter.27B.200d.txt


### Creating the X,Y vectors to be used for the training phase

In [70]:
# Load the cleaned tweets (first column) and their class ids (second column).
print("Reading from the csv file...", end="")
with open('data.csv') as clean_csv:
    rows = list(csv.reader(clean_csv, delimiter=','))
texts = [row[0] for row in rows]
labels = [row[1] for row in rows]
print("Done!")

# Fit the word tokenizer on the cleaned tweets.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

# The fitted tokenizer is pickled because it is needed again during the
# training phase of the neural network.
with open('tokenizer.pkl', 'wb') as pickle_file:
    pk.dump(tokenizer, pickle_file, protocol=pk.HIGHEST_PROTOCOL)
print("Succesfully save the word tokenizer to file: tokenizer.pkl")


Reading from the csv file...Done!
Succesfully save the word tokenizer to file: tokenizer.pkl


In [71]:
# Convert every tweet into its sequence of integer word ids.
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
print('Found {} unique tokens.'.format(len(word_index)))

# Two-step padding:
#   1. bring every sequence to MAX_SEQUENCE_LENGTH-5 ids, padding at the front;
#   2. extend the result to MAX_SEQUENCE_LENGTH ids, padding at the end.
short_length = MAX_SEQUENCE_LENGTH - 5
data_int = pad_sequences(sequences, maxlen=short_length, padding='pre')
data = pad_sequences(data_int, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

Found 27496 unique tokens.


In [72]:
# NOTE: this cell rebinds `labels` and `data`; re-run the CSV-loading and
# padding cells before executing it a second time.
labels = to_categorical(np.asarray(labels)) # convert to one-hot encoding vectors
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


# Shuffling step to make sure data is in a randomized order.
# A dedicated, seeded generator makes the row order - and therefore everything
# computed from it - identical on every run, without touching numpy's global
# random state.
shuffle_rng = np.random.RandomState(42)
indices = shuffle_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

Shape of data tensor: (33402, 30)
Shape of label tensor: (33402, 4)


In [73]:
# Hold out a `testing_split` fraction of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=testing_split,
    random_state=42,
)

# The labels are one-hot rows, so summing over axis 0 gives per-class counts.
print('Number of entries in each category:')
for split_name, split_labels in (("Training", y_train), ("Testing", y_test)):
    print(split_name + ":\n", split_labels.sum(axis=0))

Number of entries in each category:
Training:
 [6733. 4161. 6548. 5939.]
Testing:
 [2911. 1807. 2783. 2520.]
