In [18]:
# Data handling, regex, timing and text utilities
import pandas as pd
import numpy as np
import re
from time import time
import nltk
from emoji import demojize
# Fetch the NLTK stopword corpus used by the cleaning step (reports "already up-to-date" when present)
nltk.download('stopwords')
import pickle

import tensorflow as tf
from tensorflow import keras
# Record the TensorFlow version in the notebook output
print(tf.version.VERSION)

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

2.6.0


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\russe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Only the sentiment label and the tweet text are read from the CSV
columns = ['label', 'tweet']
raw_data = pd.read_csv(
    filepath_or_buffer='data/Datasets/Sentiment140.csv',
    usecols=columns,
)
# Preview the first five rows
raw_data.head(5)

Unnamed: 0,label,tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


### Preprocess the texts:

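- Lowercase text conversion
- Special character removal: strip links and usernames, and convert emojis to text
- Repetition removal (e.g. Helloooooo => Hello)
- Negation expansion (e.g. can't => can not)
- Stopword removal (the negation words "not", "nor" and "no" are kept)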

In [3]:
# Time Start
start = time()

# Raw Text Data
texts = raw_data.tweet

# Lowercase Text Conversion
texts = texts.str.lower()

# Special Character Removal
## Drop links and @usernames
texts = texts.str.replace(r"(http|@)\S+", "", regex=True)
## Replace emojis with their ':name:' text form
texts = texts.apply(demojize)
## Put a space between back-to-back emoji names
texts = texts.str.replace(r"::", ": :", regex=True)
## Normalise the typographic apostrophe to a plain one
texts = texts.str.replace(r"’", "'", regex=True)
## Everything except letters, apostrophes, ':' and '_' becomes a space
texts = texts.str.replace(r"[^a-z\':_]", " ", regex=True)

# Repetition removal: a character repeated 3+ times collapses to a single one
pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
texts = texts.str.replace(pattern, r"\1", regex=True)

# Transform short negation form
texts = texts.str.replace(r"(can't|cannot)", 'can not', regex=True)
texts = texts.str.replace(r"n't", ' not', regex=True)

# Stopword Removal
stopwords = nltk.corpus.stopwords.words('english')
## Keep Negation-Relevant wording
for negation_word in ('not', 'nor', 'no'):
    stopwords.remove(negation_word)
## Membership is tested once per word of every tweet, so look words up in a
## set (constant time) rather than scanning the list each time
stopword_set = set(stopwords)
## Apply
texts = texts.apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopword_set])
)

# Time End
print("Time to clean up: {:.2f} sec".format(time() - start))

# Re-assign cleaned up texts to dataframe column
raw_data.tweet = texts
clean_data = raw_data.copy()

Time to clean up: 430.39 sec


### Tokenize
Transform the text corpus into a vector representation.

- word_count: maximum vocabulary size passed to the tokenizer (`num_words`)

In [4]:
# Vocabulary cap handed to the tokenizer
word_count = 10_000

In [5]:
# Tokenizer limited to `word_count` words, lowercasing its input
tokenizer = Tokenizer(lower=True, num_words=word_count)
# Learn the vocabulary from the cleaned tweets
tokenizer.fit_on_texts(clean_data.tweet)

# Persist the fitted tokenizer to disk
file_location = 'data/Pickles/Sentiment140/tokenizer.pkl'
with open(file_location, 'wb') as file:
    file.write(pickle.dumps(tokenizer))

### Data Train/Validation Split

In [6]:
# Stratified 70/30 split: each label keeps the same share in both partitions.
# One call replaces the per-label loop that grew empty frames with concat,
# which upcast the label column to object dtype.
train, validation = train_test_split(
    clean_data,
    test_size=0.3,
    stratify=clean_data.label,
    random_state=42,  # fixed seed so the split is reproducible across runs
)

### Sentiment Analysis Model Building - GRU: Bidirectional Layering

In [7]:
# Tensorflow-Specific Libraries for Modelling
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [8]:
# Hyper-parameters for the network (adjust when tuning)

# Embedding vocabulary size: the smaller of the fitted vocabulary plus one
# and the tokenizer's word cap
input_dim = min(len(tokenizer.word_index) + 1, tokenizer.num_words)
# Embedding width and fixed (padded) sequence length
embedding_dim, input_length = 200, 100
# Units in each GRU direction
gru_units = 128
# Rates for the GRU `dropout`, the GRU `recurrent_dropout` and the Dropout layer
gru_dropout = recurrent_dropout = dropout = 0.1

In [9]:
# Sequential model assembled from an ordered list of layers
model = Sequential([
    # Embedding Layer: token ids -> dense vectors of size `embedding_dim`
    Embedding(input_dim=input_dim,
              output_dim=embedding_dim,
              input_shape=(input_length,)),
    # Bidirectional Layer: GRU run in both directions, full sequence returned
    Bidirectional(GRU(gru_units,
                      return_sequences=True,
                      dropout=gru_dropout,
                      recurrent_dropout=recurrent_dropout)),
    # Pooling Layer: collapses the sequence axis by taking the maximum
    GlobalMaxPooling1D(),
    # Dense Layer - Relu Activation
    Dense(32, activation='relu'),
    # Dropout Layer
    Dropout(dropout),
    # Dense Layer - Sigmoid Activation (single output for the binary label)
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          253440    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,261,697
Trainable params: 2,261,697
Non-trainable params: 0
______________________________________________

### Prep Input Data for Model Training

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Tokenize the cleaned tweets into integer sequences.
# The raw strings are passed (not pre-split word lists) so the tokenizer
# applies the same filtering and splitting it used in fit_on_texts.
# Pre-split lists skip that step, so tokens still carrying filtered characters
# such as ':' or '_' (the demojized emoji names) would never match the fitted
# vocabulary and would be dropped silently.
list_tokenized_train = tokenizer.texts_to_sequences(train.tweet)
list_tokenized_validation = tokenizer.texts_to_sequences(validation.tweet)

# Pad Sequences to a fixed length of `input_length`
x_train = pad_sequences(list_tokenized_train, maxlen=input_length)
x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)

# Replace labels: map the positive class 4 to 1, then cast explicitly to an
# integer dtype so the targets are numeric even if the label column is object
y_train = train.label.replace(4, 1).astype('int64')
y_validation = validation.label.replace(4, 1).astype('int64')

### Model Training

In [12]:
# Training schedule
epochs, batch_size = 1, 128

# Fit on the training split, with the validation split passed as validation_data
model.fit(x_train, y_train,
          validation_data=(x_validation, y_validation),
          epochs=epochs,
          batch_size=batch_size)



<keras.callbacks.History at 0x28d154c02e0>

In [15]:
# Write the whole model (weights and optimizer settings included) to an .h5 file
model_file = "data/Models/Sentiment_Analysis/gru_model.h5"
model.save(filepath=model_file)

In [19]:
# Load the saved model back from disk, weights and optimizer included
new_model = keras.models.load_model(model_file)

# Print the restored architecture
new_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          2000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100, 256)          253440    
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                8224      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,261,697
Trainable params: 2,261,697
Non-trainable params: 0
______________________________________________