In [138]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 
import preprocessor as p

In [2]:
# Get and randomize data with pandas and numpy
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
stop_words = stopwords.words('english')
sentiment_tweet_dataframe = pd.read_csv('sentiment-tweet-data.csv', encoding=DATASET_ENCODING, names=DATASET_COLUMNS)
sentiment_tweet_dataframe = sentiment_tweet_dataframe.reindex(np.random.permutation(sentiment_tweet_dataframe.index))

In [55]:
def preprocess_text(text):
    ''' Preprocesses text by removing special characters, removing urls, lowercasing text, 
        removing stop words, and stemming the rest
            Args:
                text: string of the a tweets text
            Returns:
                A string of the text with the special characters and urls removed, loswercased text, 
                stopwords removed, and stemming of words
    '''
    text = p.clean(text)
    text = re.sub(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+', ' ', text.lower()).strip()
    new_text = [token for token in word_tokenize(text) if token not in stop_words and len(token) > 1]
    return ' '.join(new_text)

In [4]:
def preprocess_value(value):
    ''' Preprocesses values by making sure they are either 0 or 1 
            Args:
                value: integer that is either 0 or 4
            Returns:
                The given value or 1 if the value is 4
    '''
    if value == 4: return 1
    else: return value

In [5]:
# Uses the above methods to process the data
def preprocess_data(sentiment_tweet_dataframe):
    ''' Prepares features from sentiment_tweet_data for model use.
            Args: 
                sentiment_tweet_data: A pandas dataframe of sentiment140 twitter data from kaggle
            Returns:
                A dataframe with features to be used for the model.
    '''
    selected_features = sentiment_tweet_dataframe[['target', 'text']]
    processed_data = selected_features.copy()
    # Remove links and secial characters from the lowercased text
    processed_data['text'] = processed_data['text'].apply(lambda x: preprocess_text(x))
    processed_data = processed_data.dropna()
    processed_data['target'] = processed_data['target'].apply(lambda x: preprocess_value(x))
    return processed_data

In [6]:
# Split data into training, validation, and test data
np.random.seed(42)
training_data, validation_data, test_data = np.split(sentiment_tweet_dataframe, \
    [int(.6*len(sentiment_tweet_dataframe)), int(.8*len(sentiment_tweet_dataframe))])

In [56]:
# Convert processed training, validation, and test data with nan rows removed into tensorflow dataset for input to model
training_dataframe = preprocess_data(training_data)
validation_dataframe = preprocess_data(validation_data)
test_dataframe = preprocess_data(test_data)

In [15]:
# Drop rows with nan values from dataframes
training_dataframe = training_dataframe.dropna()
validation_dataframe = validation_dataframe.dropna()
test_dataframe = test_dataframe.dropna()

In [68]:
# Get tensorflow hub emmbedding layer
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(embedding, input_shape=[], dtype=tf.string, trainable=True)

In [149]:
# Build the model
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
model.summary()

model.compile(optimizer=SGD(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_4 (KerasLayer)   (None, 20)                400020    
_________________________________________________________________
dense_69 (Dense)             (None, 16)                336       
_________________________________________________________________
dense_70 (Dense)             (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [150]:
# Train and test the model
history = model.fit(training_dataframe['text'].values, training_dataframe['target'].values, 
                    validation_data=(validation_dataframe['text'].values, validation_dataframe['target'].values),
                    batch_size=512, epochs=20, verbose=1)
results = model.evaluate(test_dataframe['text'].values, test_dataframe['target'].values, verbose=2)
for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name, value))

Train on 960000 samples, validate on 320000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
320000/1 - 10s - loss: 0.5926 - accuracy: 0.7443
loss: 0.517
accuracy: 0.744


In [151]:
# Save the model
model.save('twitter_sa_model.h5') 