In [11]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 
import preprocessor as p

In [12]:
# Load the raw tweet data with pandas, then shuffle the rows reproducibly with numpy
DATASET_TRAIN_COLUMNS = ["Sentiment", "ID", "Date", 'Query', 'Sender', 'SentimentText']
DATASET_ENCODING = "ISO-8859-1"
RANDOM_SEED = 42
stop_words = stopwords.words('english')
# names= supplies the column labels for the csv
sentiment_train_dataframe = pd.read_csv('sentiment-tweet-data.csv', encoding=DATASET_ENCODING, names=DATASET_TRAIN_COLUMNS)

# Seed NumPy *before* drawing the permutation; otherwise every run shuffles the rows
# differently and the train/validation/test splits are not reproducible
np.random.seed(RANDOM_SEED)
sentiment_train_dataframe = sentiment_train_dataframe.reindex(np.random.permutation(sentiment_train_dataframe.index))

In [13]:
# Preview the first rows of the loaded (and shuffled) tweet data
sentiment_train_dataframe.head()

In [14]:
def preprocess_text(text):
    ''' Preprocesses text by removing urls, mentions and special characters, lowercasing
        the text, and removing stop words and single-character tokens.
        No stemming is applied.
            Args:
                text: string of a tweet's text
            Returns:
                A lowercased string of the remaining tokens joined by single spaces
    '''
    # First pass: the preprocessor package's cleaner
    # NOTE(review): presumably tweet-preprocessor, which strips urls, mentions, etc.
    # according to its configured options -- confirm
    text = p.clean(text)
    # Second pass: blank out any remaining mentions and urls, then every run of
    # non-alphanumeric characters. `https?:\S+` covers both http and https links.
    text = re.sub(r'@\S+|https?:\S+|[^A-Za-z0-9]+', ' ', text.lower()).strip()
    # Keep only tokens that are not stop words and are longer than one character
    new_text = [token for token in word_tokenize(text) if token not in stop_words and len(token) > 1]
    return ' '.join(new_text)

In [15]:
def preprocess_value(value):
    ''' Maps a raw sentiment label onto the binary 0/1 scale
            Args:
                value: label to convert, expected to be either 0 or 4
            Returns:
                1 when value equals 4, otherwise value converted with int()
    '''
    return 1 if value == 4 else int(value)

In [16]:
# Uses the above methods to process the data
def preprocess_data(sentiment_tweet_dataframe):
    ''' Prepares features from a sentiment tweet dataframe for model use.
            Args:
                sentiment_tweet_dataframe: A pandas dataframe of sentiment140 twitter data from
                    kaggle containing at least the 'Sentiment' and 'SentimentText' columns
            Returns:
                A new dataframe with 'Sentiment' mapped through preprocess_value (4 becomes 1)
                and 'SentimentText' cleaned by preprocess_text. Rows with missing values are
                dropped; the input dataframe is not modified.
    '''
    selected_features = sentiment_tweet_dataframe[['Sentiment', 'SentimentText']]
    # Drop incomplete rows *before* cleaning: preprocess_text calls string methods on
    # its argument, so a NaN tweet would raise instead of being filtered out
    processed_data = selected_features.dropna().copy()
    # Remove links and special characters from the lowercased text
    processed_data['SentimentText'] = processed_data['SentimentText'].apply(preprocess_text)
    # Map the 0/4 labels onto 0/1, as required by a sigmoid output trained with
    # binary cross-entropy
    processed_data['Sentiment'] = processed_data['Sentiment'].apply(preprocess_value)
    return processed_data

In [17]:
# Split data into training (first 60%), validation (next 20%), and test (last 20%) data
# Seeds NumPy's global RNG for any later NumPy random calls; the slicing below is deterministic
np.random.seed(42)
# Positional slicing gives the same three pieces as np.split, without routing the
# DataFrame through the deprecated DataFrame.swapaxes that np.split relies on
train_end = int(.6*len(sentiment_train_dataframe))
validation_end = int(.8*len(sentiment_train_dataframe))
training_data = sentiment_train_dataframe.iloc[:train_end]
validation_data = sentiment_train_dataframe.iloc[train_end:validation_end]
test_data = sentiment_train_dataframe.iloc[validation_end:]

In [18]:
# Run preprocess_data over each split to get the model-ready training, validation, and test dataframes
training_dataframe, validation_dataframe, test_dataframe = [
    preprocess_data(split) for split in (training_data, validation_data, test_data)
]

In [19]:
# Remove any rows that still contain NaN values from each of the three dataframes
training_dataframe, validation_dataframe, test_dataframe = (
    training_dataframe.dropna(),
    validation_dataframe.dropna(),
    test_dataframe.dropna(),
)

In [20]:
# TensorFlow Hub text-embedding layer, used as the first layer of the model
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding,
    dtype=tf.string,   # the layer is fed raw strings
    input_shape=[],    # one scalar string per example
    trainable=True,    # embedding weights are updated during training
)

In [None]:
# Build the model: hub embedding -> 40-unit ReLU hidden layer -> single sigmoid output
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(40, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

# Binary classification setup: cross-entropy loss on the sigmoid output, tracked with accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit on the training split while monitoring the validation split
history = model.fit(
    x=training_dataframe['SentimentText'].values,
    y=training_dataframe['Sentiment'].values,
    validation_data=(
        validation_dataframe['SentimentText'].values,
        validation_dataframe['Sentiment'].values,
    ),
    epochs=100,
    batch_size=512,
    verbose=1,
)

# Score the trained model on the held-out test split and report every metric
results = model.evaluate(
    x=test_dataframe['SentimentText'].values,
    y=test_dataframe['Sentiment'].values,
    verbose=2,
)
for name, value in zip(model.metrics_names, results):
    print("%s: %.3f" % (name, value))

In [None]:
# Persist the trained model to an HDF5 file in the working directory
# NOTE(review): reloading this file presumably needs
# custom_objects={'KerasLayer': hub.KerasLayer} because of the hub layer -- confirm
model.save(filepath='twitter_sa_model.h5')