In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import SGD
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize 
import preprocessor as p

In [2]:
# Load the raw tweet CSVs with pandas and shuffle their rows reproducibly.
DATASET_TRAIN_COLUMNS = ["id", "target", "text"]
DATASET_TEST_COLUMNS = ["id", "text"]
DATASET_ENCODING = "ISO-8859-1"
# Fixed seed for the shuffle: the train/validation/test split further down is
# purely positional, so an unseeded shuffle would change the split on every run.
SHUFFLE_SEED = 42
# A set gives constant-time membership tests; preprocess_text checks every token against it.
stop_words = set(stopwords.words('english'))

# NOTE(review): `names=` without `header=0` makes pandas treat a header row (if the
# files contain one) as an ordinary data row -- confirm the CSVs are headerless.
sentiment_train_dataframe = pd.read_csv('sentiment-data/train.csv', encoding=DATASET_ENCODING, names=DATASET_TRAIN_COLUMNS)
sentiment_test_dataframe = pd.read_csv('sentiment-data/test.csv', encoding=DATASET_ENCODING, names=DATASET_TEST_COLUMNS)

# sample(frac=1) returns every row exactly once in random order and keeps the original index labels.
sentiment_train_dataframe = sentiment_train_dataframe.sample(frac=1, random_state=SHUFFLE_SEED)
sentiment_test_dataframe = sentiment_test_dataframe.sample(frac=1, random_state=SHUFFLE_SEED)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Peek at the first five rows of the shuffled test frame to sanity-check the load.
sentiment_test_dataframe.head(n=5)

Unnamed: 0,id,text
69791,69802,@AndreaDeneen I dunno.... I just want food lol
140510,140521,@crashpixie I'm hoping it's an announcement th...
241214,241225,@Hotdogmonster67 i wanna come too!
100790,100801,@BeckyDMBR Couldn't decide if I wanted to list...
129881,129892,@Chris_Gorham I'm on pins and needles and cann...


In [4]:
# Matches, in order of preference: @mentions, http/https URLs, then any run of
# characters that are not ASCII letters or digits. Compiled once for reuse.
_TWEET_NOISE_PATTERN = re.compile(r'@\S+|https?:\S+|[^A-Za-z0-9]+')


def preprocess_text(text):
    ''' Normalises one tweet into a space-separated string of lowercase tokens.

        The text is first passed through the `preprocessor` package's clean(),
        then lowercased; @mentions, http/https URLs and every run of characters
        other than ASCII letters and digits are each replaced by a space. The
        result is tokenised, and tokens that are stop words or only a single
        character long are dropped. No stemming is performed.
            Args:
                text: string holding the text of one tweet
            Returns:
                A string of the remaining tokens joined by single spaces
    '''
    text = p.clean(text)
    text = _TWEET_NOISE_PATTERN.sub(' ', text.lower()).strip()
    # stop_words is the module-level English stop-word collection.
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words and len(token) > 1]
    return ' '.join(kept_tokens)

In [5]:
def preprocess_value(value):
    ''' Maps the label 4 to 1 and passes every other label through untouched.
            Args:
                value: a sentiment label; callers are expected to supply 0 or 4
            Returns:
                1 when value equals 4, otherwise value itself
    '''
    return 1 if value == 4 else value

In [6]:
# Applies the helper functions defined earlier to a raw tweet dataframe
def preprocess_data(sentiment_tweet_dataframe, normalize_target=False):
    ''' Prepares features from sentiment_tweet_dataframe for model use.
            Args:
                sentiment_tweet_dataframe: A pandas dataframe that has at least
                    the columns 'target' and 'text'
                normalize_target: when True, every label is passed through
                    preprocess_value (4 becomes 1). Defaults to False, which
                    leaves the 'target' column exactly as it was read.
            Returns:
                A new dataframe holding only 'target' and 'text', with rows that
                had a missing value removed and 'text' run through preprocess_text.
                The input dataframe is not modified.
    '''
    # Drop incomplete rows *before* cleaning so a missing (NaN) tweet is never
    # handed to preprocess_text, which calls .lower() on its argument.
    # .copy() makes the column assignments below act on an independent frame.
    processed_data = sentiment_tweet_dataframe[['target', 'text']].dropna().copy()
    # Strip links and special characters from the lowercased text
    processed_data['text'] = processed_data['text'].apply(preprocess_text)
    if normalize_target:
        processed_data['target'] = processed_data['target'].apply(preprocess_value)
    return processed_data

In [7]:
# Split the already-shuffled training frame 60/20/20 into training, validation, and test data
np.random.seed(42)  # seeds later NumPy randomness only; the positional slicing below is deterministic
TRAIN_END_FRACTION = .6
VALIDATION_END_FRACTION = .8
total_rows = len(sentiment_train_dataframe)
train_end = int(TRAIN_END_FRACTION * total_rows)
validation_end = int(VALIDATION_END_FRACTION * total_rows)
# Plain positional slices always yield DataFrames; np.split on a DataFrame is
# routed through DataFrame.swapaxes, which pandas has deprecated.
training_data = sentiment_train_dataframe.iloc[:train_end]
validation_data = sentiment_train_dataframe.iloc[train_end:validation_end]
test_data = sentiment_train_dataframe.iloc[validation_end:]

In [8]:
# Clean each split with preprocess_data; the results are pandas dataframes with NaN rows removed
training_dataframe, validation_dataframe, test_dataframe = (
    preprocess_data(split_frame)
    for split_frame in (training_data, validation_data, test_data)
)

In [9]:
# Remove any row that still contains a NaN from each processed dataframe
# (preprocess_data already calls dropna itself, so this is a safety net)
training_dataframe, validation_dataframe, test_dataframe = [
    frame.dropna()
    for frame in (training_dataframe, validation_dataframe, test_dataframe)
]

In [10]:
# Load the pretrained text-embedding module from TensorFlow Hub as a Keras layer
# NOTE(review): the OSError recorded under this cell reports a missing SavedModel in
# the local tfhub_modules cache directory -- presumably an incomplete download;
# confirm, then clear that directory before re-running.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],   # each example is a single scalar string
    dtype=tf.string,
    trainable=True,   # the embedding weights are updated during training
)

OSError: SavedModel file does not exist at: /var/folders/9j/x9fj91zs6qlctjpcvcmg1gfm0000gn/T/tfhub_modules/510580b203329a4a95dfdfefd838bdcd202f0d13/{saved_model.pbtxt|saved_model.pb}

In [None]:
# Build the model: hub text embedding -> 16-unit ReLU layer -> single sigmoid output
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.summary()

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that newer
# Keras releases no longer accept.
# NOTE(review): binary_crossentropy with a sigmoid output expects 0/1 labels --
# confirm the 'target' column only holds those values.
model.compile(optimizer=SGD(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit on the training split (validating after every epoch), then score the held-out test split
history = model.fit(
    x=training_dataframe['text'].values,
    y=training_dataframe['target'].values,
    validation_data=(
        validation_dataframe['text'].values,
        validation_dataframe['target'].values,
    ),
    batch_size=512,
    epochs=20,
    verbose=1,
)
results = model.evaluate(
    x=test_dataframe['text'].values,
    y=test_dataframe['target'].values,
    verbose=2,
)
# Report each compiled metric next to its test-set value, to three decimals
for name, value in zip(model.metrics_names, results):
    print("%s: %.3f" % (name, value))

In [None]:
# Persist the trained model to disk as an HDF5 file
model.save(filepath='twitter_sa_model.h5')