In [1]:
import tensorflow as tf
import tensorflow_hub as hub

import matplotlib.pyplot as plt
import pandas as pd
import re
import numpy as np

In [2]:
data = pd.read_csv('tweets.csv',encoding='latin-1',names=['polarity','id','date','query','username','text'])

In [3]:
data = data.sample(frac = 1,random_state=25) 

In [4]:
data.head()

Unnamed: 0,polarity,id,date,query,username,text
1310518,4,2013329963,Tue Jun 02 22:25:12 PDT 2009,NO_QUERY,JeffreyNam,working add oil
1259386,4,1998107018,Mon Jun 01 18:00:16 PDT 2009,NO_QUERY,NKANGEL74,@KristianaNKOTB you're welcome
897897,4,1693461347,Sun May 03 22:37:21 PDT 2009,NO_QUERY,allyfish87,"is going to bed, work in the morning boo but t..."
148032,0,1882992096,Fri May 22 07:49:50 PDT 2009,NO_QUERY,drcharlii,@sparky_habbo - uni &amp; assignments happened...
743318,0,2266811610,Sun Jun 21 09:08:26 PDT 2009,NO_QUERY,elizuhhbef,Can't wait to have chinese food! Still disappo...


In [5]:
def tweet_cleaner(text):
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    stripped = re.sub(combined_pat, '', text)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    return lower_case.strip()

In [6]:
v_cleaner = np.vectorize(tweet_cleaner)
texts = v_cleaner(data['text'].values)
texts

array(['working add oil', 'you re welcome',
       'is going to bed  work in the morning boo but then gets to see my b',
       ...,
       'waiting around for renders to finish in case there s any tweaks to do  then tomorrow    rhino charge',
       'i m  good  suppose to went to work but plans changed  so just been chilling taking it easy',
       'so i was just nominated by  and you  i don t make the final list of nominees'],
      dtype='<U244')

In [7]:
labels = data['polarity'].values == 4
labels

array([ True,  True,  True, ...,  True,  True, False])

In [8]:
BATCH_SIZE = 128
AUTOTUNE = tf.data.AUTOTUNE
examples = len(data.index)
train_val = int(np.ceil(0.6*examples))
val_test = int(np.ceil(0.8*examples))

In [9]:
train_ds = tf.data.Dataset.from_tensor_slices((texts[:train_val],labels[:train_val])).batch(BATCH_SIZE).prefetch(AUTOTUNE)
val_ds = tf.data.Dataset.from_tensor_slices((texts[train_val:val_test],labels[train_val:val_test])).batch(BATCH_SIZE).prefetch(AUTOTUNE)
test_ds = tf.data.Dataset.from_tensor_slices((texts[val_test:],labels[val_test:])).batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [10]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(5):
    print(f'Review: {text_batch.numpy()[i]}')
    label = label_batch.numpy()[i]
    print(label)

Review: b'working add oil'
True
Review: b'you re welcome'
True
Review: b'is going to bed  work in the morning boo but then gets to see my b'
True
Review: b'habbo   uni  amp  assignments happened  goodnight mr sparkles'
False
Review: b'can t wait to have chinese food  still disappointed ocharleys stopped making jambalaya pasta'
False


In [31]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128-with-normalization/1", output_shape=[128],
                           input_shape=[], dtype=tf.string)









In [36]:
model = tf.keras.Sequential([
    hub_layer,
#     tf.keras.layers.Dense(64, activation='relu'),
#     tf.keras.layers.Dense(128,activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
])

In [37]:
num_epochs = 50
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                                 optimizer=tf.keras.optimizers.Adam(),
                                 metrics=['accuracy'])
model.summary()
# history = model.fit(train_ds, epochs=num_epochs, 
#                       validation_data=val_ds)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_4 (KerasLayer)   (None, 128)               124642688 
Total params: 124,642,688
Trainable params: 0
Non-trainable params: 124,642,688
_________________________________________________________________


In [None]:
dataset_name = 'sentiment140'
saved_model_path = './{}_hub'.format(dataset_name.replace('/', '_'))
model.save(saved_model_path, include_optimizer=False)