## 1. Imports

In [1]:
import os
import pandas as pd
import shutil
import json
import numpy as np
import random
from matplotlib import pyplot as plt
from random import sample
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(f'Tensorflow version {tf.version.VERSION}')

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Iterating over the device list (rather than indexing element 0)
# keeps this cell runnable on CPU-only machines, where the list is empty, and
# applies the same setting to every GPU when several are present.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

# NOTE(review): this variable is set after `import tensorflow` has already run;
# the native logging level is presumably read when the library loads, so this
# may only affect later-spawned processes. Consider moving it above the
# TensorFlow import — TODO confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


Tensorflow version 2.7.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## 2. Data reader

In [3]:
# First dataset: 1.6 million tweets, labelled only positive or negative.
# The file has no header row, so column names are assigned explicitly.
df = pd.read_csv('data/training.1600000.processed.noemoticon.csv', encoding='ISO-8859-1', header=None)
df.columns = ['emotion', 'id', 'timestamp', 'query', 'username', 'tweet']
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result has to be assigned for the unused columns to actually be removed.
df = df.drop(columns=['id', 'query', 'username'])
# Shuffle all rows; the fixed seed makes the shuffled order reproducible
# across kernel restarts.
df_sample = df.sample(frac=1, random_state=42)

# Second dataset: its two columns are renamed to the same tweet/emotion
# vocabulary used above so downstream cells can work with either frame.
df2 = pd.read_csv('data/sentimentData.csv')
df2.columns = ['tweet', 'emotion']
df2.head(10)

Unnamed: 0,tweet,emotion
0,"""Ben Smith / Smith (concussion) remains out of...",1
1,Sorry bout the stream last night I crashed out...,1
2,Chase Headley's RBI double in the 8th inning o...,1
3,@user Alciato: Bee will invest 150 million in ...,2
4,@user LIT MY MUM 'Kerry the louboutins I wonde...,2
5,"""\"""""""" SOUL TRAIN\"""""""" OCT 27 HALLOWEEN SPECIA...",2
6,So disappointed in wwe summerslam! I want to s...,0
7,"""This is the last Sunday w/o football .....,NF...",2
8,@user @user CENA & AJ sitting in a tree K-I-S-...,1
9,@user Well said on HMW. Can you now address wh...,1


## 3. Tokenizer

In [4]:
# --- Configuration -----------------------------------------------------------
dataframe = df2                      # frame supplying the 'tweet' and 'emotion' columns
fraction_df = 1                      # integer divisor applied to both split bounds
len_df = len(dataframe)              # total number of rows
vocab_size = int(5e5)                # upper bound on token ids kept by the tokenizer
training_size = int(0.7 * len_df)    # first 70 % of the rows go to training
max_length = 80                      # every sequence is padded/truncated to this length
embedding_dim = 16                   # NOTE(review): not read anywhere in this cell
trunc_type = 'post'                  # over-long sequences lose tokens from the end

# --- Raw columns as NumPy arrays ---------------------------------------------
# NOTE(review): earlier drafts also carried `emos // 2` and a 'timestamp'
# column; both were disabled and are not needed for `df2`.
tweets = dataframe['tweet'].to_numpy()
emos = dataframe['emotion'].to_numpy()

# --- Sequential train/test split (no shuffling happens here) -----------------
split_at = training_size // fraction_df
stop_at = len_df // fraction_df

train_sentences, test_sentences = tweets[:split_at], tweets[split_at:stop_at]

# One-hot encode the integer labels into 3 classes.
train_emos = to_categorical(emos[:split_at], 3)
test_emos = to_categorical(emos[split_at:stop_at], 3)

# --- Tokenization --------------------------------------------------------------
# The vocabulary is fitted on the training sentences only; words not seen
# during fitting are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index


def _to_padded(sentences):
    """Return (token-id sequences, fixed-length padded matrix) for `sentences`."""
    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_length,
                           padding='post', truncating=trunc_type)
    return sequences, padded


train_sequences, train_padded = _to_padded(train_sentences)
test_sequences, test_padded = _to_padded(test_sentences)

## 4. Model

In [5]:
# Bidirectional-LSTM classifier over the padded token-id sequences.
model = Sequential()
# Learn a 64-dimensional vector for each of the `vocab_size` possible token ids.
model.add(Embedding(vocab_size, 64, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(64, activation='relu'))
# The labels are one-hot vectors over 3 mutually exclusive classes and the loss
# is categorical cross-entropy, so the output layer must produce a probability
# distribution: softmax, not independent per-class sigmoids.
model.add(Dense(3, activation='softmax'))

opt = tf.keras.optimizers.Adam(learning_rate=5e-4)

model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 64)            32000000  
                                                                 
 bidirectional (Bidirectiona  (None, 256)              197632    
 l)                                                              
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dense_1 (Dense)             (None, 3)                 195       
                                                                 
Total params: 32,214,275
Trainable params: 32,214,275
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Fit the model for a fixed number of epochs, passing the held-out split as
# validation data. The returned History object is kept for plotting later.
num_epochs = 10
history = model.fit(
    x=train_padded,
    y=train_emos,
    validation_data=(test_padded, test_emos),
    epochs=num_epochs,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10

KeyboardInterrupt: 

## 5. Graphs

In [None]:
def plot_graphs(history, string):
    """Plot a training metric and, when recorded, its validation counterpart.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (e.g. the value returned by
            ``model.fit``).
        string: Name of the metric to plot, e.g. ``"accuracy"`` or ``"loss"``.
            The validation series is looked up under ``"val_" + string``.

    Raises:
        KeyError: If ``string`` is not a key of ``history.history``.
    """
    curves = history.history
    val_key = 'val_' + string

    plt.plot(curves[string])
    legend_labels = [string]

    # A run without validation data records no ``val_*`` series; draw the
    # training curve alone instead of failing with a KeyError.
    if val_key in curves:
        plt.plot(curves[val_key])
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend_labels)
    plt.show()

# Draw one figure per tracked metric, in the same order as before.
for metric_name in ("accuracy", "loss"):
    plot_graphs(history, metric_name)