<a href="https://colab.research.google.com/github/RonaldYou/Sigma/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab magic: select the TensorFlow 2.x runtime for this session.
%tensorflow_version 2.x
# Load the TensorBoard notebook extension so the %tensorboard magic used
# later in this notebook is available.
%load_ext tensorboard

In [2]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import io
import datetime, os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.lib.io.tf_record import TFRecordWriter
from tensorflow.python.keras.callbacks import TensorBoard

In [None]:
# Attach Google Drive to the Colab filesystem so the project files stored
# there can be read and written like local files.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
%cd gdrive/My\ Drive/IgnitionHacks/Sigma

In [5]:
# -----------------------------------------------------------
# NLP model trained for sentiment analysis: data loading
#
# 08/22/2020
# Kenneth Ruan
# Ignition Hacks 2020
# -----------------------------------------------------------

# The cleaned dataset is read from the current working directory.
# To pull it straight from GitHub instead, swap the read below for:
#   data_url = 'https://raw.githubusercontent.com/RonaldYou/Sigma/master/clean_data.csv'
#   dataframe = pd.read_csv(data_url)
dataframe = pd.read_csv('clean_data.csv')

# Pull each column of interest out of the dataframe into its own Series
ids, users, tweets, sentiment = (
    dataframe[column] for column in ('ID', 'User', 'text', 'Sentiment')
)

In [6]:
# Fraction of the rows used for training; the remainder is held out for
# validation.
TRAINING_RATIO = 0.8
DATA_LENGTH = len(tweets)
TRAINING_INDEX = int(DATA_LENGTH * TRAINING_RATIO)

# Split: the first TRAINING_INDEX rows train the model, the rest validate it
training_tweets, testing_tweets = tweets[:TRAINING_INDEX], tweets[TRAINING_INDEX:]
training_labels, testing_labels = sentiment[:TRAINING_INDEX], sentiment[TRAINING_INDEX:]

# Every distinct space-delimited token found in the training tweets; the size
# of this set is what the next cell uses as the vocabulary size.
vocab = {word for entry in training_tweets for word in entry.split(" ")}


In [None]:
# Parameters for the Embedding layer in the model
vocab_size = len(vocab)
print(vocab_size)
embedding_dim = 32
MAX_TWEET_LENGTH = 140  # Maximum of 140 words if a tweet is 280 characters

# Constants for the Tokenizer and for padding
OOV_TOKEN = '<OOV>'  # Placeholder for words that are "Outside Of Vocabulary"
PAD_TYPE = 'pre'
TRUNC_TYPE = 'pre'

# -----------------------------------------------------------
# Tokenizer
#
# A neural network consumes numbers, not raw text. The
# Tokenizer assigns every word a number, each tweet becomes the
# sequence of its words' numbers, and the sequences are then
# padded/truncated to MAX_TWEET_LENGTH so they all share one
# shape.
#
# The word-to-number mapping is fitted on the training tweets
# only; the validation tweets are encoded with that same
# mapping.
# -----------------------------------------------------------

tokenizer = Tokenizer(num_words=vocab_size, oov_token=OOV_TOKEN)
tokenizer.fit_on_texts(training_tweets)
word_index = tokenizer.word_index

# Both splits must be padded identically, so the options are defined once
padding_options = dict(maxlen=MAX_TWEET_LENGTH, padding=PAD_TYPE, truncating=TRUNC_TYPE)

training_sequences = tokenizer.texts_to_sequences(training_tweets)
training_padded = pad_sequences(training_sequences, **padding_options)

testing_sequences = tokenizer.texts_to_sequences(testing_tweets)
testing_padded = pad_sequences(testing_sequences, **padding_options)


In [8]:
# Materialise the features and labels of both splits as NumPy arrays for
# TensorFlow 2.x compatibility (the labels arrive here as pandas Series slices).
training_padded, training_labels = np.array(training_padded), np.array(training_labels)
testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

In [9]:
# -----------------------------------------------------------
# Batches
#
# The training split is large (about 800 000 elements), so the
# network is fed in small groups rather than all at once.
#
# TensorFlow's Dataset module provides this as a data
# pipeline: think of a physical pipe that releases elements at
# a controlled rate so the system is never overwhelmed.
#
# Below, the padded tweets are paired with their labels in a
# Dataset, the element order is randomized, and the result is
# grouped into fixed-size batches.
# -----------------------------------------------------------

# -----------------------------------------------------------
# Shuffle buffer and batch size
#
# TF data supports potentially infinite sequences, so
# shuffle() does not shuffle the whole dataset; it maintains a
# buffer of BUFFER_SIZE elements and draws from that.
# -----------------------------------------------------------
BUFFER_SIZE = 10000
BATCH_SIZE = 64

# Datasets are built from tensors, so convert the arrays first
training_padded = tf.convert_to_tensor(training_padded)
training_labels = tf.convert_to_tensor(training_labels)

# One dataset element = (padded tweet, label)
dataset = tf.data.Dataset.from_tensor_slices((training_padded, training_labels))

# Randomize, then batch; drop_remainder=True keeps every batch at exactly
# BATCH_SIZE elements.
shuffled_dataset = dataset.shuffle(BUFFER_SIZE)
data = shuffled_dataset.batch(BATCH_SIZE, drop_remainder=True)


In [10]:
# -----------------------------------------------------------
# Model Architecture
#
# Embedding - Converts word IDs into vectors in a transformed
# space, representing the data more meaningfully than a
# 'Bag of Words' where words are only mapped to an ID.
#
# SpatialDropout1D / Dropout - Dropout layers randomly set
# inputs to 0 during training. The added randomness helps
# prevent overfitting.
#
# LSTM - Long Short-Term Memory is a layer specialized for
# taking the context of a sequence into account, which suits
# NLP because language is very context dependent. It also
# applies its own dropout to its inputs and recurrent state.
#
# Dense - A layer of neurons fully connected to the previous
# layer. Its weights are adjusted during training, as with
# all the other trainable layers. Here it is a single neuron
# producing the final output.
#
# Activation
#
# Sigmoid - Squashes the output to a value between 0.0 and
# 1.0, read as the probability for the final (binary)
# prediction.
# -----------------------------------------------------------

sentiment_layers = [
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=MAX_TWEET_LENGTH),
  tf.keras.layers.SpatialDropout1D(0.25),
  tf.keras.layers.LSTM(50, dropout=0.5, recurrent_dropout=0.3),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(sentiment_layers)

# Binary classification: cross-entropy loss on the sigmoid output, tracked
# with accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [11]:
# Print the layer-by-layer summary of the model (output shapes and parameter
# counts) as a sanity check before training.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 140, 32)           9578368   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 140, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 51        
Total params: 9,595,019
Trainable params: 9,595,019
Non-trainable params: 0
_________________________________________________________________


In [11]:
# -----------------------------------------------------------
# Model Callbacks
#
# Callbacks hook into training to emit the information and
# files that keep a neural network project organized and
# easy to use: TensorBoard logs for monitoring, and weight
# checkpoints so training results are not lost.
# -----------------------------------------------------------

# TensorBoard: stats such as loss and accuracy are logged into a folder under
# logs/ named after the time this cell was run.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)
tensorboard = TensorBoard(log_dir=logdir, update_freq=1250)

# Checkpoints: trained weights are stored in this directory...
checkpoint_dir = r'training_checkpoints/v3'
# ...in files named after the epoch that produced them
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_options = dict(
    filepath=checkpoint_prefix,
    save_weights_only=True,   # weights only, not the full model
    save_freq='epoch',        # one save per epoch
)
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Launch TensorBoard inside the notebook, pointed at the `logs` directory that
# the TensorBoard callback writes its per-run folders into.
%tensorboard --logdir logs # Application used to monitor training progress

In [None]:
# Number of passes over the training data in this run
num_epochs = 2

# Warm-start from the weights of the previous run (v2) when they exist.
# tf.train.latest_checkpoint returns None if the directory holds no
# checkpoint, and passing None to load_weights would crash, so in that case
# training simply starts from the freshly initialised weights.
previous_checkpoint = tf.train.latest_checkpoint('training_checkpoints/v2')
if previous_checkpoint is not None:
  model.load_weights(previous_checkpoint)
else:
  print('No checkpoint found in training_checkpoints/v2; training from scratch.')

# Train on the batched dataset and validate on the held-out split after each
# epoch. The callbacks log progress for TensorBoard and save the weights.
history = model.fit(
    data,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=1,
    callbacks=[tensorboard, checkpoint_callback])

In [12]:
# Re-create the model so the trained weights can be loaded into it. The layer
# stack must stay identical to the one that was trained.
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=MAX_TWEET_LENGTH),
  tf.keras.layers.SpatialDropout1D(0.25),
  tf.keras.layers.LSTM(50, dropout=0.5, recurrent_dropout=0.3),
  tf.keras.layers.Dropout(0.2),
  tf.keras.layers.Dense(1, activation='sigmoid')
])

# Load the weights from the most recent save. latest_checkpoint returns None
# when the directory contains no checkpoint; fail with a clear message instead
# of an obscure error from inside load_weights.
latest_weights = tf.train.latest_checkpoint(checkpoint_dir)
if latest_weights is None:
  raise FileNotFoundError(
      f'No checkpoint found in {checkpoint_dir!r}; train the model before running inference.')
# expect_partial(): this model is not compiled, so not everything stored in
# the checkpoint is restored (presumably the optimizer state).
model.load_weights(latest_weights).expect_partial()
model.build(tf.TensorShape([1, None]))

# METHOD 1: Loading the data from the url
# judge_data_url = 'https://raw.githubusercontent.com/RonaldYou/Sigma/master/clean_jdata.csv'
# judge_dataframe = pd.read_csv(judge_data_url)

# METHOD 2: Loading data from local disk
judge_dataframe = pd.read_csv('clean_jdata.csv')

# Storing the data from the dataframe
judge_ids = judge_dataframe['ID']
judge_users = judge_dataframe['User']
judge_tweets = judge_dataframe['text']

# Encode the judge tweets with the tokenizer fitted on the training tweets and
# pad them exactly like the training data.
judge_sequences = tokenizer.texts_to_sequences(judge_tweets)
judge_padded = pad_sequences(judge_sequences, maxlen=MAX_TWEET_LENGTH,
                             padding=PAD_TYPE, truncating=TRUNC_TYPE)

# Convert into dataset format. The extra axis inserted at position 1 gives
# every dataset element a leading dimension of 1, i.e. each element is a
# single tweet presented as a batch of one.
judge_padded = np.expand_dims(judge_padded, 1)
judge_padded = tf.convert_to_tensor(judge_padded)
judge_dataset = tf.data.Dataset.from_tensor_slices(judge_padded)

In [None]:
# Prepare data from the original csv to format the submission
original_data = pd.read_csv('contestant_judgment.csv')
og_ids = original_data['ID']
og_users = original_data['User']
og_tweets = original_data['Text']

# Perform predictions on the judge dataset.
#
# judge_dataset yields one tweet per element, each with a leading axis of
# size 1, so it already acts as "batches of one". Keras documents that
# `batch_size` must not be passed together with a Dataset (the dataset defines
# its own batching), so the batch size is applied on the dataset itself:
# unbatch() strips the size-1 axis and batch() regroups the tweets.
# `p` still holds one prediction row per tweet, in the original order.
PREDICT_BATCH_SIZE = 32
p = model.predict(judge_dataset.unbatch().batch(PREDICT_BATCH_SIZE), verbose=1)


In [None]:
# Write information to submission.csv
import csv

# One output row per prediction. The count is taken from the prediction array
# itself rather than from len() of the tf.data.Dataset, which is not supported
# by every TensorFlow 2.x release.
row_count = len(p)
if row_count != len(og_ids):
  # The predictions come from the cleaned file while ID/User/Text come from
  # the original file; they are matched purely by position.
  print(f'Warning: {row_count} predictions but {len(og_ids)} rows in the original data; rows may be misaligned.')

with open('submission.csv', 'w', newline='', encoding='utf-8') as newfile:
  csvwriter = csv.writer(newfile)
  csvwriter.writerow(["ID", "User", "Text", "Sentiment"]) #Headers for csv

  for idx in range(row_count):
    # p[idx][0] is the sigmoid output for tweet idx; rounding turns it into
    # the 0/1 sentiment label.
    csvwriter.writerow([og_ids[idx], og_users[idx], og_tweets[idx], int(round(p[idx][0]))])

# Single summary line instead of printing every row index
print(f'Wrote {row_count} rows to submission.csv')
    