## Imports

In [1]:
############### Imports ###############

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import random
import time
import numpy as np
import pandas as pd
import datetime
from pathlib import Path
from functions import clean
import matplotlib.pyplot as plt

### Preprocessing ###
import nltk
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.lancaster import LancasterStemmer

### Tensorflow ###
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

### API ###
import requests
import base64
import tweepy

In [2]:
%%capture
# The cell magic above must stay on the first line; it swallows this cell's output,
# including whatever the bare `tqdm()` instance created below would render.
from tqdm.notebook import tqdm as tqdm
# Enables tqdm's pandas integration (e.g. `progress_apply`) — per the tqdm documentation.
tqdm().pandas()

In [3]:
# (Re)load the TensorBoard notebook extension so the `%tensorboard` magic used at the
# end of the notebook is available.
%reload_ext tensorboard
# Restrict the TensorFlow Python logger to messages of level ERROR.
tf.get_logger().setLevel('ERROR')

## Pipeline Parameters

In [4]:
# --- Dataset sizing ---------------------------------------------------------
# Number of tweets taken from the dataset.
datasetSize = 1600000
# Fractions of the selected tweets used for training and for testing.
trainingPart, testingPart = 0.8, 0.2
# Position at which the (shuffled) data is cut: rows before it are used for
# training, rows from it onwards for testing.
split = int(trainingPart * datasetSize)

# Placeholder; later cells build a fitted tokenizer only while this is still None.
tokenizer = None

# --- Embedding / sequence settings -------------------------------------------
# Width of one word vector (glove6b100).
embedding_dim = 100
# Maximum length of a tweet, in tokens.
max_length = 20
# Tweets longer than `max_length` are cut at the end.
trunc_type = 'post'
# Tweets shorter than `max_length` get zeros appended at the end.
padding_type = 'post'
# Token meant to stand in for unseen words.
oov_tok = "<OOV>"

## Reading Dataset

In [5]:
def read_dataset(cleaned_path="./data/cleaned_sentiment.csv",
                 raw_path="./data/training.1600000.processed.noemoticon.csv"):
    """Load the tweet sentiment data as a DataFrame with `Text` and `Label` columns.

    If the cached, already-cleaned CSV exists it is returned (with missing
    values replaced by empty strings). Otherwise the raw CSV is read, reduced
    to the `Label` and `Text` columns, and label 4 is remapped to 1.

    Args:
        cleaned_path: Location of the cached cleaned CSV.
        raw_path: Location of the raw CSV, used when the cache is absent.

    Returns:
        pandas.DataFrame containing at least the columns `Text` and `Label`.
    """
    if Path(cleaned_path).is_file():
        df = pd.read_csv(cleaned_path, encoding='latin')
        # Empty text fields are read back as NaN; turn them into empty strings again.
        df = df.replace(np.nan, '', regex=True)
    else:
        # The raw file carries no header row. Without `header=None` pandas would
        # consume the first tweet as column names and silently drop it.
        df = pd.read_csv(
            raw_path,
            encoding='latin',
            header=None,
            names=['Label', 'Id', 'Date', 'Query', 'Name', 'Text'],
        )
        df = df.drop(columns=['Date', 'Query', 'Name', 'Id'])
        df['Label'] = df['Label'].replace(4, 1)

    return df

## Data Cleaning

In [6]:
def _interleave_both_ends(texts, labels, pair_count, progress=None):
    """Pick `pair_count` rows from the start and `pair_count` rows from the end, alternating."""
    last = len(texts) - 1
    selected = []
    for i in range(pair_count):
        selected.append([texts[i], labels[i]])
        selected.append([texts[last - i], labels[last - i]])
        if progress is not None:
            progress.update(1)
    return selected


def clean_dataset(df):
    """Select `datasetSize` tweets from `df` and return them as `[text, label]` pairs.

    Rows are taken alternately from the beginning and the end of `df`; the
    dataset is sorted by label, so this puts both labels in the subset.

    When ./data/cleaned_sentiment.csv does not exist yet, every selected text is
    passed through `clean` and the result is written to that file. When it
    exists, `df` is assumed to hold already-cleaned text and is only re-selected.

    Compared with the previous version, the two unused tokenizer objects and
    the debug loop that printed non-string entries are gone.

    Args:
        df: DataFrame with `Text` and `Label` columns.

    Returns:
        list of `[text, label]` lists.
    """
    texts = df['Text'].to_numpy()
    labels = df['Label'].to_numpy()
    pair_count = int(datasetSize / 2)

    if Path("./data/cleaned_sentiment.csv").is_file():
        # Cached data is already clean: selection only.
        return _interleave_both_ends(texts, labels, pair_count)

    print("Replacing Contractions and Clean Data :")
    # One tick per selected pair plus one tick per cleaned tweet.
    with tqdm(total=3 * pair_count) as pbar:
        selected = _interleave_both_ends(texts, labels, pair_count, progress=pbar)
        corpus = []
        for text, label in selected:
            corpus.append([clean(text), label])
            pbar.update(1)

    cleaned_df = pd.DataFrame(corpus, columns=['Text', 'Label'])
    cleaned_df.to_csv('./data/cleaned_sentiment.csv', index=True)

    return corpus
        

## Dataset Split

In [7]:
def create_padded_sequences(corpus, seed=None):
    """Shuffle the corpus, fit a Keras tokenizer on it and return padded id sequences.

    Args:
        corpus: list of `[text, label]` pairs. It is no longer reordered in
            place; a copy is shuffled instead.
        seed: optional seed for the shuffle. With the default `None` the
            global `random` state is used, as before. Pass a fixed value to get
            the same ordering on every run.

    Returns:
        Tuple `(padded, labels, tokenizer, word_index, vocab_size)` where
        `padded` is the output of `pad_sequences`, `labels` is a list aligned
        with it, `tokenizer` is the fitted tokenizer, `word_index` its
        word-to-id mapping and `vocab_size` the number of entries in that mapping.
    """
    # Work on a copy so the caller's list keeps its original order.
    shuffled = list(corpus)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    # At most `datasetSize` rows; a shorter corpus is used as-is instead of
    # failing with an IndexError.
    selected = shuffled[:datasetSize]
    sentences = [row[0] for row in selected]
    labels = [row[1] for row in selected]

    # NOTE(review): the module-level `oov_tok` is not passed to the tokenizer.
    # Left unchanged because adding it would alter the word ids.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)

    word_index = tokenizer.word_index
    vocab_size = len(word_index)

    sequences = tokenizer.texts_to_sequences(sentences)
    padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

    return padded, labels, tokenizer, word_index, vocab_size

def split_dataset(padded, labels, split):
    """Cut the sequences and their labels into a training and a test part.

    Everything before position `split` is the training part; everything from
    `split` onwards is the test part.

    Returns:
        Tuple `(training_sequences, training_labels, test_sequences, test_labels)`.
    """
    head_sequences, tail_sequences = padded[:split], padded[split:]
    head_labels, tail_labels = labels[:split], labels[split:]

    return head_sequences, head_labels, tail_sequences, tail_labels


## Import GloVe 100 Dimensions Embedding sequences

In [8]:
def create_embeddings_matrix(word_index=None, vocab_size=None, embedding_dim=None,
                             glove_path='./data/glove.6B.100d.txt'):
    """Build the embedding matrix for the tokenizer vocabulary from a GloVe text file.

    Row `i` holds the GloVe vector of the word whose id is `i` in `word_index`.
    Rows for words absent from the GloVe file (and row 0) stay all-zero.

    All arguments are optional, so the existing zero-argument call still works:
    anything left as `None` is taken from the notebook-level variable of the
    same name, as the function did before.

    Args:
        word_index: mapping word -> integer id.
        vocab_size: number of words; the matrix gets `vocab_size + 1` rows.
            Defaults to `len(word_index)` when `word_index` is passed explicitly.
        embedding_dim: width of one word vector.
        glove_path: path of the GloVe text file (one word followed by its
            coefficients per line).

    Returns:
        numpy array of shape `(vocab_size + 1, embedding_dim)`.
    """
    notebook_globals = globals()
    if word_index is None:
        word_index = notebook_globals['word_index']
        if vocab_size is None:
            vocab_size = notebook_globals['vocab_size']
    if vocab_size is None:
        vocab_size = len(word_index)
    if embedding_dim is None:
        embedding_dim = notebook_globals['embedding_dim']

    embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))

    # Explicit UTF-8: the platform default encoding may fail on, or mis-decode,
    # non-ASCII tokens in the embedding file.
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            # Only vocabulary words are needed, so other lines are skipped
            # before their coefficients are parsed.
            row = word_index.get(values[0])
            if row is not None:
                embeddings_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embeddings_matrix

## Training model

In [9]:
def train_baseline_model(training_sequences, training_labels, test_sequences, test_labels, embeddings_matrix, vocab_size, num_epochs):
    """Build, fit and save the SimpleRNN baseline classifier.

    The network is a non-trainable embedding initialised from
    `embeddings_matrix`, a 128-unit SimpleRNN, a 64-unit ReLU layer and a
    single sigmoid output. Weights found at ./data/baseline_weights.h5 are
    loaded before fitting and the file is rewritten afterwards. The test split
    is passed to `fit` as validation data, and a TensorBoard run is written
    under ./logs.

    Returns:
        Tuple `(baseline_model, baseline_history)`: the fitted model and the
        value returned by `fit`.
    """
    weights_file = "./data/baseline_weights.h5"

    layer_stack = [
        tf.keras.layers.Embedding(
            vocab_size + 1,
            embedding_dim,
            input_length=max_length,
            weights=[embeddings_matrix],
            trainable=False,
        ),
        tf.keras.layers.SimpleRNN(128),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    baseline_model = tf.keras.Sequential(layer_stack)

    # Continue from the weights of an earlier run when they are available.
    if Path(weights_file).is_file():
        baseline_model.load_weights(weights_file)

    baseline_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    baseline_model.summary()

    x_train = np.array(training_sequences)
    y_train = np.array(training_labels)
    x_val = np.array(test_sequences)
    y_val = np.array(test_labels)

    # One timestamped TensorBoard directory per call.
    log_root = "./logs"
    os.makedirs(log_root, exist_ok=True)
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(log_root, "baseline-" + run_stamp)
    board = tf.keras.callbacks.TensorBoard(run_dir, histogram_freq=1)

    baseline_history = baseline_model.fit(
        x_train,
        y_train,
        epochs=num_epochs,
        validation_data=(x_val, y_val),
        batch_size=512,
        verbose=1,
        callbacks=[board],
    )

    print("Training Complete")

    baseline_model.save_weights(weights_file, True)

    return baseline_model, baseline_history

def train_model(training_sequences, training_labels, test_sequences, test_labels, embeddings_matrix, vocab_size, num_epochs):
    """Build, fit and save the stacked bidirectional LSTM classifier.

    The network is a non-trainable embedding initialised from
    `embeddings_matrix`, dropout of 0.5, a bidirectional 64-unit LSTM that
    returns sequences, a bidirectional 128-unit LSTM, a 64-unit ReLU layer with
    L2 regularisation and a single sigmoid output. Weights found at
    ./data/myWeights.h5 are loaded before fitting and the file is rewritten
    afterwards. The test split is passed to `fit` as validation data, and a
    TensorBoard run is written under ./logs.

    Returns:
        Tuple `(model, history)`: the fitted model and the value returned by `fit`.
    """
    weights_file = "./data/myWeights.h5"

    layer_stack = [
        tf.keras.layers.Embedding(
            vocab_size + 1,
            embedding_dim,
            input_length=max_length,
            weights=[embeddings_matrix],
            trainable=False,
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Bidirectional(LSTM(units=64, return_sequences=True)),
        tf.keras.layers.Bidirectional(LSTM(units=128)),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential(layer_stack)

    # Continue from the weights of an earlier run when they are available.
    if Path(weights_file).is_file():
        model.load_weights(weights_file)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    x_train = np.array(training_sequences)
    y_train = np.array(training_labels)
    x_val = np.array(test_sequences)
    y_val = np.array(test_labels)

    # One timestamped TensorBoard directory per call.
    log_root = "./logs"
    os.makedirs(log_root, exist_ok=True)
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_dir = os.path.join(log_root, "LSTM-" + run_stamp)
    board = tf.keras.callbacks.TensorBoard(run_dir, histogram_freq=1)

    history = model.fit(
        x_train,
        y_train,
        epochs=num_epochs,
        validation_data=(x_val, y_val),
        batch_size=512,
        verbose=1,
        callbacks=[board],
    )
    print("Training Complete")

    model.save_weights(weights_file, True)

    return model, history


## Training

In [21]:
# End-to-end run: load -> select/clean -> tokenise and pad -> split -> embeddings -> train.
df = read_dataset()
corpus = clean_dataset(df)
# This call also sets the notebook-level `tokenizer`, `word_index` and `vocab_size`;
# `create_embeddings_matrix()` below reads `word_index` and `vocab_size` from there.
padded, labels, tokenizer, word_index, vocab_size = create_padded_sequences(corpus)
training_sequences, training_labels, test_sequences, test_labels  = split_dataset(padded, labels, split)
# `embeddings_matrix` is also read as a global by `pipeline`.
embeddings_matrix = create_embeddings_matrix()
# The last argument of both training calls is the number of epochs.
model, history = train_model(training_sequences, training_labels, test_sequences, test_labels, embeddings_matrix, vocab_size, 1)
baseline_model, baseline_history = train_baseline_model(training_sequences, training_labels, test_sequences, test_labels, embeddings_matrix, vocab_size, 1)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 20, 100)           41577900  
_________________________________________________________________
dropout_3 (Dropout)          (None, 20, 100)           0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 20, 128)           84480     
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_6 (Dense)              (None, 64)                16448     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 41,942,061
Trainable params: 364,161
Non-trainable params: 41,577,900
____________________________________

## Pipeline Function

In [10]:
def create_tokenizer():
    """Rebuild the fitted tokenizer from the dataset without training a model.

    Runs the load, clean and tokenise steps and keeps only the tokenizer
    artefacts.

    Returns:
        Tuple `(tokenizer, vocab_size, word_index)` — note that the order
        differs from the one returned by `create_padded_sequences`.
    """
    corpus = clean_dataset(read_dataset())
    # The padded sequences and the labels are not needed here.
    _, _, tokenizer, word_index, vocab_size = create_padded_sequences(corpus)
    return tokenizer, vocab_size, word_index

def pipeline(s, tokenizer, vocab_size):
    """Classify the sentiment of one raw sentence and print the verdict.

    The Keras session is cleared, the bidirectional LSTM network is rebuilt
    (using the notebook-level `embeddings_matrix`), weights from
    ./data/myWeights.h5 are loaded when that file exists, and the cleaned,
    tokenised and padded sentence is scored.

    The score maps to a label as follows: above 0.9 "Very Positive", above 0.7
    "Positive", below 0.1 "Very Negative", below 0.3 "Negative", anything else
    "Neutral".

    Args:
        s: raw sentence.
        tokenizer: fitted tokenizer providing `texts_to_sequences`.
        vocab_size: vocabulary size; the embedding has `vocab_size + 1` rows.

    Returns:
        None. The sentence, its label and the raw score are printed.
    """
    weights_file = "./data/myWeights.h5"

    tf.keras.backend.clear_session()

    layer_stack = [
        tf.keras.layers.Embedding(
            vocab_size + 1,
            embedding_dim,
            input_length=max_length,
            weights=[embeddings_matrix],
            trainable=False,
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Bidirectional(LSTM(units=64, return_sequences=True)),
        tf.keras.layers.Bidirectional(LSTM(units=128)),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential(layer_stack)

    if Path(weights_file).is_file():
        model.load_weights(weights_file)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Same preparation as for the training data: clean, map to ids, pad/truncate.
    encoded = tokenizer.texts_to_sequences([clean(s)])
    model_input = np.asarray(
        pad_sequences(encoded, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    )

    val = model.predict(model_input)[0][0]

    # Positive side first, then negative side; everything in between is neutral.
    if val > 0.7:
        res = "Very Positive" if val > 0.9 else "Positive"
    elif val < 0.3:
        res = "Very Negative" if val < 0.1 else "Negative"
    else:
        res = "Neutral"

    print(s, ":", res, val)

In [11]:
### Testing the Pipeline ###

# `tokenizer` is still None when only the parameter cell has run (no training cell);
# in that case rebuild the tokenizer artefacts and the embedding matrix first.
if tokenizer is None:
    tokenizer, vocab_size, word_index = create_tokenizer()
    embeddings_matrix = create_embeddings_matrix()

# Each call prints "<sentence> : <label> <score>".
pipeline("You were the chosen one, you were supposed to destroy siths, not join them !", tokenizer, vocab_size)
pipeline("I hate you !!!", tokenizer, vocab_size)
pipeline("You were my brother Anakin, I loved you", tokenizer, vocab_size)

You were the chosen one, you were supposed to destroy siths, not join them ! : Neutral 0.6172293
I hate you !!! : Very Negative 0.073474854
You were my brother Anakin, I loved you : Positive 0.7988903


## Full Pipeline With API Call

In [22]:
# Twitter credentials come from the environment so that no secret is stored in the
# notebook. The keys that used to be hard-coded here must be treated as compromised
# and revoked. A missing variable raises KeyError naming the variable.
api_key = os.environ['TWITTER_API_KEY']
api_secret_key = os.environ['TWITTER_API_SECRET_KEY']
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_secret_token = os.environ['TWITTER_ACCESS_SECRET_TOKEN']

auth = tweepy.OAuthHandler(api_key, api_secret_key)
auth.set_access_token(access_token, access_secret_token)

api = tweepy.API(auth)
# Resolve the place id for the USA, then search tweets from that place.
places = api.geo_search(query="USA", granularity="country")
place_id = places[0].id

tweets = api.search(q="place:%s" % place_id)

# Rebuild the tokenizer artefacts when no earlier cell has produced them.
if tokenizer is None:
    tokenizer, vocab_size, word_index = create_tokenizer()
    embeddings_matrix = create_embeddings_matrix()

for tweet in tweets:
    # `pipeline` prints its own result and returns None, so wrapping it in print()
    # only added a stray "None" line after every tweet.
    pipeline(tweet.text, tokenizer, vocab_size)

@keithtaylor21 Anthony Epps was the best! I’m from Marion County so I’m biased! : Positive 0.7533767
None
Out of all this I still be feeling alone sometimes : Very Negative 0.042525433
None
@blueheartedly https://t.co/Ipvojw76Iy : Neutral 0.54378545
None
We all need this reminder https://t.co/pS6UdihmxR : Neutral 0.3880049
None
My buddy! https://t.co/Uo1frewITE : Neutral 0.44178596
None
I think I turned into @experiment_719!!! I have a lot of Starbucks cups coming in the mail soon. 💚✨ : Neutral 0.47046798
None
deter and maybe apprehend some of these miscreants. Some operators have bad attitudes and need to check that cause… https://t.co/iAKVlPepDH : Neutral 0.3751005
None
@BriannamThe Coming from you, gorgeous 🥰 : Very Positive 0.9610776
None
Thank you @Cook4Rep for not placing politics before MO kids and MO public schools. https://t.co/zMZR2Intkw : Very Positive 0.98375225
None


In [13]:
# Open TensorBoard inline on the directory the training functions write their runs to.
%tensorboard --logdir "./logs"