# Sentiment Analysis of Airline Tweets with Vanilla RNN

No autograd packages are used in this implementation; all gradients are computed by hand with NumPy.

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import gensim
from gensim import corpora, models, similarities

In [2]:
class parameter:
    """Base container that pairs a parameter value with its gradient.

    Attributes:
        data: the current value of the parameter.
        grad: the gradient stored alongside that value.
    """

    def __init__(self, data, gradient):
        # Both arguments are stored as given; nothing is copied or validated.
        self.data, self.grad = data, gradient


## The RNN Layer

In [3]:
#for simplicity, this RNN cell will be a many-to-one RNN
class RNN_cell:
    """One time step of a vanilla tanh RNN with hand-written backprop.

    The cell computes ``tanh(W_h @ h_t_1 + W_x @ x_t + bias)``. Each of the
    three arrays is wrapped in a ``parameter`` together with a zero gradient
    of the same shape. The arrays are stored by reference, not copied.
    """

    def __init__(self, W_x, W_h, bias):
        """Wrap the given weights and create empty forward/backward caches.

        W_x   -- input-to-hidden weight matrix.
        W_h   -- hidden-to-hidden weight matrix.
        bias  -- bias added to the pre-activation.
        """
        #our goal is to create a graph dynamically
        # Both matrices must produce the same number of output rows.
        assert W_x.shape[0] == W_h.shape[0]
        self.W_x = parameter(W_x, np.zeros(W_x.shape))
        self.W_h = parameter(W_h, np.zeros(W_h.shape))
        self.bias = parameter(bias, np.zeros(bias.shape))

        # Intermediate results of the last forward / backward pass.
        self.forward = {}
        self.backward = {}

    def forward_pass(self, h_t_1, x_t):
        """Compute the new hidden state from the previous state and the input.

        The inputs are remembered because ``backward_pass`` needs them.
        Results are stored in ``self.forward``: 'f1' is the pre-activation
        and 'out' is the new hidden state ``tanh(f1)``.
        """
        self.x_t = x_t
        self.h_t_1 = h_t_1
        self.forward['f1'] = (self.W_h.data @ h_t_1 + self.W_x.data @ x_t) + self.bias.data
        self.forward['out'] = np.tanh(self.forward['f1'])

    def backward_pass(self, in_grad):
        """Backpropagate ``in_grad`` (gradient w.r.t. this cell's output).

        Stores the gradient w.r.t. the previous hidden state in
        ``self.backward['out']`` and overwrites (does not accumulate) the
        ``grad`` of W_x, W_h and bias with this step's contribution.
        """
        # d tanh(z)/dz = 1 - tanh(z)^2; reuse the tanh cached by forward_pass.
        self.backward['t1'] = (1 - self.forward['out']**2) * in_grad
        self.backward['out'] = self.W_h.data.T @ self.backward['t1']

        self.W_x.grad = self.backward['t1'] @ self.x_t.T
        self.W_h.grad = self.backward['t1'] @ self.h_t_1.T
        self.bias.grad = self.backward['t1']

    def update_parameters(self, learning_rate):
        """Take one gradient-descent step, then reset all gradients to zero.

        The updated values are new arrays bound to ``.data``; the arrays
        originally passed to the constructor are not modified in place.
        """
        self.W_x.data = self.W_x.data - learning_rate * self.W_x.grad
        self.W_h.data = self.W_h.data - learning_rate * self.W_h.grad
        self.bias.data = self.bias.data - learning_rate * self.bias.grad

        #zero gradients
        self.W_x.grad = np.zeros(self.W_x.data.shape)
        self.W_h.grad = np.zeros(self.W_h.data.shape)
        # Reset the bias *gradient*; the bias value itself must survive.
        self.bias.grad = np.zeros(self.bias.data.shape)

class linear_layer:
    """Fully connected affine layer computing ``W @ X + b`` with manual backprop.

    The weight and bias live in ``self.M`` under the keys 'W' and 'b', each
    as a ``parameter`` holding the value and an accumulated gradient.
    """

    def __init__(self, in_dim, out_dim):
        """Create randomly initialised weights (scaled normal) and zero gradients."""
        weight_shape = [out_dim, in_dim]
        bias_shape = [out_dim, 1]
        self.M = {}
        # The weight is drawn before the bias, which fixes the order in which
        # the global NumPy random stream is consumed.
        self.M['W'] = parameter(data=0.1 * np.random.normal(size=weight_shape),
                                gradient=np.zeros(weight_shape))
        self.M['b'] = parameter(data=0.1 * np.random.normal(size=bias_shape),
                                gradient=np.zeros(bias_shape))
        self.forward = {}
        self.backward = {}

    def forward_pass(self, X):
        """Store ``W @ X`` under 'f1' and ``W @ X + b`` under 'out'.

        ``X`` is kept on the instance because the backward pass needs it.
        """
        self.X = X
        product = np.matmul(self.M['W'].data, X)
        self.forward['f1'] = product
        self.forward['out'] = product + self.M['b'].data

    def backward_pass(self, in_grad):
        """Backpropagate ``in_grad`` (gradient w.r.t. this layer's output).

        The gradient w.r.t. the input is stored in ``self.backward['out']``.
        Parameter gradients are *added* to the existing ones, so they
        accumulate until ``update_parameters`` resets them.
        """
        weight, offset = self.M['W'], self.M['b']
        self.backward['t2'] = in_grad
        self.backward['out'] = np.matmul(weight.data.T, in_grad)

        weight.grad += np.matmul(in_grad, self.X.T)
        # Sum over the column axis so the result keeps the bias's shape.
        offset.grad += np.sum(in_grad, axis=1, keepdims=True)

    def update_parameters(self, epsilon):
        """Apply one gradient step to every parameter and clear its gradient.

        The accumulated gradient is divided by the number of columns of the
        most recent forward result, so a forward pass must have been run.
        """
        for param in self.M.values():
            assert param.grad.shape == param.data.shape

            n_columns = self.forward['f1'].shape[1]
            param.data = param.data - epsilon * param.grad / n_columns
            #zero the gradients
            param.grad = np.zeros(np.shape(param.grad))

class entropylosswithlogits:
    """Softmax followed by cross-entropy, taking raw scores (logits) as input.

    Classes run along axis 0 and samples along axis 1. Any number of
    classes is supported.
    """

    def __init__(self):
        self.forward = {}
        self.backward = {}
        self.X = 0
        self.y = 0

    def forward_pass(self, X, y):
        """Compute the softmax probabilities and the summed cross-entropy.

        X -- logits, one column per sample.
        y -- target distribution with the same shape as ``X`` (e.g. one-hot).

        Results are stored in ``self.forward``: 'f1' holds the softmax
        probabilities, 'f2' the element-wise ``-y * log(softmax)`` terms and
        'out' their sum over all entries.
        """
        self.X = X
        self.y = y
        # Subtracting the column-wise maximum leaves the softmax unchanged but
        # keeps exp() from overflowing on large logits.
        shifted = X - np.max(X, axis=0, keepdims=True)
        # Work in log space so a probability that underflows to 0 never
        # reaches log(); keepdims lets broadcasting handle any class count.
        log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=0, keepdims=True))
        self.forward['f1'] = np.exp(log_probs)
        self.forward['f2'] = -y * log_probs
        self.forward['out'] = np.sum(self.forward['f2'])

    def backward_pass(self):
        """Store the gradient of the loss w.r.t. the logits in ``self.backward['out']``."""
        # Combined softmax + cross-entropy gradient: probabilities minus targets.
        self.backward['out'] = self.forward['f1'] - self.y

    def update_parameters(self, epsilon):
        """No-op: the loss has no trainable parameters."""
        pass

## Load in our data and preprocess it

In [4]:
# Load the raw tweets into a DataFrame; the path is relative to the working directory.
Tweets = pd.read_csv('twitter-airline-sentiment/Tweets.csv')

In [5]:
# Keep only the three columns used later: airline name, sentiment label and tweet text.
Tweets = Tweets[['airline','airline_sentiment','text']]

In [6]:
# Build the set of airline tokens to strip from tweets: for every distinct
# airline, its lower-cased letters-only name and that name with 'air' appended.
Airlines = set(Tweets['airline'])
list_airlines = set()
for airline in Airlines:
    letters_only_name = re.sub("[^a-zA-Z]","",airline).lower()
    list_airlines.update((letters_only_name, letters_only_name+'air'))

In [7]:
def tweet_to_words(raw_tweet, list_airlines):
    """Return the tweet reduced to its lower-cased, meaningful words.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, and any word that is an English
    stopword or appears in ``list_airlines`` is dropped.

    raw_tweet      -- the tweet text as a string.
    list_airlines  -- iterable of airline tokens to remove.

    Returns the remaining words joined by single spaces.
    """
    # Build one exclusion set per call. Converting list_airlines inside the
    # comprehension would rebuild the set for every single word.
    excluded = set(stopwords.words("english"))
    excluded.update(list_airlines)

    words = re.sub(r"[^a-zA-Z]", " ", raw_tweet).lower().split()
    return " ".join(w for w in words if w not in excluded)

In [8]:
# Add a 'Clean' column holding each tweet after tweet_to_words has removed
# non-letters, stopwords and airline names.
Tweets['Clean'] = Tweets['text'].apply(tweet_to_words, list_airlines = list_airlines)

In [9]:
# Tokenize every cleaned tweet and train 64-dimensional word vectors on the result.
X = Tweets['Clean'].values.tolist()
corpus = [nltk.word_tokenize(tweet) for tweet in X]
# min_count = 1 keeps words that occur only once in the vocabulary.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0;
# newer releases call it `vector_size` -- confirm against the installed version.
model = gensim.models.Word2Vec(corpus, min_count = 1, size = 64)

In [10]:
# Integer class index assigned to each sentiment label.
sentiment_map = {"neutral":0, "positive":1, "negative":2}

def onehot(integer, n):
    """Return an (n, 1) float column vector with a 1 in row ``integer``.

    All other entries are 0.
    """
    column = np.zeros((n, 1))
    # The vector has a single column, so assigning the whole row sets
    # exactly one entry.
    column[integer] = 1
    return column

# Numeric version of the sentiment label, looked up through sentiment_map.
Tweets['airline_sentiment_num'] = Tweets['airline_sentiment'].map(sentiment_map)


## A helper function for clipping gradients

In [11]:
#for gradient clipping
def restrict_to(X,restriction):
    """Clip every entry of ``X`` to the interval [-restriction, restriction].

    X            -- array (or scalar) of values, e.g. a gradient.
    restriction  -- the clipping bound; expected to be non-negative.

    Returns an array of the same shape with each value limited to the bound.
    Infinite entries are mapped to the bound itself. Masking with
    ``value * (X > 0)`` would instead compute ``inf * 0`` and turn an
    exploded gradient into NaN.
    """
    return np.clip(X, -restriction, restriction)


## Training the model

### Note: In this notebook, I train and test on the same dataset because this implementation cannot handle out-of-vocabulary words, i.e. words that did not appear in the training set.

In [12]:
def training(Data, model, epochs, epsilon):
    """Train the many-to-one RNN classifier, updating after every tweet.

    For each row of ``Data`` the cleaned tweet is tokenized and every token's
    word vector is pushed through a freshly created ``RNN_cell`` (all cells
    share the same weight arrays). The final hidden state goes through a
    linear layer and the softmax cross-entropy loss. Gradients then flow back
    through the cells in reverse order, are clipped to [-1, 1] between time
    steps, summed over time steps and applied immediately.

    Data     -- table with 'Clean' and 'airline_sentiment_num' columns, read
                through ``Data.loc[index, column]`` for index 0..rows-1.
    model    -- object exposing word vectors through ``model.wv[word]``;
                presumably a token missing from its vocabulary raises there.
    epochs   -- number of passes over ``Data``.
    epsilon  -- learning rate.

    Returns the tuple ``(FCC, W_h, W_x, bias)``: the trained linear layer and
    the trained recurrent weights. Prints the summed loss once per epoch.
    """
    # Hidden state and word vectors are both 64-dimensional. The draw order
    # (W_h, W_x, bias, then the linear layer) decides which random numbers
    # each parameter receives.
    W_h = 0.1*np.random.normal(size = (64,64))
    W_x = 0.1*np.random.normal(size = (64,64))
    bias = 0.1*np.random.normal(size = (64,1))

    num_examples = Data.shape[0]

    # Fully connected layer mapping the last hidden state to 3 class scores.
    FCC = linear_layer(in_dim= 64,out_dim=3)
    # Softmax cross-entropy on those scores.
    Loss = entropylosswithlogits()

    for epoch in range(epochs):
        loss = 0
        for index in range(num_examples):
            Sentence = nltk.word_tokenize(Data.loc[index,'Clean'].lower())
            label = onehot(Data.loc[index,'airline_sentiment_num'],3)

            # ---- forward pass: unroll one cell per token -------------------
            # The hidden state starts as a vector of zeros.
            h_t = np.zeros((64,1))
            cells = []
            for word in Sentence:
                word_rep = model.wv[word][:,None]
                cell = RNN_cell(W_h=W_h, W_x=W_x, bias=bias)
                cell.forward_pass(h_t_1 = h_t, x_t = word_rep)
                cells.append(cell)
                h_t = cell.forward['out']

            FCC.forward_pass(h_t)
            Loss.forward_pass(FCC.forward['out'], y = label)

            # ---- backward pass: walk the cells from last to first ----------
            Loss.backward_pass()
            FCC.backward_pass(Loss.backward['out'])
            in_grad = FCC.backward['out']

            # Every cell shares the same weights, so the per-step gradients
            # are summed before a single update is applied.
            W_x_grad = W_h_grad = bias_grad = 0
            for cell in reversed(cells):
                cell.backward_pass(in_grad)
                # Clip the gradient handed to the previous time step.
                in_grad = restrict_to(cell.backward['out'],1)
                W_x_grad += cell.W_x.grad
                W_h_grad += cell.W_h.grad
                bias_grad += cell.bias.grad

            # In-place updates, so the shared arrays keep their identity.
            W_x -= epsilon * W_x_grad
            W_h -= epsilon * W_h_grad
            bias -= epsilon * bias_grad

            FCC.update_parameters(epsilon)
            # Accumulate the loss for the per-epoch report.
            loss += Loss.forward['out']
        print("epoch = ",epoch, " loss = ", loss)
    return (FCC, W_h, W_x, bias)
        

# Train for three passes over the tweets with a learning rate of 0.004.
epochs = 3
FCC, W_h, W_x, bias = training(Tweets,model,epochs, 0.004)

epoch =  0  loss =  11940.348019767858
epoch =  1  loss =  11704.74883545681
epoch =  2  loss =  11576.172000177667


## Testing the accuracy of the trained model

In [14]:
def test(W_h, W_x, bias, FCC, Data):
    """Print the classification accuracy of the trained network on ``Data``.

    Each cleaned tweet is run through the RNN exactly as in training (one
    ``RNN_cell`` per token, starting from a zero hidden state), and the class
    with the highest score from ``FCC`` is compared with the labelled class.

    W_h, W_x, bias -- trained recurrent weights.
    FCC            -- trained linear layer.
    Data           -- table with 'Clean' and 'airline_sentiment_num' columns,
                      read through ``Data.loc[index, column]``.

    Word vectors come from the module-level ``model``, not from a parameter.
    Nothing is returned; the accuracy is printed.
    """
    num_examples = Data.shape[0]
    score = 0
    for index in range(num_examples):
        Sentence = nltk.word_tokenize(Data.loc[index,'Clean'].lower())
        label = onehot(Data.loc[index,'airline_sentiment_num'],3)

        # Forward pass only: fold the tokens into the hidden state.
        h_t = np.zeros((64,1))
        for word in Sentence:
            word_rep = model.wv[word][:,None]
            cell = RNN_cell(W_h=W_h, W_x=W_x, bias=bias)
            cell.forward_pass(h_t_1 = h_t, x_t = word_rep)
            h_t = cell.forward['out']

        FCC.forward_pass(h_t)
        pred = np.argmax(FCC.forward['out'].squeeze() )
        actual = np.argmax(label.squeeze())

        # Count one hit whenever the predicted class equals the labelled one.
        score += (actual == pred)
    print('accuracy = ', score/num_examples)
# Evaluate on the same tweets that were used for training.
test(W_h, W_x, bias, FCC, Tweets)

accuracy =  0.6601775956284153
