# packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
import matplotlib.pyplot as plt

In [None]:
# import the Python class that we wrote for text pre-processing
from preprocess import preprocess

# PART ONE: data load-in and pre-processing

In [None]:
# Load the raw training file; it is read without a header row.
training = pd.read_csv(
    'sentiment140/training.csv',
    header=None,
    encoding='ISO-8859-1',
)
# Rename the six columns.
training.columns = ['label', 'id', 'time', 'query', 'username', 'text']
# Display the first 5 rows.
training.head(n=5)

In [None]:
# Initialize the text processor: lemmatize and delete stopwords, no stemming.
processor = preprocess(
    lemma=True,
    stem=False,
    delstop=True,
)

In [None]:
# eg 1: print one tweet before and after pre-processing
raw_example = training['text'][0]
print('before:\n', raw_example, '\n-----------\n')
print('after:\n', processor.process(raw_example))

In [None]:
# eg 2: print another tweet before and after pre-processing
raw_example = training['text'][12]
print('before:\n', raw_example, '\n-----------\n')
print('after:\n', processor.process(raw_example))

In [None]:
# Extract 40,000 instances: rows 0-19,999 and rows 800,000-819,999,
# then renumber them 0..39,999.
reduced_data = pd.concat([
    training.iloc[:20000],
    training.iloc[800000:820000],
])
reduced_data.index = range(40000)

In [None]:
# Random splitting: a single stratified 80/20 shuffle split with a fixed seed.
splitter = StratifiedShuffleSplit(
    n_splits=1,
    train_size=0.8,
    test_size=0.2,
    random_state=928,
)

In [None]:
# Flag every row as training ('t') or validation ('v').
reduced_data['status'] = None
status_col = reduced_data.columns.get_loc('status')
for train_index, test_index in splitter.split(reduced_data['text'], reduced_data['label']):
    # `splitter.split` yields positional indices, so write through `.iloc`
    # in a single indexing step. The previous chained form
    # `reduced_data['status'][idx] = ...` assigns into an intermediate Series:
    # it raises SettingWithCopyWarning and does not modify the frame at all
    # when pandas Copy-on-Write is enabled.
    reduced_data.iloc[train_index, status_col] = 't'
    reduced_data.iloc[test_index, status_col] = 'v'

In [None]:
# pre-process: lemma=True, delstop=True
# `progress_apply` exists on a DataFrame only after `tqdm.pandas()` has been
# called, which this notebook never does itself. Use it when it is available
# (keeps the progress bar) and fall back to the plain `apply` otherwise.
row_apply = getattr(reduced_data, 'progress_apply', reduced_data.apply)
reduced_data['lemma&delstop'] = row_apply(lambda row: processor.process(row['text']), axis=1)

In [None]:
# pre-process: lemma=True, delstop=False
processor2 = preprocess(lemma=True, stem=False, delstop=False)
# `progress_apply` exists on a DataFrame only after `tqdm.pandas()` has been
# called, which this notebook never does itself. Use it when it is available
# (keeps the progress bar) and fall back to the plain `apply` otherwise.
row_apply = getattr(reduced_data, 'progress_apply', reduced_data.apply)
reduced_data['lemma'] = row_apply(lambda row: processor2.process(row['text']), axis=1)

In [None]:
# pre-process: lemma=False, delstop=True
processor3 = preprocess(lemma=False, stem=False, delstop=True)
# `progress_apply` exists on a DataFrame only after `tqdm.pandas()` has been
# called, which this notebook never does itself. Use it when it is available
# (keeps the progress bar) and fall back to the plain `apply` otherwise.
row_apply = getattr(reduced_data, 'progress_apply', reduced_data.apply)
reduced_data['delstop'] = row_apply(lambda row: processor3.process(row['text']), axis=1)

In [None]:
# pre-process: lemma=False, delstop=False
processor4 = preprocess(lemma=False, stem=False, delstop=False)
# `progress_apply` exists on a DataFrame only after `tqdm.pandas()` has been
# called, which this notebook never does itself. Use it when it is available
# (keeps the progress bar) and fall back to the plain `apply` otherwise.
row_apply = getattr(reduced_data, 'progress_apply', reduced_data.apply)
reduced_data['none'] = row_apply(lambda row: processor4.process(row['text']), axis=1)

In [None]:
# Save the processed data (the row index is not written).
reduced_data.to_csv(
    path_or_buf='40k_split_processed.csv',
    index=False,
)

# PART TWO: Exploratory analysis

In [None]:
# Load in the pre-processed data.
data = pd.read_csv(filepath_or_buffer='40k_split_processed.csv')

In [None]:
# this is a function for drawing wordclouds,
#  which is customized for our data
from custom_wordcloud import custom_wordcloud as cwc

In [None]:
# Extra uninformative words handed to the wordcloud helper to be dropped,
# which improves the visualization.
drops = [
    'quot',
    'today',
    'day',
    'now',
    'ing',
    ' go ',
    ' got ',
]

In [None]:
# Draw wordclouds for positive and negative tweets side by side.
fig = plt.figure(figsize=(10, 10), dpi=300)

# left panel: positive tweets
ax1 = fig.add_subplot(1, 2, 1)
ax1.imshow(cwc(data=data, label='positive', drop=drops, color='autumn'))
ax1.axis("off")

# right panel: negative tweets
ax2 = fig.add_subplot(1, 2, 2)
ax2.imshow(cwc(data=data, label='negative', drop=drops, color='winter'))
ax2.axis("off")

fig.show()

# PART THREE: Naive Bayes

In [None]:
# import the Class where we implement Naive Bayes from scratch
from naive_bayes import *

In [None]:
# Get training and validation data: rows flagged 't' form the training set,
# rows flagged 'v' the validation set; each is renumbered from 0.
train_set = data.loc[data['status'] == 't'].reset_index(drop=True)
test_set = data.loc[data['status'] == 'v'].reset_index(drop=True)

In [None]:
# The grid to search: pre-processing technique x vocabulary size.
techniques = [
    'lemma&delstop',
    'lemma',
    'delstop',
    'none',
]
vocab_sizes = [100, 500, 1000, 2000, 5000]

In [None]:
# Create a dataframe to store results: one row per vocabulary size, one
# column per pre-processing technique, initialised to 0.
# The shape is derived from the grid itself (it used to be hard-coded as
# (5, 4)), so changing `vocab_sizes` or `techniques` no longer raises a
# length-mismatch error here.
NB_results = pd.DataFrame(
    np.zeros((len(vocab_sizes), len(techniques))),
    index=vocab_sizes,
    columns=techniques,
)

In [None]:
# Initialize one processor per technique. The technique name itself encodes
# the switches: lemmatization is on when the name contains 'lemma', stopword
# deletion when it contains 'delstop'; stemming is always off.
processors = {
    name: preprocess(lemma='lemma' in name, stem=False, delstop='delstop' in name)
    for name in techniques
}

In [None]:
# Grid search over pre-processing technique and vocabulary size.
banner = '=' * 10
is_positive = train_set['label'] == 4
is_negative = train_set['label'] == 0
for i, t in enumerate(techniques):
    model = naive_bayes(5, None) # words that appear no more than five times are discarded
    for j, v in enumerate(vocab_sizes):
        print(banner + t + ' ' + str(v) + banner)
        model.vocab_size = v
        # training: label 4 documents first, label 0 documents second
        model.train(train_set[t][is_positive], train_set[t][is_negative])
        # evaluation on the validation set
        result = model.evaluate(test_set, processors[t], prob=False, roc=False)
        # rows follow `vocab_sizes`, columns follow `techniques`
        NB_results.iloc[j, i] = result['AUC']

In [None]:
# Save the result (the vocabulary-size index is written as the first column).
NB_results.to_csv(path_or_buf='NB_AUC.csv')

# PART FOUR: LSTM

In [None]:
from mxnet.contrib import text
from mxnet import nd, autograd
import collections
from mxnet.gluon import data as gdata,loss as gloss, utils as gutils, nn, rnn
import d2lzh as d2l
from mxnet import init, gluon
import mxnet as mx
import time

In [None]:
# import the functions that we need to train a neural network
from utilities import *

In [None]:
# define the neural network architecture
class lstm(nn.Block):
    """Bidirectional LSTM encoder followed by a feed-forward classifier.

    Parameters
    ----------
    vocab : sized
        Vocabulary; only ``len(vocab)`` is used, to size the embedding table.
    embed_size : int
        Dimension of the word embeddings.
    num_hiddens : int
        Number of hidden units per LSTM direction.
    num_layers_lstm : int
        Number of stacked LSTM layers.
    num_layers_ffn : int
        Number of hidden ``Dense(128, relu)`` layers before the output layer.
    **kwargs
        Forwarded to ``nn.Block``.
    """

    def __init__(self, vocab, embed_size, num_hiddens, num_layers_lstm, num_layers_ffn, **kwargs):
        super(lstm, self).__init__(**kwargs)

        # number of hidden units (per direction); needed to split the
        # bidirectional output in `forward`
        self.num_hiddens = num_hiddens

        # lstm layer
        self.embedding = nn.Embedding(len(vocab), embed_size) # create the projection layer
        self.encoder = rnn.LSTM(hidden_size=num_hiddens, num_layers=num_layers_lstm, bidirectional=True,
                               input_size=embed_size)

        # output layers
        self.ffn = nn.Sequential()
        for _ in range(num_layers_ffn):
            self.ffn.add(nn.Dense(units=128, activation='relu'))
        self.output = nn.Dense(units=2)

    def forward(self, inputs):
        """Return the 2-class scores for a batch of token-index sequences.

        ``inputs`` is transposed before the embedding lookup, i.e. it is
        expected batch-major: (batch_size, #words).
        """
        embeddings = self.embedding(inputs.T) # project token indices to word embeddings
        outputs = self.encoder(embeddings).transpose((1,0,2)) # (#words, batch_size, 2*num_hiddens)->(batch_size,#words,2*num_hiddens)

        # Keep the encoding of the first and last tokens.
        # NOTE(review): assumes the last axis is laid out as
        # [forward | backward], so the upper half at the first token and the
        # lower half at the last token are each direction's final state.
        # Integer indexing already removes the word axis, so no `.squeeze()`
        # is applied: it would also remove the batch axis when batch_size is
        # 1 and make the `dim=1` concatenation fail.
        last = outputs.shape[1] - 1
        first_token = outputs[:, 0, self.num_hiddens:2*self.num_hiddens]
        last_token = outputs[:, last, 0:self.num_hiddens]
        outputs = nd.concat(first_token, last_token, dim=1) # (batch_size, 2*num_hiddens)

        outputs = self.ffn(outputs) # (batch_size, 128)
        outputs = self.output(outputs) # (batch_size, 2)
        return outputs

In [None]:
# Empty results table for the LSTM grid search, one column per recorded field.
results = pd.DataFrame()
for column in ('technique', 'num_hiddens', 'num_layers_lstm',
               'num_layers_ffn', 'epoch', 'AUC'):
    results[column] = None

In [None]:
# Grid search over technique x hidden units x LSTM layers x output layers.
loss = gloss.SoftmaxCrossEntropyLoss()
techniques = ['lemma&delstop', 'lemma', 'delstop', 'none']
nums_hiddens = [64, 128, 256]
nums_layers_lstm = [1, 2]
nums_layers_fnn = [1, 2]
ctx = d2l.try_gpu()

# every architecture, in the same order as three nested loops would visit them
architectures = [
    (h, l1, l2)
    for h in nums_hiddens
    for l1 in nums_layers_lstm
    for l2 in nums_layers_fnn
]
optimizer_params = {'learning_rate': 0.001, 'wd': 0.00001}

for t in techniques:
    # the features depend only on the technique, so build them once per technique
    iter_train, iter_test, emb, vocab = get_features(t)
    for h, l1, l2 in architectures:
        print(t, ' ', h, ' ', l1, ' ', l2)
        ctx.empty_cache()
        net1 = lstm(vocab=vocab, embed_size=300, num_hiddens=h,
                    num_layers_lstm=l1, num_layers_ffn=l2)
        mx.random.seed(2022) # set the random seed to ensure replicability
        net1.initialize(init.Xavier(), ctx=ctx)
        # overwrite the embedding table with the vectors returned by get_features
        net1.embedding.weight.set_data(emb.idx_to_vec)
        trainer = gluon.Trainer(net1.collect_params(), 'adam', optimizer_params)
        train(iter_train, iter_test, net1, trainer, ctx, 10)

In [None]:
# Save the result (the row index is not written).
results.to_csv(path_or_buf='lstm_AUC.csv', index=False)

# PART FIVE: BERT

In [None]:
import gluonnlp as nlp

In [None]:
# Get the vocabulary of BERT; the model returned alongside it is discarded.
_, vocab = nlp.model.get_model(
    name='bert_12_768_12',
    dataset_name='book_corpus_wiki_en_uncased',
    pretrained=True,
    use_pooler=False,
    use_classifier=False,
    use_token_type_embed=False,
    use_decoder=False,
)

In [None]:
# Substitute our special tokens "USERNAME" and "URL" into the vocab by
# pointing each of them at the index of an "[unusedNN]" entry.
index1 = vocab.token_to_idx["[unused10]"]
index2 = vocab.token_to_idx["[unused11]"]
for special_token, slot_index in (('USERNAME', index1), ('URL', index2)):
    vocab.token_to_idx[special_token] = slot_index

In [None]:
# define the model architecture
class BERT(nn.Block):
    """Pre-trained BERT encoder with a small feed-forward classification head.

    Parameters
    ----------
    num_layers_ffn : int
        Number of hidden ``Dense(128, relu)`` layers before the output layer.
    ctx : mxnet.Context, optional
        Device on which the pre-trained encoder is loaded. Defaults to
        ``d2l.try_gpu()``, the same helper the training cells use to choose
        their device (previously hard-coded to ``mx.gpu()``).
    **kwargs
        Forwarded to ``nn.Block``.
    """

    def __init__(self, num_layers_ffn, ctx=None, **kwargs):
        super(BERT, self).__init__(**kwargs)

        if ctx is None:
            ctx = d2l.try_gpu()

        # bert encoder
        self.encoder, _ = nlp.model.get_model(name='bert_12_768_12',
                                              dataset_name='book_corpus_wiki_en_uncased',
                                              pretrained=True,
                                              ctx=ctx,
                                              use_pooler=False,
                                              use_classifier=False,
                                              use_token_type_embed=False,
                                              use_decoder=False,
                                              dropout=0)
        # output layers
        self.ffn = nn.Sequential()
        for _ in range(num_layers_ffn):
            self.ffn.add(nn.Dense(units=128, activation='relu'))
        self.output = nn.Dense(units=2)

    def forward(self, inputs):
        """Return the 2-class scores for a batch of token-index sequences."""
        # All-zero token types, shaped and placed like the batch itself.
        # Taking both from `inputs` removes the dependency on a global
        # `max_len` (only defined in a later cell) and on a hard-coded GPU.
        token_types = nd.zeros(inputs.shape, ctx=inputs.context)
        outputs = self.encoder(inputs, token_types)
        outputs = outputs.mean(axis=1) # average pooling
        outputs = self.ffn(outputs) # (batch_size, 128)
        outputs = self.output(outputs) # (batch_size, 2)
        return outputs

In [None]:
# Empty results table for the BERT grid search, one column per recorded field.
results = pd.DataFrame()
for column in ('technique', 'num_layers_ffn', 'epoch', 'AUC'):
    results[column] = None

In [None]:
# Grid search over technique x number of output layers.
loss = gloss.SoftmaxCrossEntropyLoss()
techniques = ['none', 'lemma&delstop', 'lemma', 'delstop']
nums_layers_fnn = [1, 2]
ctx = d2l.try_gpu()
optimizer_params = {'learning_rate': 0.0001}

for t in techniques:
    # the features depend only on the technique, so build them once per technique
    iter_train, iter_test = get_features_2(t)
    for l2 in nums_layers_fnn:
        print(t, ' ', l2)
        ctx.empty_cache()
        net2 = BERT(num_layers_ffn=l2)
        mx.random.seed(2022) # set the random seed to ensure replicability
        net2.initialize(init.Xavier(), ctx=ctx)
        trainer = gluon.Trainer(net2.collect_params(), 'adam', optimizer_params)
        train(iter_train, iter_test, net2, trainer, ctx, 10)

In [None]:
# Save the result (the row index is not written).
results.to_csv(path_or_buf='BERT_AUC.csv', index=False)

# PART SIX: Visualization of grid search results

## 1. Naive Bayes

In [None]:
# Load the Naive Bayes grid-search results.
AUC_NB = pd.read_csv(filepath_or_buffer='NB_AUC.csv')

In [None]:
# One line per vocabulary size: AUC against pre-processing technique.
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

colors = ['orange', 'green', 'purple', 'blue', 'red']
for i, line_color in enumerate(colors):
    # column 0 holds the vocabulary size, columns 1-4 the AUC per technique
    plt.plot(AUC_NB.iloc[i, 1:5], label=AUC_NB.iloc[i, 0], color=line_color)
plt.legend(title='size of vocabulary')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('nb.png', dpi=200)

## 2. LSTM

In [None]:
# Load the LSTM grid-search results.
AUC_lstm = pd.read_csv(filepath_or_buffer='lstm_AUC.csv')

In [None]:
# 1 lstm layer, 1 output layer
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

technique_order = ['lemma&delstop', 'lemma', 'delstop', 'none']
for i in [64, 128, 256]:
    # rows of this architecture with i hidden units
    index1 = AUC_lstm['num_layers_lstm'] == 1
    index2 = AUC_lstm['num_layers_ffn'] == 1
    index3 = AUC_lstm['num_hiddens'] == i
    sub = AUC_lstm[index1 & index2 & index3]
    # column-wise maximum per technique, in plotting order
    sub = sub.groupby('technique').max().loc[technique_order, :]
    plt.plot(sub.AUC, label=i)
plt.legend(title='number of hidden units', loc='lower right')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('lstm1.png', dpi=200)

In [None]:
# 1 lstm layer, 2 output layers
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

technique_order = ['lemma&delstop', 'lemma', 'delstop', 'none']
for i in [64, 128, 256]:
    # rows of this architecture with i hidden units
    index1 = AUC_lstm['num_layers_lstm'] == 1
    index2 = AUC_lstm['num_layers_ffn'] == 2
    index3 = AUC_lstm['num_hiddens'] == i
    sub = AUC_lstm[index1 & index2 & index3]
    # column-wise maximum per technique, in plotting order
    sub = sub.groupby('technique').max().loc[technique_order, :]
    plt.plot(sub.AUC, label=i)
plt.legend(title='number of hidden units', loc='lower right')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('lstm2.png', dpi=200)

In [None]:
# 2 lstm layers, 1 output layer
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

technique_order = ['lemma&delstop', 'lemma', 'delstop', 'none']
for i in [64, 128, 256]:
    # rows of this architecture with i hidden units
    index1 = AUC_lstm['num_layers_lstm'] == 2
    index2 = AUC_lstm['num_layers_ffn'] == 1
    index3 = AUC_lstm['num_hiddens'] == i
    sub = AUC_lstm[index1 & index2 & index3]
    # column-wise maximum per technique, in plotting order
    sub = sub.groupby('technique').max().loc[technique_order, :]
    plt.plot(sub.AUC, label=i)
plt.legend(title='number of hidden units', loc='lower right')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('lstm3.png', dpi=200)

In [None]:
# 2 lstm layers, 2 output layers
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

technique_order = ['lemma&delstop', 'lemma', 'delstop', 'none']
for i in [64, 128, 256]:
    # rows of this architecture with i hidden units
    index1 = AUC_lstm['num_layers_lstm'] == 2
    index2 = AUC_lstm['num_layers_ffn'] == 2
    index3 = AUC_lstm['num_hiddens'] == i
    sub = AUC_lstm[index1 & index2 & index3]
    # column-wise maximum per technique, in plotting order
    sub = sub.groupby('technique').max().loc[technique_order, :]
    plt.plot(sub.AUC, label=i)
plt.legend(title='number of hidden units', loc='lower right')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('lstm4.png', dpi=200)

## 3. BERT

In [None]:
# Load the BERT grid-search results.
AUC_BERT = pd.read_csv(filepath_or_buffer='BERT_AUC.csv')

In [None]:
# One line per number of output layers: AUC against pre-processing technique.
fig, ax = plt.subplots(figsize=(8, 6))
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

technique_order = ['lemma&delstop', 'lemma', 'delstop', 'none']
for i in [1, 2]:
    sub = AUC_BERT[AUC_BERT['num_layers_ffn'] == i]
    # column-wise maximum per technique, in plotting order
    sub = sub.groupby('technique').max().loc[technique_order, :]
    plt.plot(sub.AUC, label=i)
plt.legend(title='number of output layers', loc='lower right')

tick_labels = ['lemmatization and \nstopword deletion',
               'lemmatization',
               'stopword deletion',
               'neither']
plt.xticks([0, 1, 2, 3], tick_labels, fontsize=10)
plt.xlabel('text pre-processing technique', fontsize=15)
plt.ylabel('AUC', fontsize=15)
plt.savefig('bert.png', dpi=200)

# PART SEVEN: final training and evaluation

## 1. data preparation

In [None]:
# Load the held-out test file; it is read without a header row.
final_test = pd.read_csv(filepath_or_buffer='test.csv', header=None)
# NOTE(review): these names differ from the ones given to the training frame
# ('label','id','time','query','username','text'); only 'label' and 'text'
# are read afterwards in this notebook - confirm the intended schema.
final_test.columns = ['label', 'query', 'time', 'ip', 'username', 'text']
# Only keep positive and negative examples (rows labelled 2 are dropped),
# then renumber the rows from 0.
final_test = final_test[final_test['label'] != 2]
final_test.index = range(len(final_test))

In [None]:
# training set
# split every pre-processed tweet on single spaces and look the tokens up in the vocab
words = [str(tweet).split(' ') for tweet in data['none']]
words_idx = [vocab[tokens] for tokens in words]
# longest tweet, in tokens
max_len = max(len(tokens) for tokens in words)
features_train = nd.array([pad(indices) for indices in words_idx])
# binary target: 1 for label 4, 0 otherwise
labels_train = nd.array([1 if label == 4 else 0 for label in data['label']])

dataset_train = gdata.ArrayDataset(features_train, labels_train)
iter_train = gdata.DataLoader(dataset_train, 256, shuffle=True)

In [None]:
# test set
# pre-process without lemmatization, stemming or stopword deletion
processor = preprocess(lemma=False, stem=False, delstop=False)
final_test['none'] = [processor.process(tweet) for tweet in final_test['text']]

# split on single spaces and look the tokens up in the vocab
words = [str(tweet).split(' ') for tweet in final_test['none']]
words_idx = [vocab[tokens] for tokens in words]
features_test = nd.array([pad(indices) for indices in words_idx])

# binary target: 1 for label 4, 0 otherwise
labels_test = nd.array([1 if label == 4 else 0 for label in final_test['label']])

dataset_test = gdata.ArrayDataset(features_test, labels_test)
iter_test = gdata.DataLoader(dataset_test, 256, shuffle=False)

## 2. model training and evaluation

In [None]:
# Final model: BERT with a single hidden output layer, trained for 10 epochs.
net3 = BERT(num_layers_ffn=1)
mx.random.seed(2022) # set the random seed to ensure replicability
net3.initialize(init.Xavier(), ctx=ctx)
optimizer_params = {'learning_rate': 0.0001}
trainer = gluon.Trainer(net3.collect_params(), 'adam', optimizer_params)
train(iter_train, iter_test, net3, trainer, ctx, 10)