In [1]:
#read data with format
import pandas as pd

#math calculation
import numpy as np

#sql
import sqlite3

#clean text
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer

#Sampling split
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split

#Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn import svm

#RNN
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense, Bidirectional
from keras.utils import plot_model
from keras import optimizers
from keras.wrappers.scikit_learn import KerasClassifier

#utils
import math
import time
from tqdm import tqdm #progress bar
import random
import os
from sklearn.metrics import accuracy_score

print("Finish import libraries")

Using TensorFlow backend.


Finish import libraries


In [2]:
#Google Colab
#connect gdrive
from google.colab import drive
from os.path import join
drive.mount('/content/gdrive')
FILEPATH = '/content/gdrive/My Drive/twitter-sentiment-analysis/'

#w2v custom lib
from importlib.machinery import SourceFileLoader
word2vecReaderUtils = SourceFileLoader('word2vecReaderUtils', join(FILEPATH, 'word2vecReaderUtils.py')).load_module()
word2vecReader = SourceFileLoader('word2vecReader', join(FILEPATH, 'word2vecReader.py')).load_module()
from word2vecReader import Word2Vec

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
conn = sqlite3.connect(join(FILEPATH, 'database.sqlite'))
df = pd.read_sql_query("select * from Tweets;", conn)
print("Finish loading dataset")

Finish loading dataset


In [0]:
#CONFIG
#GENERAL
SPLIT_RATIO = 0.2
K_FOLDS = 5
SEED = 6

#Logistic Regression
SOLVER_LR = 'liblinear'
MULTI_CLASS_LR = 'ovr'
MAX_ITER_LR = 500

#SVM
KERNEL_SVM = 'linear'

#RNN
BATCH_SIZE = 64
EPOCHS = 8
INPUT_SIZE = 400 #fixed, number of features in embedding
LEARNING_RATE = 0.01
MOMENTUM_RATE = 0.9
DROPOUT_RATE = 0.25
STEP_PER_EPOCH = 1000

def seed_torch(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


PREPROCESSING DATA

Clean text by remove misspell word, @..., website link, etc

Load Word2Vec model and perform word embedding (skip if dont want to retrain)

In [5]:
df['encode_airline_sentiment'] = (df['airline_sentiment'] != 'negative').astype(int)
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
www_pat = r'www.[^ ]+'
misspell_dict = {"aren't" : "are not",
                "can't" : "cannot",
                "couldn't" : "could not",
                "didn't" : "did not",
                "doesn't" : "does not",
                "don't" : "do not",
                "hadn't" : "had not",
                "hasn't" : "has not",
                "haven't" : "have not",
                "he'd" : "he would",
                "he'll" : "he will",
                "he's" : "he is",
                "i'd" : "I would",
                "i'll" : "I will",
                "i'm" : "I am",
                "isn't" : "is not",
                "it's" : "it is",
                "it'll":"it will",
                "i've" : "I have",
                "let's" : "let us",
                "mightn't" : "might not",
                "mustn't" : "must not",
                "shan't" : "shall not",
                "she'd" : "she would",
                "she'll" : "she will",
                "she's" : "she is",
                "shouldn't" : "should not",
                "that's" : "that is",
                "there's" : "there is",
                "they'd" : "they would",
                "they'll" : "they will",
                "they're" : "they are",
                "they've" : "they have",
                "we'd" : "we would",
                "we're" : "we are",
                "weren't" : "were not",
                "we've" : "we have",
                "what'll" : "what will",
                "what're" : "what are",
                "what's" : "what is",
                "what've" : "what have",
                "where's" : "where is",
                "who'd" : "who would",
                "who'll" : "who will",
                "who're" : "who are",
                "who's" : "who is",
                "who've" : "who have",
                "won't" : "will not",
                "wouldn't" : "would not",
                "you'd" : "you would",
                "you'll" : "you will",
                "you're" : "you are",
                "you've" : "you have",
                "'re": " are",
                "wasn't": "was not",
                "we'll":" will",
                "didn't": "did not",
                "tryin'":"trying"}
misspell_pattern = re.compile(r'\b(' + '|'.join(misspell_dict.keys()) + r')\b')

def text_cleaner(text):
    clean_text = BeautifulSoup(text, 'lxml').get_text()
    clean_text = re.sub(combined_pat, '', clean_text)
    clean_text = re.sub(www_pat, '', clean_text)
    
    lower_case = clean_text.lower()
    misspell_handled = misspell_pattern.sub(lambda x: misspell_dict[x.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", misspell_handled)
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = [x for x  in tok.tokenize(letters_only) if len(x) > 1]
    return (" ".join(words)).strip()

cleaned_data = []

for text in df['text']:
    cleaned_data.append(text_cleaner(text))
    
df['clean_text'] = cleaned_data
df['text_len'] = [len(t) for t in df['clean_text']]
print('Finish cleaning text')

Finish cleaning text


In [0]:
def load_trained_w2v_embedding():
    print("Loading the model, this can take some time...")
    model_embed = Word2Vec.load_word2vec_format(join(FILEPATH, 'word2vec_twitter_model.bin'), binary=True)
    print("The vocabulary size is: "+str(len(model_embed.vocab)))
    model_embed_vocab = list(model_embed.vocab.keys())
    
    return model_embed, model_embed_vocab

In [0]:
def perform_embedding():
    embedded = np.zeros((len(df), INPUT_SIZE))
    i = 0
    for example in df['clean_text']:
        embedded_vector = np.zeros((1, INPUT_SIZE))
        for word in example:
            if word in model_embed_vocab:
                embedded_vector = np.add(embedded_vector, np.asarray(model_embed.__getitem__(word)))
        embedded[i] = embedded_vector
        print('example ' + str(i + 1) + ' finished embedding')
        i+=1
        
    np.save(join(FILEPATH, 'embedded.npy'), embedded)
    print('Finish perform word embedding')

In [8]:
#Load pretrained embedding
model_embed, model_embed_vocab = load_trained_w2v_embedding()


Loading the model, this can take some time...
The vocabulary size is: 3039345


In [0]:
#Perform embedding on own dataset (for LogReg and SVM algorithm)
perform_embedding()

In [0]:
embedding_index = {}
for word in model_embed_vocab:
    embedding_index[word] = np.asarray(model_embed.__getitem__(word))
del model_embed, model_embed_vocab #save ram

Load processed data into input and perfrom prediction

In [0]:
X = np.load(join(FILEPATH, 'embedded.npy'))
y = df['encode_airline_sentiment'].values

folds = StratifiedKFold(n_splits=K_FOLDS, shuffle=True, random_state=SEED)

def evaluate_logistic_regression(X, y, folds):
    clf = LogisticRegression(random_state=SEED, solver=SOLVER_LR,multi_class=MULTI_CLASS_LR, max_iter=MAX_ITER_LR)
    score = cross_val_score(clf, X, y, cv=folds)
    return score

def evaluate_svm(X, y, folds):
    svm_linear = svm.SVC(kernel=KERNEL_SVM, random_state=SEED)
    score = cross_val_score(svm_linear, X, y, cv=folds)
    return score

In [0]:
max_len = -1
for text in df['clean_text']:
    max_len = max(max_len, len(text.split()))

def text_to_array(text):
    empyt_emb = np.zeros(INPUT_SIZE)
    text = text.split()[:max_len]
    embeds = [embedding_index.get(x, empyt_emb) for x in text]
    embeds+= [empyt_emb] * (max_len - len(embeds))
    return np.array(embeds)

def create_model():
    model = Sequential()
    model.add(Bidirectional(CuDNNLSTM(max_len, return_sequences=True),
                            input_shape=(max_len, INPUT_SIZE)))
    model.add(Bidirectional(CuDNNLSTM(max_len)))
    model.add(Dense(1, activation="sigmoid"))
    
    plot_model(model, to_file='model.png')

#     sgd = optimizers.SGD(lr=LEARNING_RATE, momentum=MOMENTUM_RATE, nesterov=True)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    return model

def evaluate_RNN(fold):
    X = np.array([text_to_array(X_text) for X_text in df['clean_text'].values])
    y = df['encode_airline_sentiment'].values
    
    model_wrapper = KerasClassifier(build_fn=create_model,
                                   epochs=EPOCHS,
                                   batch_size=BATCH_SIZE,
                                   verbose=True)
    
    score = cross_val_score(model_wrapper, X, y, cv=fold)
    return score

In [12]:
score = evaluate_logistic_regression(X, y, folds)
print(score)

[0.71532091 0.72049689 0.72868485 0.70614641 0.72790055]


In [13]:
score = evaluate_RNN(folds)
print(score)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
[0.85334714 0.83609386 0.846738   0.82907459 0.83183702]


In [14]:
score = evaluate_svm(X, y, folds)
print(score)

[0.7073844  0.72463768 0.73110114 0.70476519 0.73549724]
