***Warning: Run this notebook on Google Colab (Pro preferred), not in a local environment — it imports `google.colab` and reads its data from a mounted Google Drive.***

In [1]:
# Import packages.
import warnings
warnings.filterwarnings('ignore')
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"
import re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']

from sklearn.manifold import TSNE
from gensim.parsing.preprocessing import preprocess_string
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [2]:
# Attach Google Drive to the Colab filesystem so the shared-drive assets can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Folder holding the project assets on the mounted shared drive.
# The path is absolute (it starts at the '/content/gdrive' mount point) so that
# loading and saving do not depend on the kernel's current working directory.
root_dir = '/content/gdrive/Shareddrives/MADS-Capstone-haizhou/assets/'

# Load the train / dev / test splits, one CSV file per split.
df_train = pd.read_csv(root_dir + 'df_train.csv')
df_dev = pd.read_csv(root_dir + 'df_dev.csv')
df_test = pd.read_csv(root_dir + 'df_test.csv')

In [4]:
# Normalise the tweets with gensim's `preprocess_string` (default filters) and
# re-join the returned tokens into one space-separated string per tweet.
def _normalise_tweet(text):
    """Return `text` passed through `preprocess_string`, joined back with single spaces."""
    return " ".join(preprocess_string(text))

# The column is overwritten in place, so re-running this cell preprocesses the
# already-preprocessed text again.
for frame in (df_train, df_dev, df_test):
    frame['tweet_text'] = frame['tweet_text'].apply(_normalise_tweet)

# Encode class labels as integer ids, numbered in the order the labels first
# appear in the training split. The mapping is built from df_train only, so a
# label that occurs only in dev/test is mapped to NaN by `Series.map`.
unique_labels = df_train['class_label'].unique()
label_ids = np.arange(len(unique_labels))
dict_label = dict(zip(unique_labels, label_ids))
y_train, y_dev, y_test = (
    frame['class_label'].map(dict_label) for frame in (df_train, df_dev, df_test)
)

In [5]:
# Cast every tweet to `str` and prefix it with a single space before tokenization.
# The column is overwritten in place, so each re-run of this cell adds one more
# leading space.
for frame in (df_train, df_dev, df_test):
    frame['tweet_text'] = ' ' + frame['tweet_text'].astype(str)

# Text features of each split, as consumed by the tokenizer.
X_train, X_dev, X_test = (
    df_train['tweet_text'],
    df_dev['tweet_text'],
    df_test['tweet_text'],
)

In [6]:
# Fit a word -> index vocabulary on the training tweets only; dev and test are
# encoded with that same vocabulary. `vocab_size` caps the vocabulary via
# `num_words`, and '<OOV>' is the placeholder token for out-of-vocabulary words.
vocab_size = 5000
tokenizer = Tokenizer(oov_token='<OOV>', num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# Encode each split as a list of integer sequences (one sequence per tweet).
train_sequences, dev_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_dev, X_test)
)

In [7]:
# Number of tokens in every encoded tweet, per split.
length_train = list(map(len, train_sequences))
length_dev = list(map(len, dev_sequences))
length_test = list(map(len, test_sequences))

# Longest sequence over all three splits; displayed as the cell output.
max_length = max([*length_train, *length_dev, *length_test])
max_length

80

In [8]:
# Padding: append zeros to the end of each sequence so that all sequences share
# the same length (`max_length`). Anything longer is truncated at the end too.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')

train_padded = pad_sequences(train_sequences, **pad_options)
dev_padded = pad_sequences(dev_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [9]:
# LSTM classifier: token ids -> embeddings -> single LSTM layer -> softmax over 10 outputs.
embedding_dim = 100
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim),   # Embedding layer
    tf.keras.layers.LSTM(embedding_dim),   # LSTM layer (number of units reuses embedding_dim)
    tf.keras.layers.Dense(10,activation = 'softmax')   # Final fully-connected layer, outputs probabilities
])
# The last layer already applies softmax, so the loss receives probabilities and
# must be configured with from_logits=False. With from_logits=True the loss
# would apply softmax a second time to those probabilities.
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), optimizer='adam', metrics=['accuracy'])

In [10]:
# Baseline before any training: evaluate the freshly initialised model on the dev set.
model.evaluate(x=dev_padded, y=y_dev)



[2.3101727962493896, 0.071390800178051]

In [11]:
# Train the LSTM for two epochs, then persist the weights and the training
# curves to the shared drive.
#
# To reuse a previous run instead of retraining, comment out the training code
# below and uncomment these lines. The pickle holds the plain dict
# `history.history`, not a Keras `History` object.
# model.load_weights(root_dir+'LSTM-checkpoints/checkpoint.ckpt')
# with open(root_dir+'LSTM-checkpoints/history.pickle','rb') as handle:
#   history = pickle.load(handle)
#
# NOTE(review): the original comments pointed at 'BiLSTM-checkpoints/' while the
# code in this notebook writes to and reads from 'LSTM-checkpoints/'; the paths
# above were aligned with the code - confirm which folder holds the saved run.
# NOTE(review): per the original author, the history file saved on the drive was
# additionally given epoch numbers and the epoch-0 performance, so it is not
# exactly what the code below writes.

checkpoint_dir = root_dir + 'LSTM-checkpoints/'
# `open(..., 'wb')` does not create missing folders, so make sure the target
# exists before spending time on training.
os.makedirs(checkpoint_dir, exist_ok=True)

history = model.fit(train_padded, y_train, epochs=2,
                    validation_data=(dev_padded, y_dev), verbose=1)
model.save_weights(checkpoint_dir + 'checkpoint.ckpt')
with open(checkpoint_dir + 'history.pickle', 'wb') as handle:
    # Pickle the per-epoch metrics of the run that was just returned by `fit`.
    pickle.dump(history.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

Epoch 1/2
Epoch 2/2


In [12]:
# Just a simple test: will training one more epoch help improve the model? Answer is NO!
# A second model with the same architecture is built, restored from the saved
# two-epoch checkpoint and trained for one additional epoch.
model2 = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,embedding_dim),   # Embedding layer
    tf.keras.layers.LSTM(embedding_dim),   # LSTM layer
    tf.keras.layers.Dense(10,activation = 'softmax')   # Final fully-connected layer, outputs probabilities
])
# The last layer already applies softmax, so the loss receives probabilities and
# must be configured with from_logits=False. With from_logits=True the loss
# would apply softmax a second time to those probabilities.
model2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), optimizer='adam', metrics=['accuracy'])
model2.load_weights(root_dir+'LSTM-checkpoints/checkpoint.ckpt')
history2 = model2.fit(train_padded, y_train, epochs=1,validation_data=(dev_padded, y_dev), verbose=1)

