# Recurrent Neural Networks for text sentiment classification

Given a sentence, we classify whether it has a negative meaning. A sentence with a negative meaning has label == 1; a sentence with a positive meaning has label == 0.

In [None]:
import os
from pathlib import Path
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split

import w2v
from utils import *
from preprocess import Preprocess
from model import buildModel, testing, BiLstmTuner

# Sentinel passed to `prefetch()` below so tf.data picks the buffer size itself.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Re-import modules automatically before each cell runs, so edits to the local
# helper modules imported above take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Use the current working directory as the project root; the data, model,
# checkpoint and output paths in the rest of the notebook are built from it.
path_prefix = Path.cwd()
print(path_prefix)

In [None]:
# Folders for the dataset and the saved models; created only if missing.
data_path = path_prefix / 'data/'
model_path = path_prefix / 'model/'
for folder in (data_path, model_path):
    folder.mkdir(exist_ok=True)

## Download Dataset
[Dataset](https://www.kaggle.com/c/ml2020spring-hw4)

There are three .txt files: training_label.txt, training_nolabel.txt, and testing_data.txt

- training_label.txt: training data with labels (0 or 1)
    - `+++$+++` is the separator between the label and the text
    - e.g., 1 +++$+++ are wtf ... awww thanks !

- training_nolabel.txt: training data without labels
    - We will use this training data for semi-supervised learning
    - e.g., hates being this burnt !! ouch

- testing_data.txt: the test data to run predictions on, one `id,text` pair per line

    >id,text

    >0,my dog ate our dinner . no , seriously ... he ate it .

    >1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry

    >2,stupid boys .. they ' re so .. stupid !

### Download the dataset if it is missing

In [None]:
%cd $data_path

if not os.path.exists('training_label.txt') or\
    not os.path.exists('training_nolabel.txt') or\
    not os.path.exists('testing_data.txt'):
    print("Dataset is incompleted . Downloading")
    # Method1
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1dPHIl8ZnfDz_fxNd2ZeBYedTat2lfxcO' -O 'training_label.txt'
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1x1rJOX_ETqnOZjdMAbEE2pqIjRNa8xcc' -O 'training_nolabel.txt'
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=16CtnQwSDCob9xmm6EdHHR7PNFNiOrQ30' -O 'testing_data.txt'

    # Method2
    # !gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
    # !unzip data.zip
    # !ls
else:
    print("data is all set")
   
%cd $path_prefix

In [None]:
# Silence every Python warning from here on to keep the cell output readable.
# NOTE(review): this also hides deprecation warnings from the libraries used.
import warnings
warnings.filterwarnings('ignore')

## Train Word2Vec Model

In [None]:
# Location of the Word2Vec model; also handed to the preprocessor later on.
w2v_path = path_prefix.joinpath('model/w2v_all.model')

if w2v_path.exists():
    # A model file is already on disk, so training is skipped.
    print("Pretrained Word2Vec model exists")
else:
    print("Train Word2Vec model via gensim")
    # Folders are given relative to the working directory (the project root).
    data_folder_path, model_folder_path = './data', './model'
    w2v.main(data_folder_path, model_folder_path)

## Preprocess dataset

### Setup paths and configurations

In [None]:
# Full paths of the three dataset files under the project root.
train_with_label, train_no_label, testing_data = (
    os.path.join(path_prefix, relative_path)
    for relative_path in (
        'data/training_label.txt',
        'data/training_nolabel.txt',
        'data/testing_data.txt',
    )
)

# Configuration
sen_len = 20      # sentence length given to the preprocessor and the model builders
batch_size = 128  # number of examples per batch in the tf.data pipelines

### Read dataset from folder

In [None]:
# Read 'training_label.txt' and 'training_nolabel.txt'
print("loading training data ...")
X_train_lable, y_train_lable = load_training_data(train_with_label)

# Hold out 10% of the labelled sentences for validation. The seed is fixed so
# that the split is identical on every run; with a random split, a checkpoint
# reloaded in a later session could be validated on sentences it was trained on.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_lable,
    y_train_lable,
    test_size=0.1,
    random_state=42,
)

# Unlabelled sentences, loaded with the same helper as the labelled ones.
train_x_no_label = load_training_data(train_no_label)

In [None]:
# Share of label-1 examples in each split, to check that the splits are balanced alike.
for split_name, labels in (("training", y_train), ("validation", y_val)):
    positive_rate = np.sum(labels) / len(labels)
    print(f"Positive rate in {split_name} dataset: {positive_rate}")

### Build the preprocessor

In [None]:
# Build the preprocessor from the Word2Vec model, obtain the embedding matrix,
# then convert the training and validation sentences (in that order).
preprocessor = Preprocess(sen_len, w2v_path=str(w2v_path))
embedding = preprocessor.make_embedding(load=True)
X_train_idx, X_val_idx = (
    preprocessor.sentences_word2idx(sentences)
    for sentences in (X_train, X_val)
)

In [None]:
# Sanity check: show the shape of the embedding matrix returned by the preprocessor.
print(f"Pretrained embedding matrix shape: {embedding.shape}")

### Preprocess training and validation datasets

In [None]:
# Pair every index sequence with its label directly; slicing a tuple yields the
# same (x, y) elements as zipping two separately built datasets.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_idx, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val_idx, y_val))

# Training pipeline: cache the raw examples once, then reshuffle them on every
# epoch *before* batching so that batch composition and order change between
# epochs. A buffer as large as the dataset gives a full uniform shuffle.
train_dataset = (
    train_dataset
    .cache()
    .shuffle(len(y_train))
    .batch(batch_size)
    .prefetch(AUTOTUNE)
)

# Validation data needs no shuffling, so the finished batches are cached as-is.
val_dataset = val_dataset.batch(batch_size).cache().prefetch(AUTOTUNE)

In [None]:
# Peek at a single batch to confirm the shapes that will be fed to the model.
for x_batch, y_batch in train_dataset.take(1):
    print(
        f"x_batch shape: {x_batch.shape}",
        f"y_batch shape: {y_batch.shape}",
        sep="\n",
    )

## Train a bidirectional LSTM model

### Method1

#### Build the model

In [None]:
# Hyperparameters passed to buildModel for the hand-configured model.
train_embedding = False  # fix embedding during training
hidden_dim1 = hidden_dim2 = 64  # sizes of the two hidden layers
dp_rate, lr = 0.5, 0.001  # presumably dropout rate and learning rate -- confirm in model.py
epochs = 5  # number of passes over the training set

In [None]:
# Build the model from the pretrained embedding and the settings above,
# then print its layer summary.
model = buildModel(
    embedding,
    train_embedding,
    sen_len,
    hidden_dim1,
    hidden_dim2,
    dp_rate,
    lr,
)
model.summary()

#### Train the model

In [None]:
# Checkpoint location under the project root; the same path is used later to
# reload the model for testing.
checkpoint_filepath = os.path.join(path_prefix, 'ckpt/')

# With save_best_only, a checkpoint is written only when the callback's
# monitored metric improves, so the path always holds the best model so far.
checkpoint_options = dict(filepath=checkpoint_filepath, save_best_only=True)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [None]:
# Train for `epochs` passes, evaluating on the validation set after each one
# and letting the checkpoint callback save the best model.
history = model.fit(
    train_dataset,
    epochs=epochs,
    validation_data=val_dataset,
    callbacks=[model_checkpoint_callback],
)

### Method2 - with Kerastuner

In [None]:
import IPython
from kerastuner.tuners import RandomSearch

class ClearTrainingOutput(tf.keras.callbacks.Callback):
    """Keras callback that clears the notebook cell output when training ends.

    Passed to the tuner search so the output of a finished training run is
    wiped instead of accumulating in the cell.
    """

    def on_train_end(self, logs=None):
        """Clear the current cell output once a training run has finished.

        Args:
            logs: Metrics dictionary supplied by Keras; unused here.
        """
        # wait=True defers the clearing until new output is available,
        # which avoids flicker between consecutive runs.
        IPython.display.clear_output(wait=True)
    
# Random search over the BiLstmTuner hypermodel: 10 trials, each executed
# 3 times, ranked by validation accuracy. Results go under <root>/tuner_dir.
hypermodel = BiLstmTuner(embedding, train_embedding, sen_len)
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=3,
    directory=os.path.join(path_prefix, 'tuner_dir'),
    project_name='tsc',
)

# Run the search silently; the callback clears the cell output after each run.
tuner.search(
    train_dataset,
    epochs=5,
    validation_data=val_dataset,
    verbose=0,
    callbacks=[ClearTrainingOutput()],
)

## Testing

### Preprocess test dataset

In [None]:
# Load the test sentences and convert them with the same preprocessor that was
# used for the training and validation sentences.
print("loading testing data ...")
X_test = load_testing_data(testing_data)
X_test_idx = preprocessor.sentences_word2idx(X_test)

In [None]:
# Inputs-only pipeline for inference: batch, cache, prefetch. There is no
# shuffling, so predictions stay in the same order as X_test.
test_dataset = (
    tf.data.Dataset.from_tensor_slices(X_test_idx)
    .batch(batch_size)
    .cache()
    .prefetch(AUTOTUNE)
)

### Load the best model

#### Method1

In [None]:
# Restore the model from the path the ModelCheckpoint callback wrote to
# during Method1 training.
print('\nload model ...')
best_model = tf.keras.models.load_model(checkpoint_filepath)

#### Method2

In [None]:
# Take the top-ranked model found by the tuner search and show its layers.
# Note: this rebinds `best_model`, replacing the Method1 model if that cell
# was run as well.
top_models = tuner.get_best_models()
best_model = top_models[0]
best_model.summary()

### Make prediction

In [None]:
# Run the selected model over the test batches; the result becomes the "label"
# column of the CSV written in the next cell.
# NOTE(review): presumably one predicted label per test sentence -- confirm in model.testing.
outputs = testing(best_model, test_dataset)

In [None]:
# Write the result to a CSV file
# pandas is imported here because the notebook never imports it explicitly;
# without this, `pd` only resolves if `from utils import *` happens to expose it.
import pandas as pd

# One row per test sentence: its position in X_test as a string id, plus the
# corresponding entry of `outputs`.
tmp = pd.DataFrame({
    "id": [str(i) for i in range(len(X_test))],
    "label": outputs,
})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")