# Recurrent Neural Networks

This project classifies sentences as expressing either positive or negative sentiment.


In [None]:
# Base directory joined onto every dataset path and every artifact this notebook
# writes (word2vec model, checkpoint, prediction CSV).
path_prefix = './'

### Dataset
There are 3 datasets: training_label.txt, training_nolabel.txt, and testing_data.txt

- training_label.txt: labeled training data (each line starts with the label 0 or 1, and "+++$+++" separates the label from the sentence)
    - e.g., 1 +++$+++ are wtf ... awww thanks !

- training_nolabel.txt: training data without labels (for semi-supervised learning)
    - e.g., hates being this burnt !! ouch

- testing_data.txt: testing data whose sentiment is to be predicted

    >id,text

    >0,my dog ate our dinner . no , seriously ... he ate it .

    >1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry

    >2,stupid boys .. they ' re so .. stupid !

In [None]:
# Suppress all Python warnings: the 'ignore' action is installed with no category or
# message restriction, so it matches every warning raised after this cell runs.
# NOTE(review): this also hides deprecation warnings from third-party libraries.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os

import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

root_dir = './'

# Walk root_dir recursively, print the name of every file found, then print how many
# files were seen in total. Useful as a sanity check that the dataset files are present.
count = 0
for _dirpath, _subdirs, filenames in os.walk(root_dir):
    for filename in filenames:
        print(filename)
    count += len(filenames)
print(count)

data.zip
testing_data.txt
training_nolabel.txt
training_label.txt
active_config
config_sentinel
.last_survey_prompt.yaml
.last_opt_in_prompt.yaml
.last_update_check.json
gce
.metricsUUID
17.27.22.039013.log
17.27.27.315162.log
17.27.07.888058.log
17.27.43.241792.log
17.26.49.689206.log
17.27.42.676144.log
config_default
README.md
anscombe.json
mnist_test.csv
california_housing_train.csv
mnist_train_small.csv
california_housing_test.csv
24


In [None]:
path_train = 'training_label.txt'
# Tokenize every line on single spaces and show the first one to inspect the raw format.
with open(path_train, 'r') as f:
    lines = [raw_line.strip('\n').split(' ') for raw_line in f]
    print(lines[0])

['1', '+++$+++', 'are', 'wtf', '...', 'awww', 'thanks', '!']


In [None]:
path_test = 'testing_data.txt'
# Show only the first 100 characters: enough to see the header row and the first samples.
with open(path_test, 'r') as test_file:
    print(test_file.read(100))

id,text
0,my dog ate our dinner . no , seriously ... he ate it .
1,omg last day sooon n of primary n


### Utils: some functions that will be frequently used in later steps

In [None]:
# utils.py

import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

def load_training_data(path='training_label.txt'):
    """Read a training file and split every line into space-separated tokens.

    Args:
        path: file to read. When the path contains the substring 'training_label',
            the file is treated as labeled: the first token of each line is the
            label and the second token (the separator) is discarded.

    Returns:
        For a labeled file, a tuple ``(x, y)`` where ``x`` is a list of token lists
        and ``y`` is the list of label strings. For any other file, just the list
        of token lists.
    """
    with open(path, 'r') as f:
        rows = [raw_line.strip('\n').split(' ') for raw_line in f]

    if 'training_label' not in path:
        return rows

    # Layout of a labeled row: [label, separator, word, word, ...]
    x = [row[2:] for row in rows]
    y = [row[0] for row in rows]
    return x, y

def load_testing_data(path='testing_data'):
    """Read the testing file and return each sentence as a list of tokens.

    The first line of the file is a header and is skipped. Every following line
    has the form ``<id>,<text>``; only the FIRST comma separates the id from the
    text, so commas that belong to the sentence are kept as tokens (the training
    loader keeps them too, which keeps both tokenizations consistent).

    Args:
        path: file to read.
            NOTE(review): the default has no '.txt' extension, unlike the file
            name used elsewhere in this notebook; callers here always pass the
            path explicitly.

    Returns:
        list[list[str]]: one token list per data line. A line without any comma
        yields ``['']``.
    """
    with open(path, 'r') as f:
        rows = f.readlines()[1:]  # drop the header row

    X = []
    for row in rows:
        # partition() splits on the first comma only and never raises,
        # returning an empty text when the line has no comma at all.
        _, _, text = row.strip('\n').partition(',')
        X.append(text.strip().split(' '))
    return X

def evaluation(outputs, labels):
    """Count how many predictions agree with the labels after thresholding at 0.5.

    Args:
        outputs: torch tensor of predicted probabilities. Values >= 0.5 are
            treated as positive (1), values below 0.5 as negative (0).
        labels: torch tensor of ground-truth labels, compared element-wise with
            the thresholded predictions.

    Returns:
        int: the number of positions where prediction and label are equal.

    The thresholding is done on a new tensor, so the caller's ``outputs`` keeps
    its probabilities.
    """
    predictions = (outputs >= 0.5).to(outputs.dtype)
    return torch.sum(torch.eq(predictions, labels)).item()

### Train Word to Vector: word embedding
Word2Vec training runs on the CPU, so it takes a few minutes.

The block below trains the Word2Vec word embedding.

In [None]:
# w2v.py
# 
import os
import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec
from gensim.models import Word2Vec

def train_word2vec(x):
    """Train a Word2Vec word embedding on the tokenized sentences in ``x``.

    Args:
        x: iterable of sentences, each sentence being a list of string tokens.

    Returns:
        The trained gensim Word2Vec model.
    """
    # 250-dimensional vectors, context window of 5, words seen fewer than 10 times
    # are dropped, 12 worker threads, 10 passes over the corpus, sg=1 selects skip-gram.
    # (A min_count of 5 was tried previously.)
    hyperparams = dict(size=250, window=5, min_count=10, workers=12, iter=10, sg=1)
    return word2vec.Word2Vec(x, **hyperparams)

if __name__ == "__main__":
    print("loading training data ...")
    train_x, y = load_training_data('training_label.txt')
    # NOTE(review): the unlabeled sentences are loaded here, but they are not part of the
    # corpus passed to train_word2vec below, so they do not add words to this embedding.
    train_x_no_label = load_training_data('training_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('testing_data.txt')

    # The embedding is trained on the labeled training sentences plus the testing sentences.
    # Disabled alternative (skipped to save time): train_x + train_x_no_label + test_x
    model = train_word2vec(train_x + test_x)
    
    print("saving model ...")
    # Disabled alternative target: os.path.join(path_prefix, 'model/w2v_all.model')
    model.save(os.path.join(path_prefix, 'w2v_all.model'))

loading training data ...
loading testing data ...
saving model ...


In [None]:
# the words in the word embedding
for i, val in enumerate(model.wv.vocab):
  print(i, val)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
10650 skunk
10651 gee
10652 techniques
10653 producer
10654 connecticut
10655 tr
10656 1000th
10657 makeover
10658 waxed
10659 headline
10660 oth
10661 significant
10662 screening
10663 polaroid
10664 ribbon
10665 blister
10666 1hr
10667 waffle
10668 spaced
10669 beaten
10670 wentz
10671 locally
10672 rewind
10673 gallery
10674 ghetto
10675 wand
10676 tor
10677 keeper
10678 eagle
10679 plates
10680 bowls
10681 reggie
10682 expire
10683 souls
10684 lick
10685 deaths
10686 federer
10687 ci
10688 ned
10689 stabbed
10690 clicked
10691 colleagues
10692 chrysler
10693 sou
10694 stoned
10695 closely
10696 und
10697 shelby
10698 hhaha
10699 soldiers
10700 macy
10701 classics
10702 political
10703 eatin
10704 les
10705 hysterical
10706 sr
10707 71
10708 }
10709 motivation
10710 dwarf
10711 tã
10712 lincoln
10713 homecoming
10714 repaired
10715 blowing
10716 cola
10717 diner
10718 sbs
10719 ambitious
10720 suncream
10721 vacations


### Data Preprocess: contains a class for data-preprocessing


In [None]:
# preprocess.py

from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    """Convert tokenized sentences into fixed-length index tensors.

    Typical use: construct the object, call ``make_embedding()`` to build the
    vocabulary (``word2idx`` / ``idx2word``) and the embedding matrix from a saved
    Word2Vec model, then call ``sentence_word2idx()`` to encode ``sentences``.
    """

    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        """Store the inputs; nothing is loaded until ``make_embedding`` is called.

        Args:
            sentences: list of sentences, each a list of string tokens.
            sen_len: length every encoded sentence is truncated or padded to.
            w2v_path: path of the saved Word2Vec model.
        """
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []          # index -> word
        self.word2idx = {}          # word -> index
        self.embedding_matrix = []  # becomes a 2-D tensor in make_embedding

    def get_w2v_model(self):
        """Load the saved Word2Vec model and remember its vector size."""
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        """Append a special token (e.g. "<PAD>" or "<UNK>") with a random vector.

        The token receives the next free row of the embedding matrix. The index is
        taken from ``idx2word`` (which grows on every call) rather than from
        ``word2idx`` (which does not grow when ``word`` is already a key), so two
        consecutive special tokens can never end up sharing the same index.
        """
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)  # fill in place with uniform random values

        self.word2idx[word] = len(self.idx2word)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], dim=0)

    def make_embedding(self, load=True):
        """Build the vocabulary and the embedding matrix from the Word2Vec model.

        Args:
            load: must be True; building an embedding without a saved model is
                not implemented.

        Returns:
            torch.Tensor: one row per vocabulary word, followed by one row for
            "<PAD>" and one for "<UNK>".

        Raises:
            NotImplementedError: if ``load`` is False.
        """
        print("Get embedding ...")

        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError

        # Start from a clean vocabulary so the method can safely be called again.
        self.word2idx = {}
        self.idx2word = []
        vectors = []
        for i, word in enumerate(self.embedding.wv.vocab):
            print('get words #{}'.format(i+1), end='\r')
            # e.g. word2idx['he'] = 1, idx2word[1] = 'he', vectors[1] = vector of 'he'
            self.word2idx[word] = len(self.idx2word)
            self.idx2word.append(word)
            # Look the vector up through .wv; indexing the model object itself is deprecated.
            vectors.append(self.embedding.wv[word])
        print('')
        # Stack into a single ndarray first: building a tensor from a Python list
        # of ndarrays is very slow.
        self.embedding_matrix = torch.tensor(np.array(vectors))

        self.add_embedding("<PAD>")  # filler for sentences shorter than sen_len
        self.add_embedding("<UNK>")  # stand-in for words missing from the vocabulary
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix

    def pad_sequence(self, sentence):
        """Return a copy of ``sentence`` that is exactly ``sen_len`` items long.

        Longer inputs are truncated; shorter ones are right-padded with the index
        of "<PAD>". The input list itself is not modified.
        """
        if len(sentence) >= self.sen_len:
            sentence = list(sentence[:self.sen_len])
        else:
            pad_len = self.sen_len - len(sentence)
            sentence = list(sentence) + [self.word2idx["<PAD>"]] * pad_len
        assert len(sentence) == self.sen_len
        return sentence

    def sentence_word2idx(self):
        """Encode ``self.sentences`` as a LongTensor of word indices.

        Words missing from the vocabulary map to the index of "<UNK>"; every
        sentence is truncated or padded to ``sen_len``.
        """
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = [
                self.word2idx[word] if word in self.word2idx else self.word2idx["<UNK>"]
                for word in sen
            ]
            sentence_list.append(self.pad_sequence(sentence_idx))
        return torch.LongTensor(sentence_list)

    def labels_to_tensor(self, y):
        """Convert an iterable of labels (e.g. the strings '0' / '1') to a LongTensor."""
        y = [int(label) for label in y]
        return torch.LongTensor(y)



### Dataset: override the `data.Dataset` methods used in model training

In [None]:
# data.py
# override the data.Dataset methods '__init__', '__getitem__', '__len__'
# once they are overridden, a DataLoader can wrap the dataset
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """
    Map-style dataset that pairs samples with optional labels.

    X: indexable collection of samples (anything supporting len() and [idx])
    y: indexable collection of labels aligned with X, or None for unlabeled data

    Indexing yields (sample, label) when labels were given, otherwise the sample alone.
    __len__ returns the number of samples.
    """
    def __init__(self, X, y):
        self.data, self.label = X, y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.label is not None:
            return sample, self.label[idx]
        return sample

### Model: define the LSTM model

In [None]:
# model.py

import torch
from torch import nn
class LSTM_Net(nn.Module):
    """Sentence classifier: embedding lookup -> LSTM -> dropout/linear/sigmoid head.

    Args:
        embedding: 2-D tensor of pretrained word vectors, one row per vocabulary entry.
        embedding_dim: input feature size handed to the LSTM.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied in the classifier head.
        fix_embedding: when True the embedding weights are frozen; when False they
            are trained together with the rest of the network.
    """
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super().__init__()
        vocab_size, vector_dim = embedding.size(0), embedding.size(1)

        # Embedding layer initialised with the pretrained vectors.
        self.embedding = torch.nn.Embedding(vocab_size, vector_dim)
        self.embedding.weight = torch.nn.Parameter(embedding)
        self.embedding.weight.requires_grad = not fix_embedding

        self.embedding_dim = vector_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout

        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid(),
        )

    def forward(self, inputs):
        """Map a (batch, seq_len) tensor of word indices to (batch, 1) probabilities."""
        embedded = self.embedding(inputs)
        # lstm_out: (batch, seq_len, hidden_dim); passing None uses a zero initial state.
        lstm_out, _ = self.lstm(embedded, None)
        # Keep only the output at the last time step of each sequence.
        last_step = lstm_out[:, -1, :]
        return self.classifier(last_step)

### Train: the block trains the model

In [None]:
# train.py

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    """Train ``model`` with Adam and binary cross-entropy, validating after every epoch.

    After each epoch the model is evaluated on ``valid``; whenever the validation
    accuracy beats the best one seen so far, the whole model object is saved to
    ``<model_dir>/ckpt.model``.

    Accuracy is computed as correct predictions divided by the number of samples
    actually seen, so a final batch smaller than ``batch_size`` does not drag the
    reported accuracy down.

    Args:
        batch_size: nominal loader batch size. Kept for interface compatibility;
            the real size of each batch is read from the batch itself.
        n_epoch: number of passes over ``train``.
        lr: learning rate for Adam.
        model_dir: directory the best checkpoint is written to.
        train: iterable of ``(inputs, labels)`` training batches supporting len().
        valid: iterable of ``(inputs, labels)`` validation batches supporting len().
        model: network returning one probability per sample.
        device: torch device the batches are moved to.
    """
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train()  # training mode: dropout is active
    criterion = nn.BCELoss()  # binary cross-entropy on probabilities
    t_batch = len(train)
    v_batch = len(valid)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    best_acc = 0
    for epoch in range(n_epoch):
        print('# epoch = ', epoch)
        total_loss, total_correct, total_seen = 0, 0, 0
        # training
        for i, (inputs, labels) in enumerate(train):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            n_samples = labels.numel()

            optimizer.zero_grad()
            # Flatten (batch, 1) -> (batch,). Unlike squeeze(), this keeps a 1-D
            # tensor even when the batch holds a single sample.
            outputs = model(inputs).reshape(-1)

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Metrics are computed on a detached view so they stay off the autograd graph.
            correct = evaluation(outputs.detach(), labels)
            total_correct += correct
            total_seen += n_samples
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                epoch+1, i+1, t_batch, loss.item(), correct*100/max(n_samples, 1)), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(
            total_loss/max(t_batch, 1), total_correct/max(total_seen, 1)*100))

        # validation
        model.eval()  # evaluation mode: dropout is disabled
        with torch.no_grad():  # no gradients are tracked, so no parameter is updated
            total_loss, total_correct, total_seen = 0, 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                outputs = model(inputs).reshape(-1)
                loss = criterion(outputs, labels)
                total_correct += evaluation(outputs, labels)
                total_seen += labels.numel()
                total_loss += loss.item()

            val_acc = total_correct/max(total_seen, 1)
            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/max(v_batch, 1), val_acc*100))
            if val_acc > best_acc:
                # Best validation accuracy so far: overwrite the checkpoint.
                best_acc = val_acc
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(val_acc*100))
        print('-----------------------------------------------')
        model.train()  # back to training mode for the next epoch

### Test: make predictions using the model with trained parameters

In [None]:
# test.py

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1 # positive
            outputs[outputs<0.5] = 0 # negative
            ret_output += outputs.int().tolist()
    
    return ret_output

### Main

#### Part 1

In [None]:
# main.py
import os
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec
from sklearn.model_selection import train_test_split

print( 'torch is currently using GPU:', torch.cuda.is_available() )
# use the GPU for training when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# paths of the datasets
train_with_label = os.path.join(path_prefix, 'training_label.txt')
train_no_label = os.path.join(path_prefix, 'training_nolabel.txt')
testing_data = os.path.join(path_prefix, 'testing_data.txt')

# path of the word embedding model saved by the word2vec cell
w2v_path = os.path.join(path_prefix, 'w2v_all.model') 

# directory the training checkpoint is written to
# (disabled alternative: os.path.join(path_prefix, 'model/'))
model_dir = path_prefix

print("loading data ...") # reads 'training_label.txt' and 'training_nolabel.txt'
train_x, y = load_training_data(train_with_label)
# NOTE(review): train_x_no_label is not used by any later cell visible in this notebook.
train_x_no_label = load_training_data(train_no_label)


# preprocess inputs and labels
# From here on train_x and y are rebound: token lists -> LongTensor of word indices,
# label strings -> LongTensor of ints.
sen_len = 20 # every sentence is truncated or padded to this many tokens
preprocess = Preprocess(train_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
train_x = preprocess.sentence_word2idx()
y = preprocess.labels_to_tensor(y)


torch is currently using GPU: True
loading data ...
Get embedding ...
loading word to vec model ...
get words #15650
total words: 15652


#### Part 2

In [None]:
# Hyperparameters: whether to freeze the embedding, the batch size, the number of
# epochs and the learning rate (the sentence length, sen_len, is set in part 1).
# The string literal below is an inert record of an earlier configuration.
'''
sen_len = 20
fix_embedding = True # fix embedding during training
batch_size = 128
epoch = 6
lr = 0.001
'''

fix_embedding = True  # keep the pretrained embedding frozen during training
batch_size = 128
epoch = 8
lr = 0.001

# Build the LSTM classifier and move it to the selected device (GPU when available).
model = LSTM_Net(
    embedding,
    embedding_dim=250,
    hidden_dim=150,
    num_layers=1,
    dropout=0.5,
    fix_embedding=fix_embedding,
).to(device)

# The first 180000 samples are used for training, the remainder for validation.
X_train, y_train = train_x[:180000], y[:180000]
X_val, y_val = train_x[180000:], y[180000:]

# Wrap both splits as datasets ...
train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# ... and batch them; only the training batches are shuffled.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
val_loader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, num_workers=8)

# training
training(
    batch_size=batch_size,
    n_epoch=epoch,
    lr=lr,
    model_dir=model_dir,
    train=train_loader,
    valid=val_loader,
    model=model,
    device=device,
)


start training, parameter total:4154351, trainable:241351

# epoch =  0

Train | Loss:0.50175 Acc: 74.661
Valid | Loss:0.45656 Acc: 78.110 
saving model with acc 78.110
-----------------------------------------------
# epoch =  1

Train | Loss:0.44417 Acc: 79.051
Valid | Loss:0.44013 Acc: 79.180 
saving model with acc 79.180
-----------------------------------------------
# epoch =  2

Train | Loss:0.42899 Acc: 79.924
Valid | Loss:0.43051 Acc: 79.633 
saving model with acc 79.633
-----------------------------------------------
# epoch =  3

Train | Loss:0.41604 Acc: 80.745
Valid | Loss:0.42735 Acc: 80.016 
saving model with acc 80.016
-----------------------------------------------
# epoch =  4

Train | Loss:0.40466 Acc: 81.352
Valid | Loss:0.42366 Acc: 80.135 
saving model with acc 80.135
-----------------------------------------------
# epoch =  5

Train | Loss:0.39365 Acc: 81.960
Valid | Loss:0.42745 Acc: 79.971 
-----------------------------------------------
# epoch =  6

Train |

### Predict and Write to csv file

In [None]:
# Encode the testing sentences with the same vocabulary that was used for training.
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 8)
print('\nload model ...')
# map_location moves the stored tensors onto the current device, so a checkpoint
# saved on a GPU can still be loaded on a CPU-only runtime.
# NOTE: torch.load unpickles the file; only load checkpoints produced by this notebook.
model = torch.load(os.path.join(model_dir, 'ckpt.model'), map_location=device)
outputs = testing(batch_size, test_loader, model, device)

# Write one row per testing sentence: its position as id and the predicted label.
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")

loading testing data ...
Get embedding ...
loading word to vec model ...
get words #15650
total words: 15652
sentence count #200000
load model ...
save csv ...
Finish Predicting
