# Text Sentiment Classification By LSTM

### Task Allocation
#### WU XIAOQING: Att_BiLSTM, Semi-supervised learning, parameter adjustment
#### LI YISHAN: schedular,parameter adjustment
#### ZHENG DAYI:   free rider

In [1]:
import warnings
warnings.filterwarnings('ignore')
path_prefix = './'

## Data Loading

In [2]:
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

def load_training_data(path='Train_label.txt'):
    # Read training data
    if 'Train_label' in path:
        with open(path, 'r',encoding='UTF-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split(' ') for line in lines]
        x = [line[2:] for line in lines]
        y = [line[0] for line in lines]
        return x, y
    else:
        with open(path, 'r',encoding='UTF-8') as f:
            lines = f.readlines()
            x = [line.strip('\n').split(' ') for line in lines]
        return x

def load_testing_data(path='Test.txt'):
    # Read testing data
    with open(path, 'r',encoding='UTF-8') as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(",")[1:]).strip() for line in lines[1:]]
        X = [sen.split(' ') for sen in X]
    return X

def evaluation(outputs, labels):
    #outputs => probability (float)
    #labels => labels
    outputs[outputs>=0.5] = 1 # Negtive Sentiment
    outputs[outputs<0.5] = 0 # Positive Sentiment
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

## Train Word to Vector

In [3]:
import os
import numpy as np
import pandas as pd
import argparse
from gensim.models import word2vec

def train_word2vec(x):
    model = word2vec.Word2Vec(x, vector_size=250, window=5, min_count=5, workers=12)
    return model

if __name__ == "__main__":
    print("loading training data ...")
    train_x, y = load_training_data('Train_label.txt')
    train_x_no_label = load_training_data('Train_nolabel.txt')

    print("loading testing data ...")
    test_x = load_testing_data('Test.txt')

    model = train_word2vec(train_x + train_x_no_label + test_x)
    
    print("saving model ...")
    model.save(os.path.join(path_prefix, 'w2v_all.model'))

loading training data ...
loading testing data ...
saving model ...


## Data Preprocess


In [4]:
from torch import nn
from gensim.models import Word2Vec

class Preprocess():
    def __init__(self, sentences, sen_len, w2v_path="./w2v.model"):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
    def get_w2v_model(self):
        # load word to vector model
        self.embedding = Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size
    def add_embedding(self, word):
        # add word into embedding
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)
    def make_embedding(self, load=True):
        print("Get embedding ...")
        if load:
            print("loading word to vec model ...")
            self.get_w2v_model()
        else:
            raise NotImplementedError

        for i, word in enumerate(self.embedding.wv.key_to_index):
            print('get words #{}'.format(i+1), end='\r')
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            self.embedding_matrix.append(self.embedding.wv[word])
        print('')
        self.embedding_matrix = torch.tensor(self.embedding_matrix)
        self.add_embedding("<PAD>")
        self.add_embedding("<UNK>")
        print("total words: {}".format(len(self.embedding_matrix)))
        return self.embedding_matrix
    def pad_sequence(self, sentence):
        if len(sentence) > self.sen_len:
            sentence = sentence[:self.sen_len]
        else:
            pad_len = self.sen_len - len(sentence)
            for _ in range(pad_len):
                sentence.append(self.word2idx["<PAD>"])
        assert len(sentence) == self.sen_len
        return sentence
    def sentence_word2idx(self):
        sentence_list = []
        for i, sen in enumerate(self.sentences):
            print('sentence count #{}'.format(i+1), end='\r')
            sentence_idx = []
            for word in sen:
                if (word in self.word2idx.keys()):
                    sentence_idx.append(self.word2idx[word])
                else:
                    sentence_idx.append(self.word2idx["<UNK>"])
            sentence_idx = self.pad_sequence(sentence_idx)
            sentence_list.append(sentence_idx)
        return torch.LongTensor(sentence_list)
    def labels_to_tensor(self, y):
        # turn labels into tensors
        y = [int(label) for label in y]
        return torch.LongTensor(y)

## Dataset

In [5]:
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """
    Expected data shape like:(data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)
    
    __len__ will return the number of data
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

## LSTM Model

In [6]:
import torch
from torch import nn
class LSTM_Net(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0),embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # Whether fix embedding
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential( nn.Dropout(dropout),
                          nn.Linear(hidden_dim, 64),
                          nn.Dropout(dropout),
                          nn.Linear(64, 1),
                          nn.Sigmoid() )
    def forward(self, inputs):
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # dimension of x (batch, seq_len, hidden_size)
        x = x[:, -1, :] 
        x = self.classifier(x)
        return x

In [7]:
class Atten_BiLSTM(nn.Module):
    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(Atten_BiLSTM, self).__init__()
        # embedding layer
        self.embedding = torch.nn.Embedding(embedding.size(0), embedding.size(1))
        self.embedding.weight = torch.nn.Parameter(embedding)
        # Whether the embedding is fixed. If the fix_embedding is False, the embedding will be trained
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                        nn.Linear(hidden_dim, 128),
                                        nn.Dropout(dropout),
                                        nn.Linear(128, 64),
                                        nn.Dropout(dropout),
                                        nn.Linear(64, 16),
                                        nn.Dropout(dropout),
                                        nn.Linear(16, 1),
                                        nn.Sigmoid())
        self.attention_layer = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.Softmax()
        )

    def attention(self, output, hidden):
        # output  (batch_size, seq_len, hidden_size * num_direction)
        # hidden (batch_size, num_layers * num_direction, hidden_size)

        output = output[:,:,:self.hidden_dim] + output[:,:,self.hidden_dim:] # (batch_size, seq_len, hidden_size)

        hidden = torch.sum(hidden, dim=1)
        hidden = hidden.unsqueeze(1) # (batch_size, 1, hidden_size)

        atten_w = self.attention_layer(hidden) # (batch_size, 1, hidden_size)
        m = nn.Tanh()(output) # (batch_size, seq_len, hidden_size)

        atten_context = torch.bmm(atten_w, m.transpose(1, 2))

        softmax_w = F.softmax(atten_context, dim=-1)

        context = torch.bmm(softmax_w, output)

        return context.squeeze(1)

    def forward(self, inputs):
        inputs = self.embedding(inputs)

        # x (batch, seq_len, hidden_size)
        # hidden (num_layers *num_direction, batch_size, hidden_size)
        x, (hidden, _) = self.lstm(inputs, None)
        hidden = hidden.permute(1, 0, 2) # (batch_size, num_layers *num_direction, hidden_size)

        # atten_out [batch_size, 1, hidden_dim]
        atten_out = self.attention(x, hidden)
        return self.classifier(atten_out)

## Define Training

In [8]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import ConcatDataset,Subset,DataLoader

def training(batch_size, n_epoch, lr, model_dir, train, valid, model, device):
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train() # set training mode
    criterion = nn.BCELoss() # Define loss function
    t_batch = len(train) 
    v_batch = len(valid) 
    training=train
    optimizer = optim.Adam(model.parameters(), lr=lr) # set optimizer as SGD (you can change it)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5, verbose=True)
    total_loss, total_acc, best_acc = 0, 0, 0
    for epoch in range(n_epoch):
        total_loss, total_acc = 0, 0
        model.train()
        # For training
        t_batch = len(training)
        for i, (inputs, labels) in enumerate(training): 
            inputs = inputs.to(device, dtype=torch.long) # set device "cuda"
            labels = labels.to(device, dtype=torch.float) # set device "cuda"
            optimizer.zero_grad()
            outputs = model(inputs)
            outputs = outputs.squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            correct = evaluation(outputs, labels) # calculate accuracy
            total_acc += (correct / len(training))
            total_loss += loss.item()
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
            	epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))

        # For validation
        model.eval() # set validation mode
        with torch.no_grad():
            total_loss, total_acc = 0, 0
            for i, (inputs, labels) in enumerate(valid):
                inputs = inputs.to(device, dtype=torch.long) # set device "cuda"
                labels = labels.to(device, dtype=torch.float) # set device "cuda"
                outputs = model(inputs)
                outputs = outputs.squeeze()
                loss = criterion(outputs, labels)
                correct = evaluation(outputs, labels)
                total_acc += (correct / batch_size)
                total_loss += loss.item()

            print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
            scheduler.step(total_acc)
            if total_acc > best_acc:
                # if the result of validation is better than previous model, save the new model
                best_acc = total_acc
                torch.save(model, "{}/ckpt.model".format(model_dir))
                print('saving model with acc {:.3f}'.format(total_acc/v_batch*100))
                #### add Semi-supervised learning
                if (total_acc/v_batch > 0.8):
                    dataset = get_pseudo_labels(semi_dataset, model)
                    training = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=False,drop_last=True)
                    print("update semi_dataset")
        print('-----------------------------------------------')
        
        model.train()

## Testing

In [9]:
import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F

def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs>=0.5] = 1
            outputs[outputs<0.5] = 0
            ret_output += outputs.int().tolist()
    
    return ret_output

In [10]:
def add_label(outputs, threshold=0.99):
    idx = (outputs>=threshold) | (outputs<1-threshold)
    outputs[outputs>=threshold] = 1 # 大于等于 threshold 为正面
    outputs[outputs<1-threshold] = 0 # 小于 threshold 为负面
    return outputs.long(), idx

def get_pseudo_labels(dataset, model):
    # This functions generates pseudo-labels of a dataset using given model.
    # It returns an instance of DatasetFolder containing images whose prediction confidences exceed a given threshold.
    # You are NOT allowed to use any models trained on external data for pseudo-labeling.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    data_loader = torch.utils.data.DataLoader(dataset = dataset, batch_size = batch_size, shuffle = True, num_workers = 0)    
    # Make sure the model is in eval mode.
    model.eval()
    # Define softmax function.
    softmax = nn.Softmax(dim=-1)
    
    idx = []
    labels = []

    # Iterate over the dataset by batches.
    for i, (inputs) in enumerate(data_loader):
        with torch.no_grad():
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            label, idd = add_label(outputs)
            for j,x in enumerate(outputs):
                if idd[j]==True:
                    idx.append(i * batch_size + j)
                    labels.append(int(label[j]))
    dataset = TwitterDataset(Subset(dataset, idx), torch.LongTensor(labels))
    dataset = ConcatDataset([train_dataset, dataset]) 
          ### merge new set with training set
    # # Turn off the eval mode.
    model.train()
    return dataset

## Parameter setting + Train

In [24]:
# main.py
import os
import torch
import argparse
import numpy as np
from torch import nn
from gensim.models import word2vec
from sklearn.model_selection import train_test_split


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# set data path
train_with_label = os.path.join(path_prefix, 'Train_label.txt')
train_no_label = os.path.join(path_prefix, 'Train_nolabel.txt')
testing_data = os.path.join(path_prefix, 'Test.txt')
w2v_path = os.path.join(path_prefix, 'w2v_all.model')


sen_len = 30
fix_embedding = True # fix embedding during training
batch_size = 256
epoch = 20
lr = 0.001
model_dir = path_prefix

print("loading data ...")
train_x, y = load_training_data(train_with_label)
train_x_no_label = load_training_data(train_no_label)

# Preprocessing
preprocess1 = Preprocess(train_x, sen_len, w2v_path=w2v_path)
preprocess2 = Preprocess(train_x_no_label, sen_len, w2v_path=w2v_path)
embedding = preprocess1.make_embedding(load=True)
embedding = preprocess2.make_embedding(load=True)
train_x = preprocess1.sentence_word2idx()
y = preprocess1.labels_to_tensor(y)
train_x_no_label = preprocess2.sentence_word2idx()
semi_dataset = TwitterDataset(X=train_x_no_label, y=None)

#model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=250, num_layers=1, dropout=0.5, fix_embedding=fix_embedding)
model=Atten_BiLSTM(embedding, embedding_dim=250, hidden_dim=300, num_layers=3)
model = model.to(device) 

X_train, X_val, y_train, y_val = train_x[:130000], train_x[130000:], y[:130000], y[130000:]

train_dataset = TwitterDataset(X=X_train, y=y_train)
val_dataset = TwitterDataset(X=X_val, y=y_val)

# transfor data into batch of tensors
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = batch_size,
                                            shuffle = True,
                                            num_workers = 0)

val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)

# Begin Training
training(batch_size, epoch, lr, model_dir, train_loader, val_loader, model, device)

loading data ...
Get embedding ...
loading word to vec model ...
get words #23000
total words: 23002
Get embedding ...
loading word to vec model ...
get words #23000
total words: 23002
sentence count #149882
start training, parameter total:11543041, trainable:5792541

[ Epoch1: 508/508 ] loss:0.441 acc:67.578 
Train | Loss:0.49796 Acc: 38.534
Valid | Loss:0.44233 Acc: 78.826 
saving model with acc 78.826
-----------------------------------------------
[ Epoch2: 508/508 ] loss:0.424 acc:66.406 
Train | Loss:0.43706 Acc: 40.493
Valid | Loss:0.42070 Acc: 80.544 
saving model with acc 80.544
-----------------------------------------------
[ Epoch3: 508/508 ] loss:0.406 acc:66.797 
Train | Loss:0.40573 Acc: 41.427
Valid | Loss:0.41683 Acc: 80.834 
saving model with acc 80.834
-----------------------------------------------
[ Epoch4: 508/508 ] loss:0.435 acc:64.453 
Train | Loss:0.37504 Acc: 42.221
Valid | Loss:0.41981 Acc: 80.869 
saving model with acc 80.869
-------------------------------

## Predict and save to csv file

In [22]:
print("loading testing data ...")
test_x = load_testing_data(testing_data)
preprocess = Preprocess(test_x, sen_len, w2v_path=w2v_path)
embedding = preprocess.make_embedding(load=True)
test_x = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x, y=None)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = batch_size,
                                            shuffle = False,
                                            num_workers = 0)
print('\nload model ...')#Predict-and-save-to-csv-file
model = torch.load(os.path.join(model_dir, 'ckpt.model'))
outputs = testing(batch_size, test_loader, model, device)

# save as csv
tmp = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"labels":outputs})
print("save csv ...")
tmp.to_csv(os.path.join(path_prefix, 'predict.csv'), index=False)
print("Finish Predicting")

loading testing data ...
Get embedding ...
loading word to vec model ...
get words #23000
total words: 23002
sentence count #49800
load model ...
save csv ...
Finish Predicting


# Hint
* Optimizer
* learning rate
* epoch
* batch size
* Activation function
* Self-Training for unlabel training data