In [1]:
!gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
!unzip data.zip

Downloading...
From: https://drive.google.com/uc?id=1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8
To: /content/data.zip
45.1MB [00:00, 67.8MB/s]
Archive:  data.zip
  inflating: training_label.txt      
  inflating: testing_data.txt        
  inflating: training_nolabel.txt    


In [2]:
import warnings
# Suppress every warning category for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.nn.functional as F

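The archive contains three datasets (all tweets are already tokenised, with tokens separated by single spaces):
- training_label.txt: labeled training tweets, one per line, in the form `label +++$+++ text`
    - e.g., 1 +++$+++ are wtf ... awww thanks !
- training_nolabel.txt: unlabeled tweets, one per line
    - e.g., hates being this burnt !! ouch
- testing_data.txt: a header line followed by one `id,text` row per tweet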
    ```
    id,text
    0,my dog ate our dinner . no , seriously ... he ate it .
    1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry
    ```


## Load the data

In [4]:
def load_training_data(path, labeled):
    """Read a training file of space-tokenised tweets, one tweet per line.

    Args:
        path: Path of the text file to read.
        labeled: When true, every line is expected to look like
            ``<label> +++$+++ <token> <token> ...``. The first space-separated
            field is parsed as the integer label, the second field (the
            separator) is dropped and the remaining fields are the tokens.

    Returns:
        ``(x, y)`` when ``labeled`` is true, where ``x`` is a list of token
        lists and ``y`` the matching list of integer labels; otherwise only
        the list of token lists.
    """
    # Explicit encoding: the default is platform dependent.
    with open(path, 'r', encoding='utf-8') as f:
        lines = [line.strip('\n') for line in f]

    if labeled:
        # Blank lines (e.g. a trailing empty line) carry no label and would
        # otherwise crash in int('').
        rows = [line.split(' ') for line in lines if line.strip()]
        x = [row[2:] for row in rows]
        y = [int(row[0]) for row in rows]
        return x, y

    return [line.split(' ') for line in lines]
def load_testing_data(path):
    """Read the test file and return each tweet as a list of tokens.

    The first line of the file is a header and is skipped. Every following
    line has the form ``<id>,<text>``; only the first comma separates the id
    from the text, so commas that are part of the tweet are kept as tokens.

    Args:
        path: Path of the test file.

    Returns:
        A list with one token list per data line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    sentences = []
    for line in lines[1:]:
        # partition() splits on the first comma only; splitting on every comma
        # and re-joining would delete the ',' tokens inside the tweet and
        # leave empty-string tokens behind.
        _, _, text = line.strip('\n').partition(',')
        sentences.append(text.strip().split(' '))
    return sentences

## Train Word2Vec

In [9]:
# Load all three corpora into memory as lists of token lists.
# train_x / train_y: labeled tweets and their integer labels.
# train_x_no_label: unlabeled tweets (tokens only).
print("loading training data ... ", end='')
train_x, train_y = load_training_data('training_label.txt', True)
train_x_no_label = load_training_data('training_nolabel.txt', False)
print("done.")

# test_x: test tweets with the id column removed.
print("loading testing data ... ", end='')
test_x = load_testing_data('testing_data.txt')
print("done.")

loading training data ... done.
loading testing data ... done.


In [10]:
import os

from gensim.models import word2vec

# Train skip-gram (sg=1) embeddings on every available sentence: labeled, test and unlabeled.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names — confirm the installed version.
print('training word2vec model ... ', end='')
model = word2vec.Word2Vec(train_x + test_x + train_x_no_label, size=250, window=5, min_count=5, workers=12, iter=10, sg=1)
print('done.')

print('saving word2vec model ... ', end='')
# The output directory is not created anywhere else, so make sure it exists before saving.
os.makedirs('model', exist_ok=True)
model.save('model/word2vec_all.model')
print('done.')

training word2vec model ... done.
saving word2vec model ... done.


In [11]:
from gensim.models import word2vec

# Sanity-check the saved word2vec model by reloading it from disk.
embedding = word2vec.Word2Vec.load('model/word2vec_all.model')
embedding_dim = embedding.vector_size
# Query the reloaded model (not the in-memory `model` variable) so this cell really tests
# what was saved and keeps working after `model` is rebound later in the notebook.
embedding.wv.similar_by_word('haha', topn=20)

[('hahaha', 0.8560881018638611),
 ('lol', 0.8301147222518921),
 ('hehe', 0.759688675403595),
 ('hahah', 0.7439364194869995),
 ('ahaha', 0.7049542665481567),
 ('lmao', 0.6756629943847656),
 ('hahahaha', 0.6402173638343811),
 ('hah', 0.6286949515342712),
 ('aha', 0.617099940776825),
 ('hahahah', 0.5905086994171143),
 ('yeahp', 0.5903643369674683),
 ('hehehe', 0.5889112949371338),
 ('xd', 0.5817236304283142),
 ('hahahahha', 0.5782143473625183),
 ('lmfao', 0.5755268931388855),
 ('hahaa', 0.5672986507415771),
 ('hahha', 0.5605295896530151),
 ('hhahahaha', 0.5558002591133118),
 ('nawh', 0.5549719333648682),
 ('ahah', 0.5528687238693237)]

## Data Preprocess

In [12]:
class Preprocess():
    """Map tokenised sentences to fixed-length index tensors over a word2vec vocabulary.

    Typical use: construct, call ``make_embedding()`` to build the vocabulary
    and the embedding matrix, then call ``sentence_word2idx()`` to encode the
    sentences given to the constructor.

    Args:
        sentences: List of sentences, each a list of string tokens.
        sen_len: Length every encoded sentence is truncated or padded to.
        w2v_path: Path of the saved word2vec model to load.
    """

    def __init__(self, sentences, sen_len, w2v_path='model/word2vec_all.model'):
        self.w2v_path = w2v_path
        self.sentences = sentences
        self.sen_len = sen_len
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []

    def get_w2v_model(self):
        """Load the word2vec model from ``w2v_path`` and record its vector size."""
        self.embedding = word2vec.Word2Vec.load(self.w2v_path)
        self.embedding_dim = self.embedding.vector_size

    def add_embedding(self, word):
        """Append ``word`` to the vocabulary with a uniformly initialised random vector."""
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def make_embedding(self):
        """Build the vocabulary and return the embedding matrix as a tensor.

        Rows follow the iteration order of the word2vec vocabulary, followed by
        one row each for the special tokens ``<PAD>`` and ``<UNK>``.
        """
        self.get_w2v_model()
        # Start from a clean state so calling this method twice does not append
        # to (or fail on) the results of the previous call.
        self.idx2word = []
        self.word2idx = {}

        wv = self.embedding.wv
        # gensim 4 exposes the vocabulary as `key_to_index`; gensim 3 as `vocab`.
        vocab = getattr(wv, 'key_to_index', None)
        if vocab is None:
            vocab = wv.vocab

        vectors = []
        for word in vocab:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            # Look vectors up through `.wv`; indexing the model object itself
            # is deprecated in gensim 3 and removed in gensim 4.
            vectors.append(wv[word])

        if vectors:
            # Stack into one ndarray first: building a tensor from a Python
            # list of ndarrays is very slow.
            self.embedding_matrix = torch.tensor(np.stack(vectors))
        else:
            self.embedding_matrix = torch.empty(0, self.embedding_dim)

        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        return self.embedding_matrix

    def pad_sequence(self, sentence):
        """Return a copy of ``sentence`` truncated or ``<PAD>``-padded to ``sen_len`` items."""
        padded = list(sentence[:self.sen_len])  # copy: never mutate the caller's list
        missing = self.sen_len - len(padded)
        if missing > 0:
            padded.extend([self.word2idx['<PAD>']] * missing)
        return padded

    def sentence_word2idx(self):
        """Encode every sentence as vocabulary indices and return them as one LongTensor.

        Words missing from the vocabulary are mapped to the ``<UNK>`` index.
        """
        sentence_list = []
        for sen in self.sentences:
            sentence_idx = [
                self.word2idx[word] if word in self.word2idx else self.word2idx['<UNK>']
                for word in sen
            ]
            sentence_list.append(self.pad_sequence(sentence_idx))
        return torch.LongTensor(sentence_list)

## Dataset

In [13]:
from torch.utils import data

class TwitterDataset(data.Dataset):
    """Dataset wrapper around encoded sentences and, optionally, their labels.

    Args:
        X: Indexable collection of samples.
        y: Indexable collection of labels aligned with ``X``, or ``None`` for
            unlabeled data.
    """

    def __init__(self, X, y):
        self.data, self.label = X, y

    def __len__(self):
        """Number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)``, or only ``sample`` when no labels were given."""
        sample = self.data[idx]
        if self.label is not None:
            return sample, self.label[idx]
        return sample

## Model

In [14]:
import torch.nn as nn

class LSTM_Net(nn.Module):
    """LSTM binary classifier on top of a pretrained embedding matrix.

    Args:
        embedding: 2-D float tensor used as the embedding weights; its second
            dimension is the embedding width.
        embedding_dim: Expected embedding width. Must equal
            ``embedding.size(1)``.
        hidden_dim: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied before the final linear layer.
        fix_embedding: When true the embedding weights are frozen
            (``requires_grad`` is false); otherwise they are trained.

    Raises:
        ValueError: If ``embedding_dim`` disagrees with the matrix width.
    """

    def __init__(self, embedding, embedding_dim, hidden_dim, num_layers, dropout=0.5, fix_embedding=True):
        super(LSTM_Net, self).__init__()
        # Fail early with a clear message; a mismatch would otherwise only
        # surface as a shape error inside the LSTM on the first forward pass.
        if embedding_dim != embedding.size(1):
            raise ValueError(
                'embedding_dim ({}) does not match the embedding matrix width ({})'.format(
                    embedding_dim, embedding.size(1)))
        self.embedding = nn.Embedding.from_pretrained(embedding, freeze=fix_embedding)
        self.embedding_dim = embedding.size(1)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = nn.Sequential(nn.Dropout(dropout), nn.Linear(hidden_dim, 1), nn.Sigmoid())

    def forward(self, inputs):
        """Return one sigmoid score per sentence, shape ``(batch, 1)``.

        ``inputs`` holds word indices with the batch dimension first.
        """
        inputs = self.embedding(inputs)
        x, _ = self.lstm(inputs, None)
        # Keep only the LSTM output at the last time step of each sequence.
        x = x[:, -1, :]
        x = self.classifier(x)
        return x

## Train

In [15]:
def evaluation(outputs, labels):
    """Count how many thresholded predictions equal their labels.

    Args:
        outputs: Tensor of scores; values ``>= 0.5`` count as class 1, the
            rest as class 0.
        labels: Tensor of 0/1 labels with the same shape as ``outputs``.

    Returns:
        The number of matching positions as a Python int.
    """
    # Threshold into a new tensor so the caller's `outputs` is left untouched.
    predictions = (outputs >= 0.5).to(outputs.dtype)
    return torch.sum(torch.eq(predictions, labels)).item()

In [16]:
def training(batch_size, n_epoch, lr, model_dir, train_loader, val_loader, model, device):
    """Train ``model`` with Adam and binary cross-entropy, saving the best checkpoint.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation accuracy improves, the whole model object is saved to
    ``<model_dir>/best_model.model``.

    Args:
        batch_size: Kept for backward compatibility; accuracy is now computed
            from the actual number of samples in each batch.
        n_epoch: Number of passes over ``train_loader``.
        lr: Learning rate for Adam.
        model_dir: Directory the best model is written to (created if missing).
        train_loader: Iterable yielding ``(inputs, labels)`` training batches.
        val_loader: Iterable yielding ``(inputs, labels)`` validation batches.
        model: Module returning one score in [0, 1] per input sample.
        device: Device the batches are moved to.
    """
    import os

    loss_func = nn.BCELoss() # Binary Cross Entropy Loss
    train_batch = len(train_loader)
    val_batch = len(val_loader)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    best_acc = 0
    print('Start training ...')
    for epoch in range(n_epoch):
        total_loss, total_correct, total_seen = 0, 0, 0
        model.train()
        for i, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)
            optimizer.zero_grad()
            # reshape(-1) keeps a 1-D tensor even for a batch of one sample,
            # where a bare squeeze() would yield a 0-d tensor that no longer
            # matches the shape of `labels`.
            outputs = model(inputs).reshape(-1)
            loss = loss_func(outputs, labels)
            loss.backward()
            optimizer.step()
            correct = evaluation(outputs.detach(), labels)
            seen = labels.size(0)
            total_correct += correct
            total_seen += seen
            total_loss += loss.item()
            # '\r' rewrites one progress line instead of printing one line per batch.
            print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                epoch+1, i+1, train_batch, loss.item(), correct*100/seen), end='\r')
        # Divide by the samples actually seen: the last batch may be smaller
        # than batch_size, which previously deflated the reported accuracy.
        print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(
            total_loss/max(train_batch, 1), total_correct/max(total_seen, 1)*100))

        model.eval()
        with torch.no_grad():
            total_loss, total_correct, total_seen = 0, 0, 0
            for i, (inputs, labels) in enumerate(val_loader):
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)
                outputs = model(inputs).reshape(-1)
                loss = loss_func(outputs, labels)
                total_correct += evaluation(outputs, labels)
                total_seen += labels.size(0)
                total_loss += loss.item()
            val_acc = total_correct/max(total_seen, 1)
            print('valid | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/max(val_batch, 1), val_acc*100))
            if val_acc > best_acc:
                best_acc = val_acc
                print('saving model with acc {:.3f} ... '.format(val_acc*100), end='')
                # The full module is pickled (not just its state_dict) because
                # the prediction step reloads it with a plain torch.load.
                torch.save(model, "{}/best_model.model".format(model_dir))
                print('done.')
        print('--------------------------------------')

## Testing

In [28]:
def testing(batch_size, test_loader, model, device):
    model.eval()
    ret_output = []
    with torch.no_grad():
        for i, inputs in enumerate(test_loader):
            inputs = inputs.to(device, dtype=torch.long)
            outputs = model(inputs)
            outputs = outputs.squeeze()
            outputs[outputs >= 0.5] = 1
            outputs[outputs < 0.5] = 0
            ret_output += outputs.int().tolist()
    return ret_output

## Main

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory the best model checkpoint is written to by training().
model_dir = 'model'

# hyperparameters
batch_size = 128
n_epoch = 10
lr = 0.001

# preprocess: build the vocabulary / embedding matrix from the saved word2vec model and
# encode every labeled sentence as sen_len word indices.
preprocess = Preprocess(train_x, sen_len=20, w2v_path='model/word2vec_all.model')
embedding = preprocess.make_embedding()
train_x_idx = preprocess.sentence_word2idx()
train_y_tensor = torch.LongTensor(train_y)

# select validation data from training data: the first 10% of the samples (in file order,
# no shuffling) are held out for validation, the rest is used for training.
val_size = int(0.1 * len(train_x_idx))
X_train, X_val = train_x_idx[val_size:], train_x_idx[:val_size]
y_train, y_val = train_y_tensor[val_size:], train_y_tensor[:val_size]

train_dataset = TwitterDataset(X_train, y_train)
val_dataset = TwitterDataset(X_val, y_val)
# Only the training loader shuffles; validation keeps a fixed order.
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=8)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=batch_size,
                                           shuffle=False,
                                           num_workers=8)

# embedding_dim=250 matches the `size=250` used when the word2vec model was trained.
# fix_embedding is left at its default, so the pretrained embedding weights stay frozen.
# NOTE: this rebinds the notebook-level name `model`, which an earlier cell used for the
# word2vec model.
model = LSTM_Net(embedding, embedding_dim=250, hidden_dim=150, num_layers=1, dropout=0.5)
model.to(device)

training(batch_size, n_epoch, lr, model_dir, train_loader, val_loader, model, device)

## Predict

In [29]:
print('Predicting for test data ...')

# preprocess: encode the test sentences with the same vocabulary and sentence length that
# were used for training. The returned embedding matrix is not needed here because the
# saved model carries its own embedding weights; the call is what builds the vocabulary.
preprocess = Preprocess(test_x, sen_len=20, w2v_path='model/word2vec_all.model')
embedding = preprocess.make_embedding()
test_x_idx = preprocess.sentence_word2idx()
test_dataset = TwitterDataset(X=test_x_idx, y=None)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=8)

print('loading model ...', end='')
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime, and the
# explicit .to(device) guarantees the model sits where testing() moves the inputs.
# NOTE: torch.load unpickles the file — only load checkpoints produced by this notebook.
model = torch.load('{}/best_model.model'.format(model_dir), map_location=device)
model.to(device)
print('done.')

outputs = testing(batch_size, test_loader, model, device)

# One row per test sentence; ids are the positions in the test file.
df = pd.DataFrame({"id":[str(i) for i in range(len(test_x))],"label":outputs})
print("saving csv ...")
df.to_csv('predict.csv', index=False)

print("Finish Predicting")

Predicting for test data ...
loading model ...done.
saving csv ...
Finish Predicting
