# multi-label tweets classification:

In [8]:
import pandas as pd
import json
import torch   
from torchtext import data 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os
from torchtext.vocab import Vectors
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from ast import literal_eval
from sklearn.preprocessing import MultiLabelBinarizer
import re

import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
dataset = pd.read_csv("pre-processed data/new_label_dataset.csv")[['content', 'categories']]

# convert string to list.
dataset['categories'] = dataset['categories'].apply(lambda x:literal_eval(x))

# seperate 'categories' column.
cat_list = []
for i in dataset['categories']:
    cat_list.append(i)

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(cat_list)
categories = mlb.classes_


for i, cat in enumerate(categories):
    dataset[cat] = labels[:, i]
    
dataset.to_csv("pre-processed data/new_label_dataset_1.csv")

In [3]:
print(categories)
print(labels.shape)

['Advice' 'CleanUp' 'ContextualInformation' 'Discussion' 'Donations'
 'EmergingThreats' 'Factoid' 'FirstPartyObservation' 'GoodsServices'
 'Hashtags' 'InformationWanted' 'Irrelevant' 'Location' 'MovePeople'
 'MultimediaShare' 'NewSubEvent' 'News' 'Official' 'OriginalEvent'
 'SearchAndRescue' 'Sentiment' 'ServiceAvailable' 'ThirdPartyObservation'
 'Volunteer' 'Weather']
(37293, 25)


In [4]:
torch.manual_seed(2020)

# loading custom dataset
TEXT = data.Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)

fields = [(v, LABEL) for v in categories]

fields = [(None, None), ('content', TEXT), (None, None)] + fields

dataset=data.TabularDataset(path = 'pre-processed data/new_label_dataset_1.csv',format = 'csv',fields = fields,skip_header = True)

print(vars(dataset.examples[0]))

{'content': ['philippine', 'flood', 'worsen', 'death', 'toll', 'hit', 'wake', 'gener'], 'Advice': '0', 'CleanUp': '0', 'ContextualInformation': '0', 'Discussion': '0', 'Donations': '0', 'EmergingThreats': '0', 'Factoid': '1', 'FirstPartyObservation': '0', 'GoodsServices': '0', 'Hashtags': '0', 'InformationWanted': '0', 'Irrelevant': '0', 'Location': '0', 'MovePeople': '0', 'MultimediaShare': '0', 'NewSubEvent': '0', 'News': '1', 'Official': '0', 'OriginalEvent': '0', 'SearchAndRescue': '0', 'Sentiment': '0', 'ServiceAvailable': '0', 'ThirdPartyObservation': '1', 'Volunteer': '0', 'Weather': '0'}


In [5]:
tr_X, te_X = dataset.split(split_ratio=0.8, random_state = random.seed(2020))
tr_x, val_x = tr_X.split(split_ratio=0.7, random_state = random.seed(2020))

# load downloaded glove word embedding.
cache = '.vector_cache'
if not os.path.exists(cache): os.mkdir(cache)
vectors = Vectors(name='./glove.840B.300d.txt', cache=cache)

# create vocab.
TEXT.build_vocab(tr_X, min_freq=3, vectors=vectors)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

print("Top words: ", TEXT.vocab.freqs.most_common(5))  

Size of TEXT vocabulary: 6543
Top words:  [('shoot', 3323), ('earthquake', 2530), ('people', 2370), ('philippine', 2132), ('school', 2119)]


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits((tr_x, val_x), 
                                                            batch_sizes = (64, 64),
                                                            sort_key = lambda x: len(x.content),
                                                            sort_within_batch=False,
                                                            repeat=False,
                                                            device=device)

In [7]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
            self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x 

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper

            #if self.y_vars is None: # we will concatenate y into a single tensor
            y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            yield (x, y)

    def __len__(self):
            return len(self.dl)

train_dl = BatchWrapper(train_iterator, "content", list(categories))
valid_dl = BatchWrapper(valid_iterator, "content", list(categories))

In [51]:
class LSTM(nn.Module):
    def __init__(self, hidden_dim=30, emb_dim=300):
        super().__init__()
        
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=0.2, bidirectional = True)
        self.fc = nn.Linear(hidden_dim*2, 25)

    def forward(self, seq):
        embed = self.embedding(seq)
        hidden, _ = self.lstm(embed)
        embed1 = hidden[-1,:,:] # torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        output = self.fc(embed1)
        preds = torch.sigmoid(output) # sigmoid must be used before nn.BCEWithLogitsLoss()
        return preds

In [52]:
hidden_dim = 30
nl = 3

model = LSTM(hidden_dim, emb_dim=300)

opt = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-8)
loss_func = nn.BCEWithLogitsLoss()

# pre-trained Glove.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

epochs = 30

for epoch in range(epochs):
    tr_loss = 0.0
    model.train() # turn on training mode
    for x, y in train_dl: 
        opt.zero_grad()
        preds = model(x).squeeze()
        loss = loss_func(preds, y)
        loss.backward()
        opt.step()
        tr_loss += loss.item() * x.size(0)

    tr_loss /= len(train_dl)
    
    val_loss = 0.0
    model.eval() # turn on training mode
    index = 0
    with torch.no_grad():
        for x, y in valid_dl:
            preds = model(x).squeeze()  
            acc = torch.abs(preds - y).view(-1)
            acc = (1. - acc.sum() / acc.size()[0]) * 100
            loss = loss_func(preds, y)
            val_loss += loss.item() * x.size(0)
            index += 1
    acc/=index
        
    val_loss /= len(valid_dl)

    print('Epoch: ',epoch,' Training Loss: ',tr_loss,' | Validation Loss: ',val_loss)

Epoch:  0  Training Loss:  20.931651260874688  | Validation Loss:  7.253626233339309
Epoch:  1  Training Loss:  20.263823487525322  | Validation Loss:  7.22932653597423
Epoch:  2  Training Loss:  20.148782229022512  | Validation Loss:  7.2226875045469825


KeyboardInterrupt: 

In [26]:
# from sklearn.metrics import hamming_loss
# y_pred = [1, 2, 3, 4]
# y_true = [2, 2, 3, 4]
# print(hamming_loss(y_true, y_pred))
 
#print(hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2))))


0.25
0.75


tutorial: http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/


In [None]:
# import torch

# dataset = {
#     "input": torch.tensor([[1, 2, 3, 24, 25, 25, 25, 25],
#                            [4, 5, 2, 6, 24, 25, 25, 25],
#                            [7, 8, 9, 24, 24, 25, 25, 25],
#                            [4, 10, 11, 12, 24, 25, 25, 25],
#                            [13, 14, 2, 15, 2, 16, 17, 24],
#                            [18, 19, 20, 21, 22, 23, 3, 24]], dtype=torch.long).permute(1, 0),
#      "target": torch.tensor([[1., 0., 1.],
#                              [0., 1., 0.],
#                              [1., 0., 1.],
#                              [0., 1., 0.],
#                              [1., 1., 0.],
#                              [0., 0., 1.]], dtype=torch.float32),
# }

# class MltcModel(torch.nn.Module):
#     def __init__(self, vocab_size, emb_dim, hid_dim, rnn_num_layers=1):
#         super().__init__()

#         self.embedding = torch.nn.Embedding(vocab_size, emb_dim)
#         self.rnn = torch.nn.GRU(emb_dim, hid_dim, bidirectional=True, num_layers=rnn_num_layers)
#         self.l1 = torch.nn.Linear(hid_dim * 2, 256) # * rnn_num_layers 512*2 = 1024
#         self.l2 = torch.nn.Linear(256, 3)

#     def forward(self, samples):
        
#         # samples word index.
#         embedded = self.embedding(samples)

#         _, last_hidden = self.rnn(embedded)

#         hidden_list = [last_hidden[i, :, :] for i in range(last_hidden.size()[0])] # 512
#         encoded = torch.cat(hidden_list, dim=1) # 1024
#         encoded = torch.nn.functional.relu(self.l1(encoded)) # 256
#         encoded = torch.nn.functional.sigmoid(self.l2(encoded)) # 2
#         return encoded

# model = MltcModel(26, 256, 512, rnn_num_layers=1)
# criterion = torch.nn.MultiLabelSoftMarginLoss()
# optimizer = torch.optim.Adam(model.parameters())

# for epoch in range(10):
#     optimizer.zero_grad()
#     output = model(dataset["input"])
#     loss = criterion(output, dataset["target"])
#     loss.backward()
#     optimizer.step()
#     with torch.no_grad():
#         acc = torch.abs(output - dataset["target"]).view(-1)
#         acc = (1. - acc.sum() / acc.size()[0]) * 100
#         print(f'Epoch({epoch+1}) loss: {loss.item()}, accuracy: {acc:.1f}%')