# Multi-label tweet classification

In [1]:
import pandas as pd
import json
import torch   
from torchtext import data 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os
from torchtext.vocab import Vectors # downloaded word embedding
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from ast import literal_eval # convert string to list
from sklearn.preprocessing import MultiLabelBinarizer
import re

import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
# Keep only the tweet text and its list of category labels.
dataset = pd.read_csv("pre-processed data/dataset_aug_cat.csv")[['content', 'categories']]

# The 'categories' cells are stored as strings; parse each back into a Python list.
dataset['categories'] = dataset['categories'].apply(literal_eval)

# Gather the per-row label lists in row order.
cat_list = dataset['categories'].tolist()

# Multi-hot encode the label lists: one output column per distinct category.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(cat_list)
categories = mlb.classes_

# Attach one indicator column per category to the frame.
for i, cat in enumerate(categories):
    dataset[cat] = labels[:, i]

# Save the widened frame; the DataFrame index is written as the first CSV column.
dataset.to_csv("pre-processed data/new_label_dataset_1.csv")

In [3]:
# for imbalance dataset.
# Relative frequency of each label: the number of positives per category
# divided by the total number of positives over all categories.
# NOTE(review): these are class frequencies, so frequent classes get the
# larger values; if the goal is to counteract imbalance, an inverse-frequency
# weighting is probably what is wanted - confirm before using it in the loss.
label_counts = labels.sum(axis=0)
weight = torch.tensor(label_counts / label_counts.sum())

weight

tensor([0.0190, 0.0030, 0.0175, 0.0285, 0.0642, 0.0182, 0.0628, 0.0422, 0.0125,
        0.0926, 0.0268, 0.0806, 0.0640, 0.0120, 0.0851, 0.0259, 0.0904, 0.0144,
        0.0316, 0.0197, 0.0858, 0.0129, 0.0576, 0.0085, 0.0242],
       dtype=torch.float64)

In [4]:
# Show the first row to check the newly added indicator columns.
dataset.head(1)

Unnamed: 0,content,categories,Advice,CleanUp,ContextualInformation,Discussion,Donations,EmergingThreats,Factoid,FirstPartyObservation,...,NewSubEvent,News,Official,OriginalEvent,SearchAndRescue,Sentiment,ServiceAvailable,ThirdPartyObservation,Volunteer,Weather
0,philippine flood worsen death toll hit wake ge...,"[ThirdPartyObservation, Factoid, News]",0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0


### torchtext:
<img src="image/torchtext.png" width="600">

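Main components of `torchtext.data`:

* Field: declares how a column is processed (tokenization, lowercasing, vocabulary / numericalization).
* TabularDataset: loads a CSV/TSV/JSON file into a dataset according to a list of fields.
* Example: a single data record whose attributes are the processed fields.
* Iterator (Iterator, BucketIterator, BPTTIterator): groups examples into batches.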

In [5]:
# Fix the PyTorch RNG seed.
torch.manual_seed(2020)

# the same as system_2.
# Text is split on whitespace and lower-cased; labels are used as-is (no vocabulary).
TEXT = data.Field(sequential=True, tokenize=str.split, lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)

# Column layout of the CSV written earlier: a skipped first column, 'content',
# a skipped third column, then one label column per category.
fields = [(None, None), ('content', TEXT), (None, None)]
fields.extend((name, LABEL) for name in categories)

dataset = data.TabularDataset(
    path='pre-processed data/new_label_dataset_1.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)

# Inspect the first parsed example.
print(vars(dataset.examples[0]))

{'content': ['philippine', 'flood', 'worsen', 'death', 'toll', 'hit', 'wake', 'gener'], 'Advice': '0', 'CleanUp': '0', 'ContextualInformation': '0', 'Discussion': '0', 'Donations': '0', 'EmergingThreats': '0', 'Factoid': '1', 'FirstPartyObservation': '0', 'GoodsServices': '0', 'Hashtags': '0', 'InformationWanted': '0', 'Irrelevant': '0', 'Location': '0', 'MovePeople': '0', 'MultimediaShare': '0', 'NewSubEvent': '0', 'News': '1', 'Official': '0', 'OriginalEvent': '0', 'SearchAndRescue': '0', 'Sentiment': '0', 'ServiceAvailable': '0', 'ThirdPartyObservation': '1', 'Volunteer': '0', 'Weather': '0'}


### use downloaded word embedding:

In [6]:
# Seed Python's RNG and capture its state explicitly.
# `random.seed()` returns None, so the previous `random_state=random.seed(2020)`
# always passed None and was reproducible only through the seeding side effect.
# `split` expects a value returned by `random.getstate()`.
random.seed(2020)
split_state = random.getstate()

# 80/20 train/test split, then a 70/30 train/validation split of the train part.
tr_X, te_X = dataset.split(split_ratio=0.8, random_state=split_state)
tr_x, val_x = tr_X.split(split_ratio=0.7, random_state=split_state)
# NOTE(review): tr_x / val_x are not consumed by the iterator cell, which
# batches tr_X and te_X - confirm whether the validation split should be used.

# load downloaded glove word embedding.
cache = '.vector_cache'
os.makedirs(cache, exist_ok=True)  # no error if the cache directory already exists
vectors = Vectors(name='./glove.840B.300d.txt', cache=cache)

# create vocab from the training split only; words seen fewer than 3 times are dropped.
TEXT.build_vocab(tr_X, min_freq=3, vectors=vectors)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

print("Top words: ", TEXT.vocab.freqs.most_common(5))

Size of TEXT vocabulary: 7252
Top words:  [('nepal', 4937), ('help', 3879), ('earthquake', 3486), ('shoot', 3411), ('people', 2972)]


### BucketIterator:

通过sort_key，BucketIterator将长度相近的数据放到同一个batch内来sample，这样可以最小化每个batch需要padding的个数，提高计算效率。

In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Bucketed iterators: examples are grouped by token count (sort_key) in batches of 64.
# NOTE(review): the iterator named "valid" wraps te_X (the test split), not val_x.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (tr_X, te_X),
    batch_sizes=(64, 64),
    sort_key=lambda example: len(example.content),
    sort_within_batch=False,
    repeat=False,
    device=device,
)

### python - yield:

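The **yield** keyword turns the enclosing function into a generator: values are produced one at a time, on demand, instead of being returned all at once.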

In [20]:
class BatchWrapper:
    """Adapt an iterable of batch objects into (inputs, targets) pairs.

    dl:     iterable of batch objects (must also support len()).
    x_var:  name of the batch attribute holding the model input.
    y_vars: names of the batch attributes holding the individual label tensors.
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl = dl
        self.x_var = x_var    # attribute name of the input (e.g. "content")
        self.y_vars = y_vars  # attribute names of the label columns

    def __iter__(self):
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)
            # Place the per-label tensors side by side along a new dim 1,
            # then cast to float.
            label_columns = [getattr(batch, name) for name in self.y_vars]
            targets = torch.stack(label_columns, dim=1).float()
            yield (inputs, targets)

    def __len__(self):
        # Number of batches, delegated to the wrapped iterable.
        return len(self.dl)

# Wrap both iterators so they yield (content, labels) pairs; each wrapper
# receives its own list of category names as the label attributes.
train_dl, valid_dl = (
    BatchWrapper(iterator, "content", list(categories))
    for iterator in (train_iterator, valid_iterator)
)

### nn.ReLU and F.relu

nn.ReLU作为一个层结构，必须添加到nn.Module容器中才能使用，而F.relu则作为一个函数直接调用。

In [25]:
class GRU(nn.Module):
    """Single-layer bidirectional GRU classifier that returns raw logits.

    Args:
        hidden_dim: hidden size of the GRU, per direction.
        emb_dim: dimensionality of the word embeddings.
        num_classes: number of output logits (default 25, the value that was
            previously hard-coded).
        vocab_size: number of rows in the embedding table. When None, falls
            back to ``len(TEXT.vocab)`` from the notebook's global ``TEXT`` field.

    The network is: embedding -> bi-GRU -> Linear(2*hidden_dim, 64) -> ReLU
    -> Linear(64, num_classes). No sigmoid is applied; pair the output with
    ``nn.BCEWithLogitsLoss``.
    """

    def __init__(self, hidden_dim=30, emb_dim=300, num_classes=25, vocab_size=None):
        super().__init__()

        if vocab_size is None:
            vocab_size = len(TEXT.vocab)

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # The former `dropout=0.2` argument is dropped: nn.GRU applies dropout
        # only between stacked layers, so with num_layers=1 it did nothing
        # except trigger a warning.
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers=1, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 64)
        self.fc1 = nn.Linear(64, num_classes)

    def forward(self, seq):
        """Return logits of shape (batch, num_classes).

        ``seq`` holds token indices laid out as (seq_len, batch), since the
        GRU is created without ``batch_first``.
        """
        embed = self.embedding(seq)
        _, hidden = self.gru(embed)
        # hidden[-2] / hidden[-1] are the final states of the two directions;
        # concatenate them into one (batch, 2*hidden_dim) representation.
        sentence_repr = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        features = F.relu(self.fc(sentence_repr))
        preds = self.fc1(features)  # BCEWithLogitsLoss = sigmoid + BCELoss
        return preds

### reduction = 'none', 'elementwise_mean' and 'sum'

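* mean (the default; named `elementwise_mean` in older PyTorch releases): the sum of the per-element losses divided by the number of elements.
* sum: the sum of the per-element losses.
* none: no reduction; the per-element losses are returned unchanged.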

In [30]:
def SGD(epochs=10, hidden_dim = 30):
    """Train a GRU classifier with plain SGD and evaluate it after every epoch.

    Uses the notebook globals ``GRU``, ``TEXT``, ``train_dl``, ``valid_dl``
    and ``device``.

    Args:
        epochs: number of passes over ``train_dl``.
        hidden_dim: hidden size passed to ``GRU``.

    Returns:
        (target_list, preds_list) collected on ``valid_dl`` during the LAST
        epoch: lists of per-sample NumPy arrays holding the true labels and
        the sigmoid probabilities. Both are empty when ``epochs`` is 0.
    """
    # The batches live on `device`, so the model has to live there as well.
    model = GRU(hidden_dim, emb_dim=300).to(device)

    optimizer = optim.SGD(model.parameters(), lr=0.1)
    criterion = nn.BCEWithLogitsLoss() # reduction='none'
    # nn.MultiLabelSoftMarginLoss() nn.BCELoss()

    # pre-trained Glove.
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    # Defined up front so the return statement is valid even when epochs == 0.
    target_list, preds_list = [], []

    for epoch in range(epochs):
        tr_loss, val_loss = 0.0, 0.0

        # training
        model.train()
        for x, y in train_dl:
            optimizer.zero_grad()
            # No .squeeze(): it would collapse a final batch of size 1 to
            # shape (25,), which no longer matches the (1, 25) target.
            logits = model(x)
            loss = criterion(logits, y) # loss = (loss * weight).mean()
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()

        tr_loss /= len(train_dl)

        # testing
        model.eval()
        target_list, preds_list = [], []
        with torch.no_grad():
            for x, y in valid_dl:
                logits = model(x)
                loss = criterion(logits, y) # loss = (loss * weight).mean()
                # Convert to probabilities and move to the CPU before
                # handing the data to NumPy (required for CUDA tensors).
                probs = torch.sigmoid(logits).cpu().numpy()
                preds_list.extend(probs)
                target_list.extend(y.cpu().numpy())
                val_loss += loss.item()

        val_loss /= len(valid_dl)

        print('Epoch: ',epoch,' Training Loss: ',tr_loss,' | Validation Loss: ',val_loss)

    return target_list, preds_list

target_list, preds_list = SGD(100, 50)

Epoch:  0  Training Loss:  0.3089557653860387  | Validation Loss:  0.2607326168104036
Epoch:  1  Training Loss:  0.2541649442990857  | Validation Loss:  0.25984266320981925
Epoch:  2  Training Loss:  0.25183350246338754  | Validation Loss:  0.25849292943110835
Epoch:  3  Training Loss:  0.24796207018894037  | Validation Loss:  0.25617814487254126
Epoch:  4  Training Loss:  0.24194755275660426  | Validation Loss:  0.2515416602940249
Epoch:  5  Training Loss:  0.23626670148362022  | Validation Loss:  0.2473175660569287
Epoch:  6  Training Loss:  0.23212100363946775  | Validation Loss:  0.24310403947646803
Epoch:  7  Training Loss:  0.22843348322350415  | Validation Loss:  0.23801832359570724
Epoch:  8  Training Loss:  0.2250609983528174  | Validation Loss:  0.2329971644829011
Epoch:  9  Training Loss:  0.22216781854275255  | Validation Loss:  0.22986725052080212
Epoch:  10  Training Loss:  0.21981724837648214  | Validation Loss:  0.22760144167045165
Epoch:  11  Training Loss:  0.21779864

Epoch:  93  Training Loss:  0.14217944086749346  | Validation Loss:  0.16105896688600968
Epoch:  94  Training Loss:  0.14188917657639968  | Validation Loss:  0.16029452757369836
Epoch:  95  Training Loss:  0.1416239685546413  | Validation Loss:  0.15974805155802055
Epoch:  96  Training Loss:  0.14126552657997804  | Validation Loss:  0.1625413619114097
Epoch:  97  Training Loss:  0.14087379014155485  | Validation Loss:  0.15991073538213085
Epoch:  98  Training Loss:  0.14062563159446306  | Validation Loss:  0.15996853288637816
Epoch:  99  Training Loss:  0.14036240254876164  | Validation Loss:  0.15929422611315575


In [31]:
# Stack the collected per-sample probabilities into a matrix with 25 columns.
preds_te_arr = np.array(preds_list).reshape((-1, 25))

# Binarise at a 0.5 threshold, in place: compute both masks on the
# probabilities first, then overwrite.
is_positive = preds_te_arr >= 0.5
is_negative = preds_te_arr < 0.5
preds_te_arr[is_positive] = 1
preds_te_arr[is_negative] = 0

# Ground-truth labels in the same layout.
te_y = np.array(target_list).reshape((-1, 25))

# Macro-averaged F1: the unweighted mean of the per-category F1 scores.
macro_f1 = f1_score(te_y,preds_te_arr,average='macro')
print('F1-Score:', macro_f1)

F1-Score: 0.4452084292337929


tutorial:

* data processing in pytorch: http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

* imbalance dataset in pytorch: https://discuss.pytorch.org/t/multi-label-multi-class-class-imbalance/37573/9