In [1]:
import pandas as pd
import json
import torch   
from torchtext import data 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os
from torchtext.vocab import Vectors # downloaded word embedding
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from ast import literal_eval # convert string to list
from sklearn.preprocessing import MultiLabelBinarizer
import re

import torch.nn.functional as F
from torch.autograd import Variable
from sklearn import preprocessing

In [2]:
# read dataset and testset.
dataset = pd.read_csv("pre-processed data/new_label_dataset.csv")[['content', 'categories']]
testset = pd.read_csv("pre-processed data/new_label_testset.csv")[['content', 'categories']]

# convert string to list.
dataset['categories'] = dataset['categories'].apply(lambda x:literal_eval(x))
testset['categories'] = testset['categories'].apply(lambda x:literal_eval(x))

# seperate 'categories' column.
cat_tr_list = []
for i in dataset['categories']:
    cat_tr_list.append(i)
    
cat_te_list = []
for i in testset['categories']:
    cat_te_list.append(i)

mlb_tr = MultiLabelBinarizer()
labels_tr = mlb_tr.fit_transform(cat_tr_list)
categories_tr = mlb_tr.classes_

mlb_te = MultiLabelBinarizer()
labels_te = mlb_te.fit_transform(cat_te_list)
categories_te = mlb_te.classes_

# convert list to columns.
for i, cat in enumerate(categories_tr):
    dataset[cat] = labels_tr[:, i]
    

for i, cat in enumerate(categories_te):
    testset[cat] = labels_te[:, i]
    
    
dataset.to_csv("pre-processed data/new_label_dataset_1.csv")
testset.to_csv("pre-processed data/new_label_testset_1.csv")

In [3]:
dataset.shape

(29114, 27)

In [4]:
# for imbalance dataset. reduction = 'none' in loss function.
weight = torch.tensor(labels_tr.sum(axis=0)/sum(labels_tr.sum(axis=0)))

weight

tensor([0.0213, 0.0016, 0.0224, 0.0324, 0.0110, 0.0178, 0.0798, 0.0559, 0.0019,
        0.0807, 0.0026, 0.0844, 0.0742, 0.0022, 0.1038, 0.0089, 0.1129, 0.0180,
        0.0316, 0.0032, 0.0994, 0.0169, 0.0811, 0.0019, 0.0338],
       dtype=torch.float64)

In [5]:
features_dataset = pd.read_csv("pre-processed data/label_feature_dataset.csv")
features_testset = pd.read_csv("pre-processed data/label_feature_testset.csv")

col = list(features_dataset.columns)[:-2]

tr_X, te_X = features_dataset[col].values, features_testset[col].values

ml_weight = np.ones((1, 14))
ml_weight[0, :4] = 1.5

# range to 0 and 1.
tr_X = preprocessing.scale(tr_X)*ml_weight
te_X = preprocessing.scale(te_X)*ml_weight


In [6]:
tr_X.shape

(29114, 14)

In [7]:
torch.manual_seed(2020)

# the same as system_2.
TEXT = data.Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)

fields = [(v, LABEL) for v in categories_tr]

fields = [(None, None), ('content', TEXT), (None, None)] + fields

dataset, testset = data.TabularDataset.splits(
    path = 'pre-processed data/',
    train = 'new_label_dataset_1.csv',
    test = 'new_label_testset_1.csv',
    format = 'csv',
    fields = fields,
    skip_header = True
)

In [8]:
# tr_X, te_X = dataset.split(split_ratio=0.8, random_state = random.seed(2020))
# tr_x, val_x = tr_X.split(split_ratio=0.7, random_state = random.seed(2020))

# load downloaded glove word embedding.
cache = '.vector_cache'
if not os.path.exists(cache): os.mkdir(cache)
vectors = Vectors(name='./glove.840B.300d.txt', cache=cache)

# create vocab.
TEXT.build_vocab(dataset, min_freq=3, vectors=vectors)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

print("Top words: ", TEXT.vocab.freqs.most_common(5))  

Size of TEXT vocabulary: 6392
Top words:  [('earthquake', 2846), ('school', 2577), ('shoot', 2453), ('nepal', 2414), ('philippine', 2358)]


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

# Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits((dataset, testset), 
                                                            batch_sizes = (1, 1),
                                                            sort=False,
                                                            sort_key=None,
                                                            shuffle=False,
                                                            sort_within_batch=False,
                                                            repeat=False,
                                                            device=device)

In [10]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars, vector):
            self.dl, self.x_var, self.y_vars, self.vector = dl, x_var, y_vars, vector # x_var is content, y_var is label.

    def __iter__(self):
        for i, batch in enumerate(self.dl):
            x = getattr(batch, self.x_var) # return value of x_var "content" in object batch.
            y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            z = self.vector[i*len(batch):i*len(batch)+len(batch), :]
            yield (x, y, z)

    def __len__(self):
            return len(self.dl)

train_dl = BatchWrapper(train_iterator, "content", list(categories_tr), tr_X)
valid_dl = BatchWrapper(valid_iterator, "content", list(categories_tr), te_X)

### Customer layer:

In [11]:
import math
import torch
from torch import nn

class ElementWiseLinear(nn.Module):
    __constants__ = ['n_features']
    n_features: int
    weight: torch.Tensor
    def __init__(self, n_features: int, bias: bool = True) -> None:
        super(ElementWiseLinear, self).__init__()
        self.n_features = n_features
        self.weight = nn.Parameter(torch.Tensor(n_features, 1))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(n_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    def reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = torch.mul(input, self.weight)
        if self.bias is not None:
            output += self.bias
        return output
    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.n_features, self.n_features, self.bias is not None
        )

In [12]:
class GRU(nn.Module):
    def __init__(self, hidden_dim=30, emb_dim=300):
        super().__init__()
        
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers=1, dropout=0.2, bidirectional = True)
        self.customfc = ElementWiseLinear(14, bias=False)
        self.fc = nn.Linear(hidden_dim+14, 64)
        self.fc1 = nn.Linear(64, 25)

    def forward(self, seq, data):
        embed = self.embedding(seq)
        _, hidden = self.gru(embed)
        embed1 = (hidden[-2,:,:]+hidden[-1,:,:])/2.0
        embed1 = embed1.T
        tensor_data = torch.tensor(data).type(torch.FloatTensor).T
        tensor_data_1 = self.customfc(tensor_data)
        vector = torch.cat((embed1, tensor_data_1)).T
        output = self.fc(vector)
        output1 = self.fc1(F.relu(output))
        return output1

In [18]:
def SGD(epochs=10, hidden_dim = 30):
    
    tr_loss_list, te_loss_list = [], []
    
    model = GRU(hidden_dim, emb_dim=300)

    optimizer = optim.SGD(model.parameters(), lr=0.3, weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss(reduction='none')

    # pre-trained Glove.
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    for epoch in range(epochs):
        tr_loss, val_loss = 0.0, 0.0
        
        # training 
        model.train() 
        for x, y, z in train_dl: 
            optimizer.zero_grad()
            preds = model(x, z)
            loss = criterion(preds, y)
            loss = (loss * weight).mean()
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()*7

        tr_loss /= len(train_dl)

        # testing
        model.eval()
        target_list, preds_list = [], []
        with torch.no_grad():
            for x, y, z in valid_dl:
                preds = model(x, z)
                loss = criterion(preds, y)
                loss = (loss * weight).mean()
                preds = (1/(1 + np.exp(-preds))).numpy() # preds.numpy()
                preds_list.extend(preds)
                target_list.extend(y.numpy())
                val_loss += loss.item()*7

        val_loss /= len(valid_dl)
        
        tr_loss_list.append(tr_loss)
        te_loss_list.append(val_loss)

        print('Epoch: ',epoch,' Training Loss: ',tr_loss,' | Validation Loss: ',val_loss)
        
        if len(te_loss_list) > 2:
            if (te_loss_list[-2] - te_loss_list[-1]) < 0.000001: #(te_loss_list[-2] - te_loss_list[-1]) > 0 and 
                break
        
    return target_list, preds_list, tr_loss_list, te_loss_list

In [23]:
target_list, preds_list, tr_loss, te_loss = SGD(11, 300)

Epoch:  0  Training Loss:  0.09346826095240827  | Validation Loss:  0.177011763406562
Epoch:  1  Training Loss:  0.0832686817530013  | Validation Loss:  0.1723366556954481
Epoch:  2  Training Loss:  0.08174097439472003  | Validation Loss:  0.1666514802469139
Epoch:  3  Training Loss:  0.08067711230065715  | Validation Loss:  0.16325217443257656
Epoch:  4  Training Loss:  0.07998985743192746  | Validation Loss:  0.16110124012316776
Epoch:  5  Training Loss:  0.07951295707458224  | Validation Loss:  0.15926278687191495
Epoch:  6  Training Loss:  0.07910366818347207  | Validation Loss:  0.15829222160455209
Epoch:  7  Training Loss:  0.07869473523529046  | Validation Loss:  0.15789359003054268
Epoch:  8  Training Loss:  0.07830653306528583  | Validation Loss:  0.15764020541516185
Epoch:  9  Training Loss:  0.07793431242497802  | Validation Loss:  0.15732502531785245
Epoch:  10  Training Loss:  0.07758989372660925  | Validation Loss:  0.1571633225405234


In [27]:
# epoch = [i for i in range(len(tr_loss))]

# plt.figure(figsize=(5,5))
# plt.plot(epoch, tr_loss, label='train')
# plt.plot(epoch, te_loss, label='test')
# plt.legend()
# plt.show()

### Result:

In [25]:
preds_te_arr = np.array(preds_list).reshape((-1, 25))

preds_te_arr[preds_te_arr >= 0.5] = 1
preds_te_arr[preds_te_arr < 0.5] = 0

te_y = np.array(target_list).reshape((-1, 25))

In [26]:
preds_te_arr.shape

score = 0
acc = 0

for i in range(preds_te_arr.shape[1]):
    score += f1_score(te_y[:, i], preds_te_arr[:, i],average='macro')
    acc += accuracy_score(te_y[:, i], preds_te_arr[:, i])
    
print("Macro-average F1-score: ", score/25)
print("Average accuracy: ", acc/25)

Macro-average F1-score:  0.4950096476241462
Average accuracy:  0.8851302115173004
