## Multi-class Feature Engineering

In [None]:
# load pre-processed dataset.
import pandas as pd
import json
import torch   
from torchtext import data 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import random
import os
from torchtext.vocab import Vectors
import torch.nn as nn
import torch.optim as optim
from sklearn import preprocessing
import torch.nn.functional as F

In [None]:
testset = pd.read_csv("pre-processed data/new_label_testset.csv")[['priority', 'content']]
dataset = pd.read_csv("pre-processed data/new_label_dataset.csv")[['priority', 'content']]

dataset = dataset[dataset['priority'] != 'Unknown'].reset_index(drop=True)
testset = testset[testset['priority'] != 'Unknown'].reset_index(drop=True)

In [None]:
features_dataset = pd.read_csv("pre-processed data/label_feature_dataset.csv")
features_testset = pd.read_csv("pre-processed data/label_feature_testset.csv")

features_dataset = features_dataset[features_dataset['priority'] != 'Unknown'].reset_index(drop=True)
features_testset = features_testset[features_testset['priority'] != 'Unknown'].reset_index(drop=True)

print(features_dataset.shape)

In [None]:
col = list(features_dataset.columns)[:-2]

tr_X, te_X = features_dataset[col].values, features_testset[col].values

In [None]:
# range to 0 and 1.

tr_X = preprocessing.scale(tr_X)
te_X = preprocessing.scale(te_X)

In [None]:
weight = np.ones((1, 14))
weight[0, :4] = 1.5
weight

In [None]:
# tr_X #*= weight
# te_X #*= weight

In [None]:
torch.manual_seed(2020)

TEXT = data.Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
LABEL = data.LabelField(dtype = torch.float, batch_first=True)

fields = [('priority', LABEL), ('content', TEXT)] 

dataset, testset = data.TabularDataset.splits(
    path = 'pre-processed data/',
    train = 'dataset_priority.csv',
    test = 'testset_priority.csv',
    format = 'csv',
    fields = fields,
    skip_header = True
)

In [None]:
# load downloaded glove word embedding.
cache = '.vector_cache'
if not os.path.exists(cache): os.mkdir(cache)
vectors = Vectors(name='./glove.840B.300d.txt', cache=cache)

# create vocab.
TEXT.build_vocab(dataset, min_freq=3, vectors=vectors)
LABEL.build_vocab(dataset)

print("Size of TEXT vocabulary:",len(TEXT.vocab))

print("Size of LABEL vocabulary:",len(LABEL.vocab))

print("Top words: ", TEXT.vocab.freqs.most_common(5))  

# Word dictionary.
print("LABEL vocabulary: ", LABEL.vocab.stoi)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 
# sort_key = lambda x: len(x.content),sort_within_batch=False, Bucket
train_iterator, valid_iterator = data.BucketIterator.splits((dataset, testset), 
                                                            batch_sizes = (1, 1), 
                                                            sort=False,
                                                            sort_key=None,
                                                            shuffle=False,
                                                            sort_within_batch=False,
                                                            repeat=False,
                                                            device=device)

In [None]:
class BatchWrapper:
    def __init__(self, dl, vector):
            self.dl, self.vector = dl, vector # x_var is content, y_var is label.

    def __iter__(self):
        for i, batch in enumerate(self.dl):
            x = batch.content
            y = batch.priority
            z = self.vector[i*len(batch):i*len(batch)+len(batch), :]
            yield (x, y, z)

    def __len__(self):
            return len(self.dl)

train_dl = BatchWrapper(train_iterator, tr_X)
valid_dl = BatchWrapper(valid_iterator, te_X)

In [None]:
import math
import torch
from torch import nn

class ElementWiseLinear(nn.Module):
    __constants__ = ['n_features']
    n_features: int
    weight: torch.Tensor
    def __init__(self, n_features: int, bias: bool = True) -> None:
        super(ElementWiseLinear, self).__init__()
        self.n_features = n_features
        self.weight = nn.Parameter(torch.Tensor(n_features, 1))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(n_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    def reset_parameters(self) -> None:
        nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(self.bias, -bound, bound)
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        output = torch.mul(input, self.weight)
        if self.bias is not None:
            output += self.bias
        return output
    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}, bias={}'.format(
            self.n_features, self.n_features, self.bias is not None
        )

In [None]:
class GRU(nn.Module):
    def __init__(self, hidden_dim=30, emb_dim=300):
        super().__init__()
        
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.gru = nn.GRU(emb_dim, hidden_dim, num_layers=1, dropout=0.2, bidirectional = True)
        self.customfc = ElementWiseLinear(14, bias=False)
        self.fc = nn.Linear(hidden_dim + 14, 128)
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, 4)

    def forward(self, seq, data):
        embed = self.embedding(seq)
        _, hidden = self.gru(embed)
        embed1 = (hidden[-2,:,:]+hidden[-1,:,:])/2.0
        embed1 = embed1.T
        tensor_data = torch.tensor(data).type(torch.FloatTensor).T
        tensor_data_1 = self.customfc(tensor_data)
        vector = torch.cat((embed1, tensor_data_1))
        output = self.fc(vector.T)
        output1 = self.fc1(F.relu(output))
        output2 = self.fc2(F.relu(output1))
        
        return output2

In [None]:
def SGD(epochs=10, hidden_dim = 30, lr=0.001):
    
    tr_loss_list, te_loss_list = [], []
    
    model = GRU(hidden_dim, emb_dim=300)
    
    weight = torch.tensor([0.7220, 0.1742, 0.0941, 0.0348])
    
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=1e-3)
    criterion = nn.CrossEntropyLoss(weight=weight)

    # pre-trained Glove.
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    for epoch in range(epochs):
        tr_loss, val_loss = 0.0, 0.0
        
        # training 
        model.train() 
        for x, y, z in train_dl: 
            optimizer.zero_grad()
            preds = model(x, z)#.squeeze()
            loss = criterion(preds, y.type(torch.LongTensor))
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()

        tr_loss /= len(train_dl)
        tr_loss *= 7
        # testing
        model.eval()
        target_list, preds_list, probs_list = [], [], []
        with torch.no_grad():
            for x, y, z in valid_dl:
                preds = model(x, z)#.squeeze()
                loss = criterion(preds, y.type(torch.LongTensor)) 
                probs = F.softmax(preds).detach().numpy()
                
                probs_list.extend(probs)
                preds_list.extend(preds)
                target_list.extend(y.numpy())
                val_loss += loss.item()

        val_loss /= len(valid_dl)
        val_loss *= 7
        
        tr_loss_list.append(tr_loss)
        te_loss_list.append(val_loss)

        print('Epoch: ',epoch,' Training Loss: ',tr_loss,' | Validation Loss: ',val_loss)
        
        if len(te_loss_list) > 2:
            if (te_loss_list[-2] - te_loss_list[-1]) < 0.00001: # (te_loss_list[-2] - te_loss_list[-1]) > 0 and 
                break
        
    return tr_loss_list, te_loss_list, preds_list, target_list, probs_list, model

In [None]:
train_loss, valid_loss, preds_list, label_list, prob_list, model = SGD(epochs=20, lr=0.0005)


### Result:

In [19]:
# RMSE
score = 0

for i in range(len(label_list)):
    index = int(label_list[i])
    if index == 0: weight = 0.25
    elif index == 1: weight = 0.5
    elif index == 2: weight = 0.75
    elif index == 3: weight = 1
    else: weight = 0
    score += (weight - weight*prob_list[i][index])**2
    
print("RMSE all: ", score/len(label_list))

RMSE all:  0.04910742681724399


In [39]:
# epoch = [i for i in range(len(train_loss))]

# plt.figure(figsize=(5,5))
# plt.plot(epoch, train_loss, label='train')
# plt.plot(epoch, valid_loss, label='test')
# plt.title("testing process")
# plt.legend()
# plt.show()

In [40]:
# inf = dict([i for i in model.named_parameters()])

# inf_list = [i.item() for i in inf['customfc.weight']]