# Neural net with fasttext embeddings

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
#standard libs
import os, sys
from pathlib import Path
from pprint import pprint
import random
import json
import itertools
from datetime import datetime as dt
# ds libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.autonotebook import tqdm
# custom path: move the working directory two levels up so that the `src.` imports
# and the relative `./data` / `models/` paths used below resolve
# (presumably this is the project root — confirm against the repo layout)
os.chdir('../..')

# NOTE(review): presumably a workaround for a duplicate OpenMP runtime being loaded
# by the numeric libraries — confirm it is still required
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'

  app.launch_new_instance()


In [2]:
from src.train.data_utils import load_data, select_test

In [102]:
# %%writefile src/train/neural.py
"""
Load fasttext weights and build a neural net
"""

import io
from typing import List, Dict
import random

import torch
from torch import nn
from tqdm.autonotebook import tqdm
import numpy as np


def rand_true(test_prob):
    """ draw True with probability `test_prob`, otherwise False """
    outcomes = [True, False]
    odds = [test_prob, 1 - test_prob]
    return np.random.choice(outcomes, p=odds)


def select_test(data, test_subsets=None,  test_size=0.3):
    """
    Return True for test set and False for train

    Rows whose `subset` value is listed in `test_subsets` are marked as test
    independently with probability `test_size`; every other row is always train.

    Args:
        data: DataFrame with a `subset` column.
        test_subsets: subset names eligible for the test set (a single name is also
            accepted); None means every subset present in `data`.
        test_size: probability that an eligible row is marked as test.

    Returns:
        Boolean Series aligned with `data.index` (empty Series for empty `data`).
    """
    if test_subsets is None:
        test_subsets = data['subset'].unique().tolist()
    elif isinstance(test_subsets, str):
        # a bare name would otherwise be treated as a collection of characters
        test_subsets = [test_subsets]
    eligible = data['subset'].isin(test_subsets)
    # one uniform draw per row; row-wise `apply` is slow and returns a DataFrame
    # instead of a Series when `data` is empty
    drawn = np.random.random(len(data)) < test_size
    is_test = eligible & drawn
    is_test.name = None
    return is_test



def load_vectors(fname,):
    """
    Read word vectors from a text file

    The first line is parsed as two integers (number of vectors and dimension).
    Every following non-blank line is split on single spaces into a token followed
    by its float components.

    Args:
        fname: path of the vectors file.

    Returns:
        Tuple `(data, d)`: dict mapping token -> list of floats, and the dimension
        read from the header line.
    """
    data = {}
    # context manager releases the handle even when a line fails to parse
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        for line in tqdm(fin, desc='lines', total=n):
            tokens = line.rstrip().split(' ')
            if tokens == ['']:
                # blank line: would otherwise register an empty token with an empty vector
                continue
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data, d


def weights_to_tensor(topic_weights: Dict[str, float], classes: List[str]) -> torch.Tensor:
    """ build a vector of len(classes) holding each topic weight at the position of its class, zeros elsewhere """
    weight_vec = torch.zeros(len(classes))
    for topic in topic_weights:
        position = classes.index(topic)
        weight_vec[position] = topic_weights[topic]
    return weight_vec


class EmbeddingNet(nn.Module):
    """
    Embedding network to convert tokenized text to word vectors

    The lookup table is stored as a plain tensor attribute created with
    `requires_grad=False`, so it is not trained.
    """
    def __init__(self, vectors: Dict[str, List[float]], dim: int):
        """
        Args:
            vectors: mapping word -> vector with `dim` components.
            dim: dimensionality of each word vector.
        """
        super().__init__()
        self.vocab = {w: i for i, w in enumerate(vectors)}
        self.dim = dim
        # inverse mapping: row index -> word
        self.id2word = {i: w for w, i in self.vocab.items()}
        self.embeddings = torch.zeros(len(self.vocab), dim, requires_grad=False)
        for w, i in self.vocab.items():
            self.embeddings[i] = torch.tensor(vectors[w])

    def get_doc_vectors(self, doc: List[str]) -> List[torch.Tensor]:
        """ convert a list of tokens to a list of word vectors, if not present skip """
        res = [self.embeddings[self.vocab[token]] for token in doc if token in self.vocab]
        if len(res) == 0:
            # no known token: represent the document by a single zero vector
            res = [torch.zeros(self.dim)]
        return res

    def forward(self, documents: List[List[str]]) -> List[List[torch.Tensor]]:
        """ get word vectors for a batch of documents """
        res = [self.get_doc_vectors(doc) for doc in documents]
        return res

    def get_extremes(self, doc_vecs: List[torch.Tensor]) -> torch.Tensor:
        """
        pool the word vectors of one document along each dimension

        Returns a vector of size `dim * 4` laid out as [max, min, mean, sum];
        an empty list gives all zeros.
        """
        if len(doc_vecs) == 0:
            return torch.zeros(self.dim * 4)
        stacked = torch.stack(doc_vecs, dim=0)
        t_max, _ = stacked.max(dim=0)
        t_min, _ = stacked.min(dim=0)
        t_mean = stacked.mean(0)
        t_sum = stacked.sum(0)
        concat = torch.cat([t_max, t_min, t_mean, t_sum],)
        return concat

    def get_batch_extremes(self, batch:  List[List[torch.Tensor]],) -> torch.Tensor:
        """ get extremes for each document and return as one batch tensor """
        batch_extremes = [self.get_extremes(doc) for doc in batch]
        batch_extremes = torch.stack(batch_extremes)
        return batch_extremes
    
    
def list2pairs(list_obj: List[int]) -> List[List[int]]:
    """ pair each element with its successor: [a, b, c] -> [[a, b], [b, c]]; fewer than two elements give [] """
    return [[left, right] for left, right in zip(list_obj, list_obj[1:])]
    
    
class DenseNet(nn.Module):
    """
    Stack of hidden blocks (Linear -> BatchNorm1d -> LeakyReLU -> Dropout)
    followed by a final linear layer with `num_classes` outputs
    """
    def __init__(self, input_features: int, dense_units: List[int], num_classes: int, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        layer_sizes = list2pairs([input_features] + dense_units)
        blocks = [self.linear_block(n_in, n_out) for n_in, n_out in layer_sizes]
        self.hidden = nn.Sequential(*blocks)
        # without hidden blocks the classifier reads the raw input features
        if len(layer_sizes) > 0:
            clf_inputs = layer_sizes[-1][1]
        else:
            clf_inputs = input_features
        self.clf = nn.Linear(clf_inputs, num_classes)

    def linear_block(self, in_units, out_units ):
        """ one hidden block mapping `in_units` features to `out_units` features """
        layers = [
            nn.Linear(in_units, out_units),
            nn.BatchNorm1d(out_units),
            nn.LeakyReLU(0.02),
            nn.Dropout(self.dropout),
        ]
        return nn.Sequential(*layers)

    def forward(self, x):
        """ run the hidden blocks and then the final linear layer """
        return self.clf(self.hidden(x))
    
    
class TopicClassifier(nn.Module):
    """ hold together embeddings and neural net """
    def __init__(self, embedding_net, output_net):
        """
        Args:
            embedding_net: module turning a batch of token lists into word vectors and
                providing `get_batch_extremes` to pool them into one tensor.
            output_net: module mapping the pooled tensor to one output per class.
        """
        super().__init__()
        self.embedding_net = embedding_net
        self.output_net = output_net

    def train(self, mode: bool = True):
        """
        Set training mode on the output net; the embedding net always stays in eval mode

        Keeps the `nn.Module.train` contract: accepts `mode`, updates `self.training`
        and returns `self`.
        """
        super().train(mode)
        self.embedding_net.eval()
        return self

    def eval(self):
        """ put both sub-networks in eval mode and return `self` """
        return self.train(False)

    def forward(self, x):
        """ embed and pool the documents without tracking gradients, then apply the output net """
        with torch.no_grad():
            vectors = self.embedding_net(x)
            vectors = self.embedding_net.get_batch_extremes(vectors)
        # raw outputs of `output_net`; no softmax is applied here
        outputs = self.output_net(vectors)
        return outputs
        

def generate_vecs(dim):
    """ random vector of length `dim` for every letter of 'asdfqwerty' """
    vecs = {}
    for letter in 'asdfqwerty':
        vecs[letter] = [random.random() for _ in range(dim)]
    return vecs
def generate_inputs():
    """ four small character-level documents used by the tests """
    words = ('asd', 'asdfg', 'qwerpoi', 'zcvv')
    return [list(word) for word in words]
        
def test_embeddings():
    """ smoke-test EmbeddingNet: table shape, token lookup, batching and pooled features """
    n_dims = 10
    word_vecs = generate_vecs(n_dims)
    # net
    net = EmbeddingNet(word_vecs, n_dims)
    assert net.embeddings.size() == (len(word_vecs), n_dims)
    # doc vectors: 'z' and 'x' are not among the generated letters, the other four are
    found = net.get_doc_vectors(list('qwzxdf'))
    assert len(found) == 4
    assert found[1].size() == (n_dims,)
    # batch vectors
    documents = generate_inputs()
    embedded = net(documents)
    assert len(embedded) == len(documents)
    assert len(embedded[0]) == 3
    assert len(embedded[3]) == 1
    assert embedded[1][1].size() == (n_dims,)
    # extremes: layout is [max, min, mean, sum], each of length n_dims
    pooled = net.get_extremes(embedded[2])
    assert pooled.size() == (n_dims * 4, )
    part_max = pooled[:n_dims]
    part_min = pooled[n_dims:n_dims*2]
    part_mean = pooled[n_dims*2:n_dims*3]
    part_sum = pooled[n_dims*3:n_dims*4]
    assert all(part_max > part_min)
    assert all(part_sum > part_mean)
    pooled_batch = net.get_batch_extremes(embedded)
    assert pooled_batch.size() == (len(documents), n_dims * 4)
    return net


def test_dense():
    """ smoke-test DenseNet: number of hidden blocks and output shape on a random batch """
    config = {'input_features': 10, 'dense_units': [10, 5], 'num_classes': 4}
    net = DenseNet(**config)
    assert len(net.hidden) == 2
    batch_size = 6
    logits = net(torch.randn(batch_size, config['input_features']))
    assert logits.size() == (batch_size, config['num_classes'])
    return net


def test_clf():
    """ smoke-test TopicClassifier end to end: token lists in, one row of outputs per document """
    dim = 21
    embedding = EmbeddingNet(vectors=generate_vecs(dim), dim=dim)
    n_classes = 9
    # the pooled representation concatenates four statistics, hence dim * 4 inputs
    head = DenseNet(input_features=dim * 4, dense_units=[13, 11], num_classes=n_classes)
    clf = TopicClassifier(embedding, head)
    documents = generate_inputs()
    result = clf(documents)
    assert result.size() == (len(documents), n_classes)
    return clf


In [103]:
# run the smoke tests defined above; each returns the network it built
e_net = test_embeddings()

d_net = test_dense()


c_net = test_clf()

## Load data

In [None]:
# training data file, relative to the working directory set at the top of the notebook
FILE = './data/interim/train_data.csv'
# only rows from these subsets may be sampled into the test set
TEST_SUBSETS = ['r-1', 'r-2']
# probability for an eligible row to be assigned to the test set
TEST_SIZE = 0.33

### Text and topics

In [8]:
# load_data comes from src.train.data_utils
data = load_data(FILE)

# random train/test flag per row; rows outside TEST_SUBSETS are always train
data['is_test'] = select_test(data, TEST_SUBSETS, TEST_SIZE)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33192 entries, 0 to 33191
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         33192 non-null  object 
 1   description   33192 non-null  object 
 2   recent_posts  33192 non-null  object 
 3   lang_code     33192 non-null  object 
 4   id            1447 non-null   float64
 5   category      33192 non-null  object 
 6   subset        33192 non-null  object 
 7   text          33192 non-null  object 
dtypes: float64(1), object(7)
memory usage: 2.0+ MB


In [9]:
data.groupby(['lang_code', 'subset']).agg(dict(is_test=['mean','count']))

Unnamed: 0_level_0,Unnamed: 1_level_0,is_test,is_test
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
lang_code,subset,Unnamed: 2_level_2,Unnamed: 3_level_2
ar,r-2,0.342105,190
en,chan,0.0,42
en,r-1,0.330247,324
en,r-2,0.340659,91
en,tg,0.0,14775
fa,r-2,0.414365,181
ru,chan,0.0,202
ru,r-1,0.33515,367
ru,r-2,0.330097,103
ru,tg,0.0,16726


In [10]:
# drop every row of the 'tg' subset, keeping the remaining subsets
data = data.query("subset != 'tg'")
data.shape

(1691, 9)

In [11]:
# language codes present in the filtered data
LANGS = data['lang_code'].unique().tolist()

# distinct, whitespace-stripped category names collected from the keys of every row's `category` mapping
# NOTE(review): the list is built from a set, so the class order (and therefore the
# position of each class in the target vectors) is not guaranteed to be the same
# between interpreter sessions — consider sorting if the order must be reproducible
CLASSES = list(set([
            c.strip() for cat in data['category'].tolist() 
            for c in cat.keys() 
        ]))


len(CLASSES)

53

### Data Loader

In [12]:
from torch.utils.data import Dataset, DataLoader

from src.train.text_utils import tokenize_text


In [13]:
class TextData(Dataset):
    ''' tokenize text and convert weight dicts to vectors '''
    def __init__(self, data, lang_code, classes, subsets=None):
        '''
        Args:
            data: DataFrame with `lang_code`, `subset`, `text` and `category` columns.
            lang_code: only rows with this language code are kept.
            classes: ordered class names; defines the layout of the target vector.
            subsets: optional subset name or collection of names to keep;
                None or an empty collection keeps every subset.
        '''
        super().__init__()
        # boolean masks instead of string-built queries: values are compared as data,
        # so quotes in `lang_code` or a bare-string `subsets` cannot break the expression
        selected = data['lang_code'] == lang_code
        if subsets:
            if isinstance(subsets, str):
                subsets = [subsets]
            selected = selected & data['subset'].isin(subsets)
        self.data = data.loc[selected]
        self.classes = classes

    def __len__(self):
        ''' number of rows left after filtering '''
        return len(self.data)

    def __getitem__(self, idx):
        ''' return (tokens, weights) for the row at position `idx` '''
        row = self.data.iloc[idx]
        tokens = tokenize_text(row['text'])
        # strip whitespace from category names before looking them up in `classes`
        cat = {c.strip():w for c,w in row['category'].items()}
        weights = weights_to_tensor(cat, self.classes)
        return tokens, weights
    
    
    
def create_loaders(data, lang_code, classes, batch_size=16, subsets=None):
    """
    build the train and test loaders for one language

    Rows are split on the boolean `is_test` column; the train loader shuffles,
    the test loader keeps row order. Returns `(train_loader, test_loader)`.
    """
    test_mask = data['is_test']
    train_set = TextData(data.loc[~test_mask], lang_code, classes, subsets=subsets)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
    test_set = TextData(data.loc[test_mask], lang_code, classes, subsets=subsets)
    test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    return train_loader, test_loader



def collate_batch(batch):
    """ keep token lists as a plain python list and stack the weight vectors into one tensor """
    token_lists = []
    weight_vecs = []
    for sample in batch:
        token_lists.append(sample[0])
        weight_vecs.append(sample[1])
    return token_lists, torch.stack(weight_vecs, dim=0)

#### test dataset

In [14]:
# dataset restricted to Russian rows from the 'r-2' and 'r-1' subsets
text_data = TextData(data, 'ru', subsets=['r-2', 'r-1'], classes=CLASSES)


In [24]:
len(CLASSES)

53

In [23]:
w.size()

torch.Size([53])

In [25]:
# pick a random sample: `t` is the token list, `w` the class-weight vector
t,w = text_data[random.randint(0, len(text_data)-1)]


assert isinstance(t, list)
# a document may tokenize to an empty list, so only inspect tokens when there are any
if len(t) > 0:
    assert isinstance(t[-1], str)
assert isinstance(w, torch.Tensor)
# one weight per class
assert w.size() == (len(CLASSES),)

In [28]:
bs = 3
# only the train loader is needed for this check
l,_ = create_loaders(data, 'ru', classes=CLASSES, batch_size=bs,)

# grab the first batch only
for x,y in l:
    break
    
    
# x is a list of token lists, y the stacked weight vectors
assert len(x) == bs
assert y.size() == (bs, len(CLASSES))

## Load vectors

In [128]:
# values substituted into the VECTORS path template below
SIZE = "100k"
DIM = 300

# path template of the word-vector file; filled in with size, lang_code and dim
VECTORS = "models/external/word_vectors/{size}.cc.{lang_code}.{dim}.vec"

## Experiment

In [131]:
from torch import optim
from torch.nn import functional as F

In [132]:
def calculate_mae_score(true, predicted):
    """ softmax the predictions over dim 1, sum the absolute errors, average per sample and map to a score """
    n_samples = true.size(0)
    class_probs = F.softmax(predicted, 1)
    abs_error = F.l1_loss(class_probs, true, reduction='sum')
    per_sample = abs_error / n_samples
    return loss2score(per_sample.item())


def loss2score(loss):
    """ map a loss to a score as 1 / (1 + loss): zero loss gives 1, larger losses give smaller scores """
    return 1 / (loss + 1)

### Configure

In [167]:
# language used for this experiment
lang_code = 'ru'


# the dimension returned by load_vectors is discarded; DIM is used instead
word_vectors,_ = load_vectors(VECTORS.format(size=SIZE, lang_code=lang_code, dim=DIM))

lines: 0it [00:00, ?it/s]

In [168]:

BATCH_SIZE = 128
# output sizes of the hidden blocks (a single block here)
DENSE_UNITS = [128,]

# learning rate passed to the optimizer
ETA = 0.01
DROPOUT = 0.25

EPOCHS = 30

In [169]:
train_loader,test_loader = create_loaders(data, lang_code, classes=CLASSES, batch_size=BATCH_SIZE)

embed = EmbeddingNet(word_vectors, DIM)

# input size is DIM * 4 because the pooled features concatenate max, min, mean and sum
dense = DenseNet(DIM * 4, DENSE_UNITS, len(CLASSES), DROPOUT)

clf = TopicClassifier(embed, dense)

print(clf)

# summed absolute error; the training loop divides it by the number of samples
criterion = nn.L1Loss(reduction='sum')

# only the dense network is optimized; the embedding table is not trained
optimizer = optim.Adam(dense.parameters(), lr=ETA)

TopicClassifier(
  (embedding_net): EmbeddingNet()
  (output_net): DenseNet(
    (hidden): Sequential(
      (0): Sequential(
        (0): Linear(in_features=1200, out_features=128, bias=True)
        (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): LeakyReLU(negative_slope=0.02)
        (3): Dropout(p=0.25, inplace=False)
      )
    )
    (clf): Linear(in_features=128, out_features=53, bias=True)
  )
)


### Train and eval

In [171]:
# NOTE(review): the criterion is applied to the raw classifier outputs, whereas
# calculate_mae_score applies a softmax first — confirm which one is intended
for i in range(EPOCHS):
    # train
    clf.train()
    train_loss = 0.
    train_size = 0.
    for x,y in train_loader:
        optimizer.zero_grad()
        out = clf(x)
        loss = criterion(out, y)
        # criterion sums over the batch, so accumulate the sum and the sample count
        train_loss += loss.item()
        train_size += y.size(0)
        loss.backward()
        optimizer.step()
    # average loss per sample over the epoch
    train_loss /= train_size
    # eval
    clf.eval()
    test_loss = 0.
    test_size = 0
    with torch.no_grad():
        for x,y in test_loader:
            out = clf(x)
            loss = criterion(out, y)
            test_loss += loss.item()
            test_size += y.size(0)
    test_loss /= test_size
    print(f"Epoch {i+1}: train {train_loss:.4f}, eval {test_loss:.4f}")
    
# scores are derived from the losses of the last epoch only
print(f"Scores: train {loss2score(train_loss):.2f}, eval {loss2score(test_loss):.2f}")

Epoch 1: train 1.2495, eval 1.2892
Epoch 2: train 1.2494, eval 1.2513
Epoch 3: train 1.2331, eval 1.2403
Epoch 4: train 1.2150, eval 1.2203
Epoch 5: train 1.2040, eval 1.2400
Epoch 6: train 1.2045, eval 1.2343
Epoch 7: train 1.2015, eval 1.2221
Epoch 8: train 1.1898, eval 1.2367
Epoch 9: train 1.1949, eval 1.2482
Epoch 10: train 1.2124, eval 1.2447
Epoch 11: train 1.2047, eval 1.2228
Epoch 12: train 1.1883, eval 1.1897
Epoch 13: train 1.1754, eval 1.1951
Epoch 14: train 1.1681, eval 1.2199
Epoch 15: train 1.1890, eval 1.2024
Epoch 16: train 1.1958, eval 1.1979
Epoch 17: train 1.1823, eval 1.1641
Epoch 18: train 1.1589, eval 1.1668
Epoch 19: train 1.1648, eval 1.1671
Epoch 20: train 1.1723, eval 1.1531
Epoch 21: train 1.1479, eval 1.1704
Epoch 22: train 1.1560, eval 1.1703
Epoch 23: train 1.1599, eval 1.1668
Epoch 24: train 1.1538, eval 1.1589
Epoch 25: train 1.1450, eval 1.1529
Epoch 26: train 1.1419, eval 1.1588
Epoch 27: train 1.1436, eval 1.1687
Epoch 28: train 1.1471, eval 1.1816
E