In [2]:
from utils import *

train = pd.read_parquet('data_fusion_train.parquet')
train.reset_index(inplace=True)
train = train[train.category_id != -1].drop_duplicates('item_name')
train 

Unnamed: 0,index,receipt_id,receipt_dayofweek,receipt_time,item_name,item_quantity,item_price,item_nds_rate,category_id,brands
1,1,11,6,20:34,"Молоко 3,2%,шт",2.000,8,2,78,
3,3,39,4,11:28,"Компот из изюма, 114 ккал",1.000,4,1,71,
4,4,39,4,11:28,"Макаронные изделия отварные (масло сливочное),...",1.000,4,1,71,
12,17,56,5,11:42,Кофе Капучино Большой Эден 18,1.000,12,1,70,
24,40,105,3,01:53,Хлеб на СЫВОРОТКЕ 350г,1.000,7,-1,84,
...,...,...,...,...,...,...,...,...,...,...
26059416,45669181,9880594,2,20:11,"Напиток Энерг. Ред Булл 0,355л",1.000,10,6,83,
26066451,45681543,9908635,5,01:09,Хеменгуэй Дайкири,1.000,15,6,0,
26071696,45690702,9929539,0,14:39,"Пиво светлое ""Халзан"" 4,5 % об, пл/б. 1,5 л(шт)",1.000,10,6,0,
26072628,45692298,9932986,3,22:26,Экспресс педикюр,1.000,15,6,42,


In [3]:
train = train[~(train.item_name == '')]
train.shape

(48224, 10)

In [4]:
#deal with tensors
import torch   

#handling text data
from torchtext import data    

#Reproducing same results
SEED = 2021

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

In [5]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(train, test_size=0.1, stratify=train.category_id, random_state=SEED)

In [6]:
train_df.shape

(43401, 10)

In [7]:
test_df.shape

(4823, 10)

In [8]:
TEXT = data.Field(batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype = torch.float, batch_first=True)

In [9]:
from torchtext import data

class PandasDataFrame(data.Dataset):

    def __init__(self, df, text_field, label_field, is_test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for i, row in df.iterrows():
            label = row.category_id if not is_test else None
            text = row.item_name
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, train_df, val_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)

        if train_df is not None:
            train_data = cls(train_df.copy(), text_field, label_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), text_field, label_field, **kwargs)


        return tuple(d for d in (train_data, val_data) if d is not None)
        
train_ds, test_ds = PandasDataFrame.splits(
  text_field=TEXT, label_field=LABEL, train_df=train_df, val_df=test_df)

In [10]:
    print(vars(train_ds.examples[0])), print(vars(test_ds.examples[0]))

{'text': ['ЧАЙ', 'ЛЕСНЫЕ', 'ЯГОДЫ', '1Л'], 'label': 83}
{'text': ['Суп', 'Рассольник', 'Ленинградский', 'У'], 'label': 71}


(None, None)

In [11]:
#initialize glove embeddings
TEXT.build_vocab(train_ds, min_freq=2)  
LABEL.build_vocab(train_ds)

#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

#Word dictionary
#print(TEXT.vocab.stoi)   

Size of TEXT vocabulary: 18344
Size of LABEL vocabulary: 96
[('с', 3832), ('1', 1480), ('в', 1320), ('для', 1037), ('шт', 957), ('и', 907), ('из', 876), ('С', 749), ('1кг', 740), ('порц.', 631)]


In [12]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

#set batch size
BATCH_SIZE = 64

#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, test_ds), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

In [13]:
from model import classifier

In [14]:
size_of_vocab = len(TEXT.vocab)
embedding_dim = 40
num_hidden_nodes = 64
num_output_nodes = len(LABEL.vocab)
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = classifier(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = bidirection, dropout = dropout)

In [15]:
size_of_vocab

18344

In [16]:
num_output_nodes

96

In [17]:
print(model)

#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
    
print(f'The model has {count_parameters(model):,} trainable parameters')

#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
if pretrained_embeddings is not None:
    model.embedding.weight.data.copy_(pretrained_embeddings)

# print(pretrained_embeddings.shape)

classifier(
  (embedding): Embedding(18344, 40)
  (lstm): LSTM(40, 64, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=128, out_features=96, bias=True)
)
The model has 899,744 trainable parameters


In [18]:
import torch.optim as optim

#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
    
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

NameError: name 'nn' is not defined

In [51]:
from sklearn.metrics import f1_score

In [52]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths)
        
        #compute the loss
        loss = criterion(predictions, batch.label.long())        
        
        #compute the binary accuracy
        
        acc = f1_score(batch.label.long().cpu().detach().numpy(), predictions.argmax(axis=1).cpu().detach().numpy(), average='weighted')   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [53]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths)
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label.long())

            acc = f1_score(batch.label.long().cpu().detach().numpy(), predictions.argmax(axis=1).cpu().detach().numpy(), average='weighted') 
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [54]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
    print('\t================')

	Train Loss: 3.202 | Train Acc: 17.00%
	 Val. Loss: 2.637 |  Val. Acc: 30.65%
	Train Loss: 2.361 | Train Acc: 38.46%
	 Val. Loss: 2.149 |  Val. Acc: 42.18%
	Train Loss: 1.927 | Train Acc: 49.68%
	 Val. Loss: 1.893 |  Val. Acc: 51.88%
	Train Loss: 1.638 | Train Acc: 57.35%
	 Val. Loss: 1.726 |  Val. Acc: 55.65%
	Train Loss: 1.422 | Train Acc: 62.98%
	 Val. Loss: 1.622 |  Val. Acc: 58.38%
	Train Loss: 1.242 | Train Acc: 67.36%
	 Val. Loss: 1.547 |  Val. Acc: 60.92%
	Train Loss: 1.101 | Train Acc: 70.88%
	 Val. Loss: 1.510 |  Val. Acc: 62.66%
	Train Loss: 0.980 | Train Acc: 73.86%
	 Val. Loss: 1.510 |  Val. Acc: 63.43%
	Train Loss: 0.882 | Train Acc: 76.66%
	 Val. Loss: 1.495 |  Val. Acc: 64.33%
	Train Loss: 0.797 | Train Acc: 78.96%
	 Val. Loss: 1.492 |  Val. Acc: 65.24%
	Train Loss: 0.723 | Train Acc: 80.62%
	 Val. Loss: 1.566 |  Val. Acc: 64.08%
	Train Loss: 0.670 | Train Acc: 82.18%
	 Val. Loss: 1.541 |  Val. Acc: 65.42%
	Train Loss: 0.606 | Train Acc: 83.80%
	 Val. Loss: 1.541 |  Val