In [1]:
import pathlib
import random
import pandas as pd
import numpy as np
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict

from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report, 
)

import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, AutoTokenizer, BertModel, BertConfig, AutoModel, AdamW

ROOT_DIR = "/Users/vladimirkalajcidi/DLS-NLP-Workshop/"
DATA_DIR = ROOT_DIR + "data/"
RANDOM_SEED = 42

In [28]:
df_trends = pd.read_csv(DATA_DIR + "trends_description.csv")
df = pd.read_csv(DATA_DIR + "train.csv")

In [29]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,assessment,tags,text,trend_id_res0,trend_id_res1,trend_id_res2,trend_id_res3,trend_id_res4,...,trend_id_res40,trend_id_res41,trend_id_res42,trend_id_res43,trend_id_res44,trend_id_res45,trend_id_res46,trend_id_res47,trend_id_res48,trend_id_res49
0,0,5652,6.0,"{ASSORTMENT,PROMOTIONS,DELIVERY}","Маленький выбор товаров, хотелось бы ассортиме...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,18092,4.0,"{ASSORTMENT,PRICE,PRODUCTS_QUALITY,DELIVERY}",Быстро,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13845,6.0,"{DELIVERY,PROMOTIONS,PRICE,ASSORTMENT,SUPPORT}",Доставка постоянно задерживается,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,3,25060,6.0,"{PRICE,PROMOTIONS,ASSORTMENT}",Наценка и ассортимент расстраивают,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1428,6.0,"{PRICE,PROMOTIONS}",Можно немного скинуть минимальную сумму заказа...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X, y = df[["text"]], df[[f"trend_id_res{i}" for i in range(50)]]
X = X.astype("str").copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = RANDOM_SEED)

df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

X_train.shape is (3698, 1)
y_train.shape is (3698, 50)
X_test.shape is (925, 1)
y_test.shape is (925, 50)


In [45]:
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

In [39]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

In [40]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
EPOCHS = 10
LEARNING_RATE = 2e-5
tokenizer = AutoTokenizer.from_pretrained('roberta-base')



In [41]:
target_cols = [col for col in y_train.columns]
target_cols

['trend_id_res0',
 'trend_id_res1',
 'trend_id_res2',
 'trend_id_res3',
 'trend_id_res4',
 'trend_id_res5',
 'trend_id_res6',
 'trend_id_res7',
 'trend_id_res8',
 'trend_id_res9',
 'trend_id_res10',
 'trend_id_res11',
 'trend_id_res12',
 'trend_id_res13',
 'trend_id_res14',
 'trend_id_res15',
 'trend_id_res16',
 'trend_id_res17',
 'trend_id_res18',
 'trend_id_res19',
 'trend_id_res20',
 'trend_id_res21',
 'trend_id_res22',
 'trend_id_res23',
 'trend_id_res24',
 'trend_id_res25',
 'trend_id_res26',
 'trend_id_res27',
 'trend_id_res28',
 'trend_id_res29',
 'trend_id_res30',
 'trend_id_res31',
 'trend_id_res32',
 'trend_id_res33',
 'trend_id_res34',
 'trend_id_res35',
 'trend_id_res36',
 'trend_id_res37',
 'trend_id_res38',
 'trend_id_res39',
 'trend_id_res40',
 'trend_id_res41',
 'trend_id_res42',
 'trend_id_res43',
 'trend_id_res44',
 'trend_id_res45',
 'trend_id_res46',
 'trend_id_res47',
 'trend_id_res48',
 'trend_id_res49']

In [47]:
class BERTDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.max_len = max_len
        self.text = df.text
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [61]:
train_dataset = BERTDataset(df_train, tokenizer, MAX_LEN)
test_dataset = BERTDataset(df_test, tokenizer, MAX_LEN)

In [62]:
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, 
                          num_workers=0, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=VALID_BATCH_SIZE, 
                          num_workers=0, shuffle=False, pin_memory=True)

In [70]:
for _,data in enumerate(train_loader, 0):
    print(data)

KeyError: 3590

In [63]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.roberta = AutoModel.from_pretrained('roberta-base')
#         self.l2 = torch.nn.Dropout(0.3)
        self.fc = torch.nn.Linear(768,50)
    
    def forward(self, ids, mask, token_type_ids):
        _, features = self.roberta(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#         output_2 = self.l2(output_1)
        output = self.fc(features)
        return output

model = BERTClass()
model.to(device);

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [65]:
optimizer = AdamW(params =  model.parameters(), lr=LEARNING_RATE, weight_decay=1e-6)



In [68]:
def train(epoch):
    for _,data in enumerate(train_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        loss = loss_fn(outputs, targets)
        if _%500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [69]:
for epoch in range(EPOCHS):
    train(epoch)

KeyError: 596