In [1]:
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm




In [2]:
datapath = f'train.csv'
df = pd.read_csv(datapath)
df.head()

Unnamed: 0,oid,category,text
0,365271984,winter_sport,Волшебные фото Виктория Поплавская ЕвгенияМедв...
1,503385563,extreme,Возвращение в подземелье Треша 33 Эйфория тупо...
2,146016084,football,Лучшие чешские вратари – Доминик Доминатор Гаш...
3,933865449,boardgames,Rtokenoid Warhammer40k валрак решил нас подкор...
4,713550145,hockey,Шестеркин затаскивает Рейнджерс в финал Восточ...


In [9]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [10]:
labels = dict(zip(df.category.unique(), list(range(len(df.category.unique())))))

In [11]:
labels

{'winter_sport': 0,
 'extreme': 1,
 'football': 2,
 'boardgames': 3,
 'hockey': 4,
 'esport': 5,
 'athletics': 6,
 'motosport': 7,
 'basketball': 8,
 'tennis': 9,
 'autosport': 10,
 'martial_arts': 11,
 'volleyball': 12}

In [12]:
class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


In [13]:
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.oids = df['oid'].tolist()
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]


    def __len__(self):
        return len(self.labels)

    def get_batch_oids(self, idx):
        # Fetch a batch of labels
        return np.array(self.oids[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_oids = self.get_batch_oids(idx)

        return batch_oids, batch_texts


In [14]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 13)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.sigmoid(linear_output)

        return final_layer


     

In [15]:
def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    # use_cuda=False
    device = torch.device("cuda" if use_cuda else "cpu")
    print('device', device)

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  

In [16]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

              test_label = test_label.to(device)
              mask = test_input['attention_mask'].to(device)
              input_id = test_input['input_ids'].squeeze(1).to(device)

              output = model(input_id, mask)

              acc = (output.argmax(dim=1) == test_label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [17]:
def test(model, test_data):

    test = TestDataset(test_data)

    # test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()
    
    result = []
    with torch.no_grad():

        for test_oid, test_input in test:

            test_oid = torch.from_numpy(test_oid).to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            pred = output.argmax(dim=1).cpu().detach().numpy()
            max_val = output.max().item()
            result.append([test_oid.item(), pred[0], max_val])
    
    return result

In [19]:
np.random.seed(112)
df_train, df_val = np.split(df.sample(frac=1, random_state=112), 
                                     [int(.95*len(df))])

print(len(df_train),len(df_val))

46705 2459


In [20]:
EPOCHS = 7
model = BertClassifier()
LR = 1e-6
              
train(model, df_train, df_val, LR, EPOCHS)


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device cuda


100%|██████████| 23353/23353 [48:33<00:00,  8.01it/s]


Epochs: 1 | Train Loss:  1.052 | Train Accuracy:  0.698 | Val Loss:  0.955 | Val Accuracy:  0.840


100%|██████████| 23353/23353 [48:23<00:00,  8.04it/s]


Epochs: 2 | Train Loss:  0.924 | Train Accuracy:  0.879 | Val Loss:  0.909 | Val Accuracy:  0.893


100%|██████████| 23353/23353 [48:24<00:00,  8.04it/s]


Epochs: 3 | Train Loss:  0.891 | Train Accuracy:  0.922 | Val Loss:  0.896 | Val Accuracy:  0.904


100%|██████████| 23353/23353 [48:25<00:00,  8.04it/s]


Epochs: 4 | Train Loss:  0.876 | Train Accuracy:  0.946 | Val Loss:  0.892 | Val Accuracy:  0.903


100%|██████████| 23353/23353 [48:26<00:00,  8.04it/s]


Epochs: 5 | Train Loss:  0.867 | Train Accuracy:  0.960 | Val Loss:  0.890 | Val Accuracy:  0.903


100%|██████████| 23353/23353 [48:28<00:00,  8.03it/s]


Epochs: 6 | Train Loss:  0.863 | Train Accuracy:  0.967 | Val Loss:  0.888 | Val Accuracy:  0.908


100%|██████████| 23353/23353 [48:30<00:00,  8.02it/s]


Epochs: 7 | Train Loss:  0.860 | Train Accuracy:  0.971 | Val Loss:  0.889 | Val Accuracy:  0.908


In [17]:
torch.save(model.state_dict(), './save/5ep.pt')

In [14]:
# model.load_state_dict(torch.load('./save/5ep.pt'))

<All keys matched successfully>

## Predict

In [21]:
df_test = pd.read_csv('test.csv')
df_test

Unnamed: 0,oid,text
0,749208109,СПОЧНО СООБЩЕСТВО ПРОДАЕТСЯ ЗА 1300Р ЗА ПОКУПК...
1,452466036,Естественное восстановление после тяжелой трен...
2,161038103,Тема нарядов продолжается Одна из британских ж...
3,663621910,Привет Избранный. Ты спрашиваешь себя ЧТО здес...
4,566255305,КОРОЛЬ ПЯТИСОТНИКОВ В ДЕЛЕ Андрей Рублев успеш...
...,...,...
26255,169728316,Выиграй коллекционный пазл по Wortokenoid of W...
26256,279369911,Волейбол от первого лица Егора Пупынина переко...
26257,600699419,Вы были когда нибудь на свидании где вам задав...
26258,560223506,ТОП 20 самых эффективных общефизических упражн...


In [None]:
pred = test(model, df_test)


In [23]:
pred = pd.DataFrame(pred, columns=['oid','category', 'prob'])
pred

Unnamed: 0,oid,category,prob
0,749208109,5,0.997216
1,452466036,11,0.999206
2,161038103,9,0.999839
3,663621910,5,0.998995
4,566255305,9,0.999485
...,...,...,...
26255,169728316,3,0.998454
26256,279369911,12,0.999675
26257,600699419,3,0.999604
26258,560223506,12,0.997792


In [24]:
labels_inv = {v: k for k, v in labels.items()}
labels_inv

{0: 'winter_sport',
 1: 'extreme',
 2: 'football',
 3: 'boardgames',
 4: 'hockey',
 5: 'esport',
 6: 'athletics',
 7: 'motosport',
 8: 'basketball',
 9: 'tennis',
 10: 'autosport',
 11: 'martial_arts',
 12: 'volleyball'}

In [25]:
pred['category'] = pred['category'].map(labels_inv.get)
pred

Unnamed: 0,oid,category,prob
0,749208109,esport,0.997216
1,452466036,martial_arts,0.999206
2,161038103,tennis,0.999839
3,663621910,esport,0.998995
4,566255305,tennis,0.999485
...,...,...,...
26255,169728316,boardgames,0.998454
26256,279369911,volleyball,0.999675
26257,600699419,boardgames,0.999604
26258,560223506,volleyball,0.997792


In [26]:
pred.to_csv('./output/bert_output.csv')