In [226]:
import torch
import warnings
import torch.nn.functional as F
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import transformers
from transformers import BertModel, BertTokenizer

from pylab import rcParams
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from tqdm.notebook import tqdm
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
sns.set(style='darkgrid', palette='muted', font_scale=1.2)
rcParams['figure.figsize'] = 12, 8

warnings.filterwarnings("ignore")

In [227]:
torch.cuda.empty_cache()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

RANDOM_SEED = 777
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x1a8b0651090>

# Данные - Decorative

In [228]:
full = pd.read_csv('../data/raw/Decorative/Full.csv')
full

Unnamed: 0,item,frequency,words_ordered,stemmed_text,count_words
0,-,51,-,-,1
1,--,4,--,--,1
2,помада -бомба,4,-бомба помада,-бомб помад,2
3,цвет -бомба,11,-бомба цвет,-бомб цвет,2
4,оттенок -в,3,-в оттенок,-в оттенок,2
...,...,...,...,...,...
1567008,ящичке,16,ящичке,ящичк,1
1567009,ящички,3,ящички,ящичк,1
1567010,єффект,3,єффект,єффект,1
1567011,ұнады,6,ұнады,ұнад,1


In [229]:
synonyms = pd.read_csv('../data/raw/Decorative/Synonyms.csv')
synonyms

Unnamed: 0,Topic,Stemma,Synonyms,Result
0,Cлой,топовый слой тож,топовый слой тоже,1
1,Mini-size,есть мин,есть мини,1
2,Mini-size,есть мин,есть минусы,0
3,Mini-size,миниатюр,баночка миниатюрная,1
4,Mini-size,миниатюр,довольно миниатюрна,1
...,...,...,...,...
202301,Яркий макияж,любите яркие цвет,любите яркие цвета,0
202302,Яркий макияж,любит яркие оттенк,любит яркие оттенки,0
202303,Яркий макияж,теряет свою яркост,теряет свою яркость,0
202304,Яркий макияж,такой насыщенный и ярк,такой насыщенный и яркий,0


In [230]:
df = full.merge(synonyms, left_on='item', right_on='Synonyms')
df = df.loc[df['Result'] == 1, ['item', 'frequency', 'Topic']].reset_index(drop=True)
df.rename(columns={'item': 'review', 'Topic': 'sentiment'}, inplace=True)
df

Unnamed: 0,review,frequency,sentiment
0,веко -выкинула,30,Веки
1,гель -лака,38,Продукт
2,фирма -меня,15,Производитель
3,густой -получилось,78,Консистенция
4,бежевый ..,192,Цвет/оттенок/тон
...,...,...,...
171100,ярко-салатовый,15,Цвет/оттенок/тон
171101,ярко-синий,54,Цвет/оттенок/тон
171102,ярко-фиолетовый,13,Цвет/оттенок/тон
171103,ярко-черная,18,Цвет/оттенок/тон


In [231]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()

df['sentiment'] = encode.fit_transform(df['sentiment'])
class_names = df['sentiment'].unique()
class_names

array([ 27, 227, 229, 113, 334,  15, 279,  23,  39,  44,  51,  57, 135,
       109, 169, 128, 204, 188, 168, 268, 181, 193, 228, 237, 282, 242,
       251, 267, 305, 341, 346, 147, 337,  34,  18,  89, 139,  19, 102,
       294, 350, 187, 324, 107, 127, 215, 106, 165, 254, 244, 157,  68,
       137, 140, 122, 287,  10,  53, 319,  76, 173, 246, 329, 166, 199,
       116, 338, 325,  93,  43, 272,  90,  92, 271, 288, 269, 344, 349,
        38,  32, 232, 170, 141, 347, 119, 196, 261, 308, 262,  95, 286,
        77, 180, 112, 197, 125, 138, 131, 146, 236, 239, 283, 222, 213,
       245, 302, 240,  83, 317,  13, 277, 105,  17,  79,  61, 295, 296,
       320, 241, 275, 117, 276, 100, 121, 171, 326, 316,  33, 322,   3,
        52,  81,  66, 339,  47,  58,  70, 203, 153, 164, 192, 292,  94,
       219,   4, 273,   5, 278, 145,  73,  98,   6,   7, 335, 179, 309,
       115, 238, 206, 318,  28,  35,   8,  14, 291,   9, 285,  96, 103,
       207, 208, 218, 118, 220, 177, 243, 255, 152, 163,  65, 20

# BERT 

In [232]:
PRE_TRAINED_MODEL_NAME = 'cointegrated/rubert-tiny'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)

In [233]:
MAX_LEN = 254

class ReviewDataset(Dataset):

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

        return {
          'review_text': review,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'targets': torch.tensor(target, dtype=torch.long)}

In [98]:
df_train, df_test = train_test_split(df, test_size=0.6, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.98, random_state=RANDOM_SEED)
df_train.shape, df_val.shape, df_test.shape

((45053, 3), (1351, 3), (66229, 3))

In [99]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = ReviewDataset(
        reviews=df.review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len)

    return DataLoader(ds, batch_size=batch_size)

In [100]:
BATCH_SIZE = 32

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

# MODEL 

In [101]:
bert_model = BertModel.from_pretrained('cointegrated/rubert-tiny')

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
class SentimentClassifier(nn.Module):
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.1)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(outputs["pooler_output"])
        return self.out(output)

In [103]:
model = SentimentClassifier(len(class_names))
model = model.to(device)
model

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29564, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affin

In [105]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0
  
    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)


def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [104]:
EPOCHS = 2

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5)
loss_fn = nn.CrossEntropyLoss().to(device)

In [111]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in tqdm(range(EPOCHS)):
    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device, len(df_train))

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))

    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state_2.bin')
        best_accuracy = val_acc
        

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1408 [00:00<?, ?it/s]

Train loss 0.9262768057682975 accuracy 0.8006348078929262


  0%|          | 0/43 [00:00<?, ?it/s]

Val loss 0.9509508093429166 accuracy 0.7831236121391562



  0%|          | 0/1408 [00:00<?, ?it/s]

Train loss 0.8396292509913276 accuracy 0.8132643775109316


  0%|          | 0/43 [00:00<?, ?it/s]

Val loss 0.8906586087027262 accuracy 0.7920059215396003

CPU times: total: 41.5 s
Wall time: 2min 18s


In [112]:
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)

test_acc.item()

  0%|          | 0/2070 [00:00<?, ?it/s]

0.7958900179679597

In [113]:
def get_predictions(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:

            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

In [114]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(model, test_data_loader)

In [115]:
print(classification_report(y_test, y_pred, labels=class_names))

              precision    recall  f1-score   support

          15       0.81      0.82      0.81       340
         159       0.89      0.94      0.92      1038
          74       0.87      0.89      0.88      1087
           7       0.90      0.95      0.92       742
         200       0.93      0.96      0.95       528
          14       0.84      0.75      0.79       717
          24       0.57      0.89      0.69        79
          27       0.86      0.97      0.91       118
          32       0.93      0.91      0.92       494
          36       0.76      0.99      0.86        92
          88       0.79      0.69      0.74       248
          71       0.75      0.88      0.81        85
         111       0.93      0.98      0.95       415
          86       0.89      0.90      0.90       268
         138       0.83      0.87      0.85       294
         124       0.54      0.10      0.17       127
         110       0.89      0.74      0.81        77
         191       0.88    

In [122]:
idx = 10

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx]
})

pred_df.sort_values('values', ascending=False)

Unnamed: 0,class_names,values
157,183,0.930028
66,195,0.011675
239,241,0.005407
172,70,0.002971
242,245,0.002670
...,...,...
26,219,0.000004
165,181,0.000004
10,88,0.000003
65,26,0.000003


In [191]:
count = 0
pred = []
for idx in range(len(y_review_texts)):

    review_text = y_review_texts[idx]
    true_sentiment = y_test[idx]
    pred_df = pd.DataFrame({
      'class_names': class_names,
      'values': y_pred_probs[idx]
    })
    
    if any(pred_df.sort_values('values', ascending=False)['values'].values > 0.9):
        count += 1
    pred.append(pred_df.sort_values('values', ascending=False)['values'].values)

In [148]:
from sklearn.metrics import roc_auc_score

In [199]:
roc_auc_score(np.array(y_test).reshape(-1, 1), pred, multi_class='ovr')

0.7371488891887589

# Active

In [202]:
df = df.sort_values('frequency', ascending=False)

Unnamed: 0,review,frequency,sentiment
168695,товара,1500335,157
170816,цена,152956,243
170580,цвета,149952,239
155391,помада,148516,157
166624,сохнет,103716,34
...,...,...,...
21743,будет стойко,10,202
157741,появляется трещина,10,173
67682,достаточно смуглой,10,210
86901,лица из-за,10,87
