In [1]:
# Fetch the project repository; the `src.*` modules imported further down live in it.
!git clone https://github.com/PolyakovMA/sentiment-transformer.git

Cloning into 'sentiment-transformer'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (110/110), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 110 (delta 60), reused 57 (delta 24), pack-reused 0 (from 0)[K
Receiving objects: 100% (110/110), 2.75 MiB | 14.17 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [2]:
# Switch the working directory to the cloned repository before the `src.*` imports run.
%cd sentiment-transformer

/content/sentiment-transformer


In [3]:
import pandas as pd
import torch
from src.data_preprocessing import load_dataset, preprocess_dataset
from src.model import load_model
from src.train import train_model, get_device
from src.evaluate import compute_metrics
from datasets import Dataset
from src.utils import set_global_seed, get_class_weights, custom_compute_loss
from transformers import pipeline

In [4]:
# Seed through the project helper (presumably covers the Python/NumPy/PyTorch RNGs —
# confirm in src/utils). 42 is also the seed passed to the train/test split below.
set_global_seed(42)

In [5]:
# Ask the project helper which torch device to use and display it.
device = get_device()
device

device(type='cuda')

In [6]:
# Location of the raw reviews CSV, split over two literals for readability
# (adjacent string literals are concatenated into a single value).
url = (
    "https://raw.githubusercontent.com/aiedu-courses/all_datasets/"
    "refs/heads/main/Womens%20Clothing%20E-Commerce%20Reviews.csv"
)

In [7]:
# Read the raw CSV straight from `url` to look at all of the original columns.
df = pd.read_csv(url)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [8]:
# Load the same CSV through the project helper. Per the displayed output, the
# returned frame carries the `Review Text` and `labels` columns.
dataset = load_dataset(url)
dataset.head()

Unnamed: 0,Review Text,labels
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [9]:
# Clean the review text with the project helper. Judging by the displayed output
# it lowercases the text and strips punctuation — confirm in src/data_preprocessing.
# NOTE: `dataset` is rebound here, so the unprocessed frame is no longer reachable
# under that name.
dataset = preprocess_dataset(dataset)
dataset.head()

Unnamed: 0,Review Text,labels
0,absolutely wonderful silky and sexy and comfor...,1
1,love this dress its sooo pretty i happened to ...,1
2,i had such high hopes for this dress and reall...,0
3,i love love love this jumpsuit its fun flirty ...,1
4,this shirt is very flattering to all due to th...,1


In [None]:
# The project helper returns a (tokenizer, model) pair; both are used below.
tokenizer, model = load_model()

In [11]:
# Display the tokenizer returned by load_model() for inspection.
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [12]:
# Display the model returned by load_model() for inspection.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [11]:
# Tokenize each review on its own: every row is truncated where needed and
# padded out to the tokenizer's maximum length.
tokenized = dataset['Review Text'].apply(
    lambda review: tokenizer(review, truncation=True, padding='max_length')
)
tokenized.head()

Unnamed: 0,Review Text
0,"[input_ids, attention_mask]"
1,"[input_ids, attention_mask]"
2,"[input_ids, attention_mask]"
3,"[input_ids, attention_mask]"
4,"[input_ids, attention_mask]"


In [12]:
# Attach the token ids and attention masks as columns and drop the raw text.
# Written as one expression that builds a new frame so the cell can be re-run:
# `assign` overwrites the token columns if they already exist, and
# `errors='ignore'` keeps `drop` from raising once `Review Text` is gone.
dataset = dataset.assign(
    input_ids=tokenized.apply(lambda enc: enc['input_ids']),
    attention_mask=tokenized.apply(lambda enc: enc['attention_mask']),
).drop(columns='Review Text', errors='ignore')
dataset.head()

Unnamed: 0,labels,input_ids,attention_mask
0,1,"[101, 7078, 6919, 18848, 1998, 7916, 1998, 662...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ..."
1,1,"[101, 2293, 2023, 4377, 2049, 17111, 2080, 349...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"[101, 1045, 2018, 2107, 2152, 8069, 2005, 2023...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"[101, 1045, 2293, 2293, 2293, 2023, 14523, 146...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,"[101, 2023, 3797, 2003, 2200, 4257, 17989, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [13]:
# Convert the frame to a `Dataset` and hold out 20% of the rows,
# using a fixed seed so the split is repeatable.
ds = (
    Dataset.from_pandas(dataset)
    .train_test_split(seed=42, test_size=0.2)
)
ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 18788
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 4698
    })
})

In [14]:
# Compute class weights from the training-split labels only; they are passed to
# custom_compute_loss by compute_loss_func below.
class_weights = get_class_weights(ds['train']['labels'])
class_weights

tensor([2.7900, 0.6092])

In [15]:
# Tensor.to() returns a new tensor rather than moving the tensor in place, so
# the result must be bound back to the name; otherwise `class_weights` stays on
# the CPU and this cell only displays a temporary copy.
class_weights = class_weights.to(device)
class_weights

tensor([2.7900, 0.6092], device='cuda:0')

In [16]:
def compute_loss_func(outputs, labels, num_items_in_batch):
    """Adapt ``custom_compute_loss`` to the loss-callback signature used for training.

    Args:
        outputs: Model outputs, forwarded unchanged to ``custom_compute_loss``.
        labels: Target labels, forwarded unchanged to ``custom_compute_loss``.
        num_items_in_batch: Accepted to match the callback signature; not used.

    Returns:
        Whatever ``custom_compute_loss`` returns for the module-level
        ``class_weights``.
    """
    # `class_weights` is looked up in the notebook's global namespace at call time.
    weighted_loss = custom_compute_loss(outputs, labels, class_weights)
    return weighted_loss


In [17]:
# Fine-tune through the project helper with the weighted loss callback.
# NOTE(review): the held-out `ds['test']` split is passed as the third argument;
# it appears to double as the validation set used to pick the best checkpoint,
# so there is no separate untouched test set — confirm in src/train.
trainer = train_model(
    model,
    ds['train'],
    ds['test'],
    output_dir='./results',
    compute_metrics=compute_metrics,
    compute_loss_func=compute_loss_func,
)

Epoch,Training Loss,Validation Loss,F1
1,No log,0.438869,0.903409
2,No log,0.509698,0.911877
3,No log,0.693872,0.911276



Best model saved at: ./results/checkpoint-4698
Best F1 score: 0.9119


In [23]:
# Wrap the model held by the trainer in a text-classification pipeline,
# reusing the tokenizer and device set up earlier in the notebook.
classifier = pipeline(
    "text-classification",
    device=device,
    tokenizer=tokenizer,
    model=trainer.model,
)

Device set to use cuda


In [24]:
# Hand-written sample reviews for a quick sanity check of the classifier:
# one positive, one negative and one mixed.
reviews = [
    "I love this dress, it fits perfectly and the fabric is great!",
    "The product is terrible, very bad quality and waste of money.",
    "Not bad, but the size was a little off.",
]

In [25]:
# Classify all sample reviews in one call; each prediction is a dict with
# `label` and `score` keys (see the displayed output).
preds = classifier(reviews)
preds

[{'label': 'LABEL_1', 'score': 0.9995579123497009},
 {'label': 'LABEL_0', 'score': 0.9911150932312012},
 {'label': 'LABEL_1', 'score': 0.9890550374984741}]

In [26]:
# Translate the pipeline's generic label ids into readable class names.
label_map = dict(
    LABEL_0="Negative",
    LABEL_1="Positive",
)

In [27]:
# Print each sample review next to its readable class name and rounded score.
for text, pred in zip(reviews, preds):
    raw_label = pred["label"]
    # Fall back to the raw label id when it has no entry in `label_map`.
    label = label_map.get(raw_label, raw_label)
    score = round(pred["score"], 3)
    print(f"Отзыв: {text}\n→ Класс: {label}, вероятность: {score}\n")

Отзыв: I love this dress, it fits perfectly and the fabric is great!
→ Класс: Positive, вероятность: 1.0

Отзыв: The product is terrible, very bad quality and waste of money.
→ Класс: Negative, вероятность: 0.991

Отзыв: Not bad, but the size was a little off.
→ Класс: Positive, вероятность: 0.989

