In [None]:
!pip install gdown
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
from tqdm.notebook import tqdm
from torch import nn
import torch.optim as optim
from sklearn.metrics import f1_score
import gdown

# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices, so comment it out when debugging library behaviour.
warnings.filterwarnings('ignore')

In [None]:
# Download two files by id into the current working directory.
# NOTE(review): presumably these are reviews_train.csv and reviews_test.csv,
# which the next cell reads — confirm the ids still resolve to those files.
!gdown --id 1Mgj0o6Vahq2PymH9qcEN1bO4sC0oFyId
!gdown --id 1td5Zq6gvpIrj3d2jh-OrxXLD5-GhMWLM

In [None]:
# Training reviews: the `id` column becomes the frame's index.
train = pd.read_csv('./reviews_train.csv', index_col='id')

# Test reviews: read with the `Unnamed: 0` column as index, then move that
# index back into a regular column (leaving a fresh 0..n-1 index) and replace
# every missing cell with the placeholder string.
test = pd.read_csv('./reviews_test.csv', index_col='Unnamed: 0')
test = test.reset_index()
test = test.fillna('Ок')

In [None]:
# Recode the label value -1 as 0 in the `like` column; every other value is
# left unchanged.
train['like'] = train['like'].replace(to_replace=-1, value=0)

In [None]:
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer matching the pretrained sentiment checkpoint used in this notebook.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
)

class ReviewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one review per item.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``review`` column; in ``'train'`` mode it must also have
        a ``like`` column holding the labels. Rows are addressed by position,
        so the frame's index may be arbitrary.
    tokenizer : callable
        Hugging Face style tokenizer. It is called as
        ``tokenizer(text, max_length=..., padding="max_length",
        truncation=True, return_tensors="pt")`` and must return a mapping with
        ``input_ids`` and ``attention_mask`` tensors.
    mod : str
        ``'train'`` to also return a ``target``; any other value yields
        inputs only.
    max_length : int
        Every sample is padded or truncated to exactly this many tokens.
    """

    def __init__(self, dataset, tokenizer, mod='train', max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mod = mod

        if self.mod == 'train':
            # Map each distinct label value to a contiguous class index
            # (np.unique returns the values sorted).
            self.labels = np.unique(dataset['like'].values)
            self.label2num = {l: num for num, l in enumerate(self.labels)}

    def __getitem__(self, idx):
        """
        Return a dict with ``input_ids`` and ``mask`` (both 1-D tensors) and,
        in ``'train'`` mode, the integer class index under ``target``.
        """
        # Use positional lookup: a DataLoader sampler yields 0..len-1, which
        # need not coincide with the frame's index labels (e.g. an ``id``
        # index).
        text = self.dataset['review'].iloc[idx]

        # Use the tokenizer given to this instance (not a module global) and
        # let it truncate, so special tokens stay in place and the attention
        # mask always matches the ids.
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        item = {
            # Drop the leading batch dimension added by return_tensors="pt".
            "input_ids": encoded['input_ids'].view(-1),
            "mask": encoded['attention_mask'].view(-1),
        }

        if self.mod == 'train':
            label = self.dataset['like'].iloc[idx]
            item["target"] = self.label2num[label]

        return item

    def __len__(self):
        """
        Return the number of rows in the underlying frame.
        """
        return len(self.dataset)

# Wrap both frames; only the training split carries labels.
train_data = ReviewsDataset(dataset=train, tokenizer=tokenizer, mod='train')
test_data = ReviewsDataset(dataset=test, tokenizer=tokenizer, mod='test')

# Fix torch's global RNG seed for the cells that follow.
torch.manual_seed(2)

n_train_examples = len(train_data)
n_test_examples = len(test_data)
print(f"Number of training examples: {n_train_examples}")
print(f"Number of testing examples: {n_test_examples}")

In [None]:
from torch.utils.data import DataLoader

batch_size = 64

# Both loaders iterate in dataset order (no shuffling) with the same batch size.
loader_options = {"batch_size": batch_size, "shuffle": False}
train_loader = DataLoader(train_data, **loader_options)
test_loader = DataLoader(test_data, **loader_options)

# Using DistilBERT (fine-tuned on SST-2) for text classification.

In [None]:
# Load the pretrained sequence-classification checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

model = model.to(device)

In [None]:
# Per-batch predictions are collected here and joined into one array below.
pred_labels = []
true_labels = []
criterion = nn.BCELoss()

# Inference only: switch the model to eval mode and disable autograd.
model.eval()
with torch.no_grad():
    for i, batch in enumerate(tqdm(test_loader)):
        batch_ids = batch['input_ids'].to(device)
        batch_mask = batch['mask'].to(device)

        output = model(batch_ids, batch_mask)

        # Predicted class = index of the largest logit along the last axis.
        batch_pred = output.logits.argmax(-1)
        pred_labels.append(batch_pred.detach().cpu().numpy())

pred_labels = np.concatenate(pred_labels, axis=0)

In [None]:
# One-column frame of predictions, with class 0 mapped back to -1
# (the inverse of the recoding applied to the training labels).
submit = pd.DataFrame({'like': pred_labels})
submit['like'] = submit['like'].replace(to_replace=0, value=-1)
submit.head()

In [None]:
# Write the predictions to disk without the frame's index column.
submit.to_csv(path_or_buf='bert.csv', index=False)