In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import torch
from torch.utils.data import Dataset

class CustomDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = str(self.texts[idx])
    target = self.targets[idx]

    encoding = self.tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }


In [4]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
train_l = []
valid_l = []
train_a = []
valid_a = []
test_prec = []
test_rec = []
test_f1=[]
class BertClassifier:
    global train_l
    global valid_l 
    global train_a 
    global valid_a 
    global test_prec 
    global test_rec 
    global test_f1
    def __init__(self, model_path, tokenizer_path, n_classes=2, epochs=1, model_save_path='/content/bert.pt'):
        self.model = BertForSequenceClassification.from_pretrained(model_path)
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model_save_path=model_save_path
        self.max_len = 512
        self.epochs = epochs
        self.out_features = self.model.bert.encoder.layer[1].output.dense.out_features
        self.model.classifier = torch.nn.Linear(self.out_features, n_classes)
        self.model.to(self.device)
        self.train_loader=None
    
    def preparation(self, X_train, y_train, X_valid, y_valid):
        self.train_set = CustomDataset(X_train, y_train, self.tokenizer)
        self.valid_set = CustomDataset(X_valid, y_valid, self.tokenizer)

        self.train_loader = DataLoader(self.train_set, batch_size=2, shuffle=True)
        self.valid_loader = DataLoader(self.valid_set, batch_size=2, shuffle=True)

        self.optimizer = AdamW(self.model.parameters(), lr=2e-5, correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=0,
                num_training_steps=len(self.train_loader) * self.epochs
            )
        self.loss_fn = torch.nn.CrossEntropyLoss().to(self.device)
            
    def fit(self):
        self.model = self.model.train()
        losses = []
        correct_predictions = 0
        lo=0
        for data in self.train_loader:
            lo=lo+2
            if lo%100==0:
                print('итерация ',lo,' из ',len(self.train_loader))
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)

            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask
                )

            preds = torch.argmax(outputs.logits, dim=1)
            loss = self.loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)

            losses.append(loss.item())
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

        train_acc = correct_predictions.double() / len(self.train_set)
        train_loss = np.mean(losses)
        return train_acc, train_loss
    
    def eval(self):
        self.model = self.model.eval()
        losses = []
        correct_predictions = 0

        with torch.no_grad():
            for data in self.valid_loader:
                input_ids = data["input_ids"].to(self.device)
                attention_mask = data["attention_mask"].to(self.device)
                targets = data["targets"].to(self.device)

                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                    )

                preds = torch.argmax(outputs.logits, dim=1)
                loss = self.loss_fn(outputs.logits, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())
        
        val_acc = correct_predictions.double() / len(self.valid_set)
        val_loss = np.mean(losses)
        return val_acc, val_loss
    
    def train(self):
        import copy
        global train_l
        global valid_l 
        global train_a 
        global valid_a 
        global test_prec 
        global test_rec 
        global test_f1
        best_accuracy = 0
        t_los=[1]
        # t_los=1
        for epoch in range(self.epochs):
            print(f'Epoch {epoch + 1}/{self.epochs}')
            train_acc, train_loss = self.fit()
            print(f'Train loss {train_loss} accuracy {train_acc}')

            val_acc, val_loss = self.eval()
            print(f'Val loss {val_loss} accuracy {val_acc}')
            print('-' * 10)
            if (len(t_los)==2) and (train_loss>t_los[1] and train_loss>t_los[0]):
                # train_l.append(copy.deepcopy(train_loss))
                # train_a.append(copy.deepcopy(train_acc))
                # valid_l.append(copy.deepcopy(val_loss))
                # valid_a.append(copy.deepcopy(best_accuracy))
                break
            else:
                # torch.save(self.model, self.model_save_path)
                if len(t_los)==1:
                    t_los.append(copy.deepcopy(train_loss))
                else:
                    t_los.pop(0)
                    t_los.append(copy.deepcopy(train_loss))
            # if train_loss<t_los:
            #     torch.save(self.model, self.model_save_path)
            #     t_los=train_loss
            # else:
            #     break
            if val_acc > best_accuracy:
                torch.save(self.model, self.model_save_path)
                best_accuracy = val_acc
                train_l.append(copy.deepcopy(train_loss))
                train_a.append(copy.deepcopy(train_acc))
                valid_l.append(copy.deepcopy(val_loss))
                valid_a.append(copy.deepcopy(best_accuracy))
        self.model = torch.load(self.model_save_path)
    
    def predict(self, text):
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        out = {
              'text': text,
              'input_ids': encoding['input_ids'].flatten(),
              'attention_mask': encoding['attention_mask'].flatten()
          }
        
        input_ids = out["input_ids"].to(self.device)
        attention_mask = out["attention_mask"].to(self.device)
        
        outputs = self.model(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0)
        )
        
        prediction = torch.argmax(outputs.logits, dim=1).cpu().numpy()[0]

        return prediction


In [None]:
from sklearn.model_selection import train_test_split
import copy
import pandas as pd
dataframe=pd.read_csv('corrrect_dataset.csv')
dataframe.rename(columns={'News': 'text', 'FAKE?': 'label'}, inplace=True)
x=dataframe['text']
y=dataframe['label']
x1, x2, y1, y2 = train_test_split(x, y, train_size=0.8, random_state=10,stratify=y)
x_3, x_4, y_3, y_4 = train_test_split(x1, y1, train_size=0.5, random_state=10,stratify=y1)
x_tr1, x_te1, y_tr1, y_te1 = train_test_split(x_3, y_3, train_size=0.5, random_state=10,stratify=y_3)
x_tr2, x_te2, y_tr2, y_te2 = train_test_split(x_4, y_4, train_size=0.5, random_state=10,stratify=y_4)
elems=[]
s=pd.DataFrame()
# 1
te=pd.concat([pd.DataFrame(x2),pd.DataFrame(y2)],axis=1).reset_index(drop=True)
t0=pd.concat([pd.DataFrame(x_tr1),pd.DataFrame(y_tr1)],axis=1).reset_index(drop=True)
t1=pd.concat([pd.DataFrame(x_tr2),pd.DataFrame(y_tr2)],axis=1).reset_index(drop=True)
t2=pd.concat([pd.DataFrame(x_te1),pd.DataFrame(y_te1)],axis=1).reset_index(drop=True)
t3=pd.concat([pd.DataFrame(x_te2),pd.DataFrame(y_te2)],axis=1).reset_index(drop=True)
t4=pd.concat([t1,t2]).reset_index(drop=True)
t4=pd.concat([t4,t3]).reset_index(drop=True)
tr=pd.concat([t4,t0]).reset_index(drop=True)
tx,vx,ty,vy= train_test_split(tr['text'], tr['label'], train_size=0.8, random_state=10,stratify=tr['label'])
tr=pd.concat([pd.DataFrame(tx),pd.DataFrame(ty)],axis=1).reset_index(drop=True)
va=pd.concat([pd.DataFrame(vx),pd.DataFrame(vy)],axis=1).reset_index(drop=True)
elems.append([copy.deepcopy(tr),copy.deepcopy(te),copy.deepcopy(va)])
# # 2
# te=pd.concat([pd.DataFrame(x_tr1),pd.DataFrame(y_tr1)],axis=1).reset_index(drop=True)
# t0=pd.concat([pd.DataFrame(x2),pd.DataFrame(y2)],axis=1).reset_index(drop=True)
# t1=pd.concat([pd.DataFrame(x_tr2),pd.DataFrame(y_tr2)],axis=1).reset_index(drop=True)
# t2=pd.concat([pd.DataFrame(x_te1),pd.DataFrame(y_te1)],axis=1).reset_index(drop=True)
# t3=pd.concat([pd.DataFrame(x_te2),pd.DataFrame(y_te2)],axis=1).reset_index(drop=True)
# t4=pd.concat([t1,t2]).reset_index(drop=True)
# t4=pd.concat([t4,t3]).reset_index(drop=True)
# tr=pd.concat([t4,t0]).reset_index(drop=True)
# tx,vx,ty,vy= train_test_split(tr['text'], tr['label'], train_size=0.8, random_state=10,stratify=tr['label'])
# tr=pd.concat([pd.DataFrame(tx),pd.DataFrame(ty)],axis=1).reset_index(drop=True)
# va=pd.concat([pd.DataFrame(vx),pd.DataFrame(vy)],axis=1).reset_index(drop=True)
# elems.append([copy.deepcopy(tr),copy.deepcopy(te),copy.deepcopy(va)])
# # 3
# te=pd.concat([pd.DataFrame(x_tr2),pd.DataFrame(y_tr2)],axis=1).reset_index(drop=True)
# t0=pd.concat([pd.DataFrame(x2),pd.DataFrame(y2)],axis=1).reset_index(drop=True)
# t1=pd.concat([pd.DataFrame(x_tr1),pd.DataFrame(y_tr1)],axis=1).reset_index(drop=True)
# t2=pd.concat([pd.DataFrame(x_te1),pd.DataFrame(y_te1)],axis=1).reset_index(drop=True)
# t3=pd.concat([pd.DataFrame(x_te2),pd.DataFrame(y_te2)],axis=1).reset_index(drop=True)
# t4=pd.concat([t1,t2]).reset_index(drop=True)
# t4=pd.concat([t4,t3]).reset_index(drop=True)
# tr=pd.concat([t4,t0]).reset_index(drop=True)
# tx,vx,ty,vy= train_test_split(tr['text'], tr['label'], train_size=0.8, random_state=10,stratify=tr['label'])
# tr=pd.concat([pd.DataFrame(tx),pd.DataFrame(ty)],axis=1).reset_index(drop=True)
# va=pd.concat([pd.DataFrame(vx),pd.DataFrame(vy)],axis=1).reset_index(drop=True)
# elems.append([copy.deepcopy(tr),copy.deepcopy(te),copy.deepcopy(va)])
# # 4
# te=pd.concat([pd.DataFrame(x_te1),pd.DataFrame(y_te1)],axis=1).reset_index(drop=True)
# t0=pd.concat([pd.DataFrame(x2),pd.DataFrame(y2)],axis=1).reset_index(drop=True)
# t1=pd.concat([pd.DataFrame(x_tr2),pd.DataFrame(y_tr2)],axis=1).reset_index(drop=True)
# t2=pd.concat([pd.DataFrame(x_tr1),pd.DataFrame(y_tr1)],axis=1).reset_index(drop=True)
# t3=pd.concat([pd.DataFrame(x_te2),pd.DataFrame(y_te2)],axis=1).reset_index(drop=True)
# t4=pd.concat([t1,t2]).reset_index(drop=True)
# t4=pd.concat([t4,t3]).reset_index(drop=True)
# tr=pd.concat([t4,t0]).reset_index(drop=True)
# tx,vx,ty,vy= train_test_split(tr['text'], tr['label'], train_size=0.8, random_state=10,stratify=tr['label'])
# tr=pd.concat([pd.DataFrame(tx),pd.DataFrame(ty)],axis=1).reset_index(drop=True)
# va=pd.concat([pd.DataFrame(vx),pd.DataFrame(vy)],axis=1).reset_index(drop=True)
# elems.append([copy.deepcopy(tr),copy.deepcopy(te),copy.deepcopy(va)])
# # 5
# te=pd.concat([pd.DataFrame(x_te2),pd.DataFrame(y_te2)],axis=1).reset_index(drop=True)
# t0=pd.concat([pd.DataFrame(x2),pd.DataFrame(y2)],axis=1).reset_index(drop=True)
# t1=pd.concat([pd.DataFrame(x_tr2),pd.DataFrame(y_tr2)],axis=1).reset_index(drop=True)
# t2=pd.concat([pd.DataFrame(x_te1),pd.DataFrame(y_te1)],axis=1).reset_index(drop=True)
# t3=pd.concat([pd.DataFrame(x_tr1),pd.DataFrame(y_tr1)],axis=1).reset_index(drop=True)
# t4=pd.concat([t1,t2]).reset_index(drop=True)
# t4=pd.concat([t4,t3]).reset_index(drop=True)
# tr=pd.concat([t4,t0]).reset_index(drop=True)
# tx,vx,ty,vy= train_test_split(tr['text'], tr['label'], train_size=0.8, random_state=10,stratify=tr['label'])
# tr=pd.concat([pd.DataFrame(tx),pd.DataFrame(ty)],axis=1).reset_index(drop=True)
# va=pd.concat([pd.DataFrame(vx),pd.DataFrame(vy)],axis=1).reset_index(drop=True)
# elems.append([copy.deepcopy(tr),copy.deepcopy(te),copy.deepcopy(va)])
for i in range (0,len(elems)):
    train_data = elems[i][0]
    valid_data = elems[i][2]
    test_data  = elems[i][1]
    classifier = BertClassifier(
        model_path='cointegrated/rubert-tiny',
        tokenizer_path='cointegrated/rubert-tiny',
        n_classes=2,
        epochs=4,
        model_save_path='/content/bert.pt')
    classifier.preparation(
        X_train=list(train_data['text']),
        y_train=list(train_data['label']),
        X_valid=list(valid_data['text']),
        y_valid=list(valid_data['label']))
    classifier.train()
    texts = list(test_data['text'])
    labels = list(test_data['label'])
    predictions = [classifier.predict(t) for t in texts]
    from sklearn.metrics import precision_recall_fscore_support
    precision, recall, f1score = precision_recall_fscore_support(labels, predictions,average='macro')[:3]
    print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')
    test_prec.append(copy.deepcopy(precision)) 
    test_rec.append(copy.deepcopy(recall)) 
    test_f1.append(copy.deepcopy(f1score))
    

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

Epoch 1/4
итерация  100  из  256
итерация  200  из  256
итерация  300  из  256
итерация  400  из  256
итерация  500  из  256
Train loss 0.051035044979016675 accuracy 0.98046875
Val loss 0.0005440200079647184 accuracy 1.0
----------
Epoch 2/4
итерация  100  из  256
итерация  200  из  256
итерация  300  из  256
итерация  400  из  256
итерация  500  из  256
Train loss 0.00039198292830633363 accuracy 1.0
Val loss 0.00032755967549746856 accuracy 1.0
----------
Epoch 3/4
итерация  100  из  256
итерация  200  из  256
итерация  300  из  256


In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator, LinearLocator, LogLocator)
import matplotlib.ticker as ticker
import numpy as np
fig, ax = plt.subplots()
fig.set_size_inches(8.5, 8.5)
ax.plot(train_l, label="train loss")
ax.legend()
ax.plot(train_a, label="test accuracy")
ax.legend()
ax.plot(valid_l, label="val loss")
ax.legend()
ax.plot(valid_a, label="val accuracy")
ax.legend()
# ax.plot(test_prec, label="precision")
# ax.legend()
# ax.plot(test_rec, label="recall")
# ax.legend()
ax.plot(test_f1, label="f1")
ax.legend()

ax.yaxis.set_major_locator(MultipleLocator(base=0.04))
plt.show()

In [None]:
# import pandas as pd
# train_data = pd.read_csv('train_dataset.csv')
# valid_data = pd.read_csv('valid_dataset.csv')
# test_data  = pd.read_csv('test_dataaset.csv')
# classifier = BertClassifier(
#         model_path='cointegrated/rubert-tiny',
#         tokenizer_path='cointegrated/rubert-tiny',
#         n_classes=2,
#         epochs=2,
#         model_save_path='/content/bert.pt'
# )

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not i

In [None]:
# classifier.preparation(
#         X_train=list(train_data['text']),
#         y_train=list(train_data['label']),
#         X_valid=list(valid_data['text']),
#         y_valid=list(valid_data['label']))

In [None]:
# classifier.train()

Epoch 1/2
итерация  100  из 480
итерация  200  из 480
итерация  300  из 480
итерация  400  из 480
Train loss 0.6720701104670297 accuracy 0.6916666666666667
Val loss 0.6012049832691749 accuracy 0.7416666666666667
----------
Epoch 2/2
итерация  100  из 480
итерация  200  из 480
итерация  300  из 480
итерация  400  из 480
Train loss 0.5025482886024596 accuracy 0.7895833333333333
Val loss 0.6097739484238749 accuracy 0.7666666666666667
----------


In [None]:
# texts = list(test_data['text'])
# labels = list(test_data['label'])
# predictions = [classifier.predict(t) for t in texts]

In [None]:
# from sklearn.metrics import precision_recall_fscore_support
# precision, recall, f1score = precision_recall_fscore_support(labels, predictions,average='macro')[:3]
# print(f'precision: {precision}, recall: {recall}, f1score: {f1score}')

precision: 0.7071428571428572, recall: 0.7160551313093686, f1score: 0.7076023391812867
