In [1]:
%pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [84]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import time


In [3]:

# The classifier and its tokenizer are both loaded from the same pre-trained RuBERT checkpoint.
checkpoint = 'DeepPavlov/rubert-base-cased'

# num_labels=2 requests a two-class classification head on top of the encoder.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Read only the question text ('Вопрос') and label ('Класс') columns of the ';'-separated file.
df = pd.read_csv(
    "dataset.csv",
    delimiter=';',
    usecols=['Вопрос', 'Класс'],
)
# Print column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Вопрос  1200 non-null   object
 1   Класс   1200 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 18.9+ KB


In [42]:
# Inputs are the raw question texts ('Вопрос'); the target is the integer label in 'Класс'.
X, y = df['Вопрос'], df['Класс']

# Hold out 20% of the rows for testing. The split is stratified on the label and
# uses a fixed random_state so it is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Put each part back into a two-column frame with the original column names.
train_df = pd.DataFrame({'Вопрос': X_train, 'Класс': y_train})
test_df = pd.DataFrame({'Вопрос': X_test, 'Класс': y_test})

# Display the training split.
train_df

Unnamed: 0,Вопрос,Класс
865,"Определить holidayRequestID, employeeID, start...",0
1042,^C0d3&H@ck()+,0
625,Сколько всего видов птиц занесено в Красную кн...,0
775,"Найти все рейсы авиакомпании, следующие из Мос...",0
445,"Найти кандидатов, у которых оценка за вступите...",1
...,...,...
583,"Получить список курсов, на которые зарегистрир...",1
59,"Выберите id студентов, которые не зарегистриро...",1
371,"Вывести список всех студентов, у которых больш...",1
1130,Можно ли объединить несколько заказов в один д...,0


In [43]:
# Display the held-out test split for a quick visual check.
test_df

Unnamed: 0,Вопрос,Класс
562,"Получить список адресов студентов, проживающих...",1
582,"Вывести количество студентов, у которых пароль...",1
680,"Отобразить все картины эпохи Возрождения, хран...",0
987,What payment methods do you accept?,0
892,"Show carRentalID, companyName, location, daily...",0
...,...,...
504,"Найти студентов, у которых дата регистрации на...",1
787,Найти все товары в интернет-магазине электрони...,0
966,How can I improve my credit score quickly?,0
1124,Можно ли заказать услугу с выездом специалиста...,0


In [44]:
class TextDataset(Dataset):
    """Torch dataset pairing pre-tokenized question texts with their class labels.

    Every text is tokenized once in the constructor; indexing only converts the
    stored encoding of one row into tensors and attaches the matching label.
    """

    def __init__(self, dataframe, question_column, class_column, tokenizer, max_length):
        """
        Args:
            dataframe (pandas.DataFrame): Frame holding the texts and their labels.
            question_column (str): Name of the column with the text (questions).
            class_column (str): Name of the column with the target classes.
            tokenizer: Callable that receives the list of texts together with
                ``truncation=True``, ``padding=True`` and ``max_length`` and returns
                a mapping from field name to one encoded sequence per text.
            max_length (int): Maximum length passed to the tokenizer.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Encode the whole column in a single tokenizer call.
        texts = dataframe[question_column].tolist()
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        # Labels are taken positionally (.values), so the frame's index does not matter.
        self.labels = torch.tensor(dataframe[class_column].values)

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus the row's 'labels' entry."""
        sample = {}
        for field, sequences in self.encodings.items():
            sample[field] = torch.tensor(sequences[idx])
        sample['labels'] = self.labels[idx]
        return sample
    
# Tokenize the training split once and wrap it as a torch Dataset (max_length=128).
train_dataset = TextDataset(
    train_df,
    question_column='Вопрос',
    class_column='Класс',
    tokenizer=tokenizer,
    max_length=128,
)
# Show one encoded sample as a sanity check.
train_dataset[1]

{'input_ids': tensor([101, 228, 174, 137, 239, 143, 116, 184, 168, 236, 253, 120, 122, 126,
         102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [46]:
# Tokenize the held-out split with the same tokenizer and max_length as the training split.
test_dataset = TextDataset(
    test_df,
    question_column='Вопрос',
    class_column='Класс',
    tokenizer=tokenizer,
    max_length=128,
)
# Show one encoded sample as a sanity check.
test_dataset[1]

{'input_ids': tensor([  101, 42020,  6313,  9595, 17416,   128,   875,  5327, 77340, 38792,
           152, 28628,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [64]:
# Batch the tokenized datasets for training and evaluation.
# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps the collected
# predictions/labels in the same order on every run.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [49]:
# Check whether PyTorch can see a CUDA-capable GPU in this environment.
torch.cuda.is_available()

True

In [80]:
# Release cached GPU memory that PyTorch is holding but not currently using.
torch.cuda.empty_cache()

In [61]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model = model.to(device)

# AdamW over all model parameters with a fixed learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [63]:
start_time = time.time()

# from_pretrained() hands the model back in evaluation mode (dropout disabled),
# and the evaluation cell of this notebook calls model.eval() as well. Switch to
# training mode explicitly so dropout is active during fine-tuning, both on the
# first run and whenever this cell is re-run after an evaluation.
model.train()

# Fine-tune the model for 3 passes over the training data.
for epoch in range(3):
    epoch_start_time = time.time()
    for batch in train_dataloader:
        # Move the batch onto the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the classification loss.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Clear gradients from the previous step, backpropagate, then update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch+1} ended. Time taken: {epoch_time:.2f} seconds")

end_time = time.time()
total_time = end_time - start_time
print(f"\nTotal training time: {total_time:.2f} seconds")
        

Epoch 1 ended. Time taken: 16.55 seconds
Epoch 2 ended. Time taken: 16.61 seconds
Epoch 3 ended. Time taken: 16.65 seconds

Total training time: 49.81 seconds


In [89]:
# Evaluation mode: disables dropout so predictions are deterministic.
model.eval()
predictions, labels = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        )
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last dimension.
        predictions += torch.argmax(logits, dim=-1).tolist()
        labels += batch['labels'].tolist()

# Compare the collected predictions with the true labels (binary averaging).
accuracy = accuracy_score(labels, predictions)
precision = precision_score(labels, predictions, average='binary')
recall = recall_score(labels, predictions, average='binary')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [69]:
def predict(text, max_length=128):
    """Classify a single question and return the predicted class id.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text (str): The question to classify.
        max_length (int): Maximum number of tokens kept after truncation.
            Defaults to 128, the value used when the training and test
            datasets were tokenized in this notebook.

    Returns:
        int: Index of the largest logit, i.e. the predicted class.
    """
    # Force evaluation mode so dropout is off even if the model was left in
    # training mode by the fine-tuning loop.
    model.eval()

    # Tokenize with an explicit max_length so truncation matches the training
    # setup instead of depending on the tokenizer's configured default.
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Get the model's prediction without tracking gradients.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

In [98]:
# Try the fine-tuned classifier on a short ad-hoc query (roughly "teachers' address").
predict("адрес преподов")

0

In [81]:
# Write the tokenizer files and the fine-tuned model into the same directory.
save_dir = './guardian/ruBERT/'
tokenizer.save_pretrained(save_dir)
model.save_pretrained(save_dir)