In [None]:
%pip install transformers

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss
import time, os


In [None]:
# Alternative starting point (kept for reference): load the base checkpoint
# from the Hugging Face hub instead of a local directory.
# model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=2)
# tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

# Load the model and tokenizer from the `ruBERT_1.0acc` directory located in
# the current working directory. os.path.join picks the separator of the host
# OS; a hard-coded "\\" only resolves to a subdirectory on Windows.
current_path = os.getcwd()
model_path = os.path.join(current_path, "ruBERT_1.0acc")
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)
tokenizer = BertTokenizer.from_pretrained(model_path)

In [None]:
# Read the semicolon-separated dataset, keeping only the question text column
# ('Вопрос') and the class label column ('Класс').
df = pd.read_csv(
    "dataset.csv",
    sep=';',
    usecols=['Вопрос', 'Класс'],
)
# Print the column dtypes and non-null counts as a quick sanity check.
df.info()

In [None]:
# Inputs are the question texts, targets are their class labels.
X, y = df['Вопрос'], df['Класс']

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions equal in both parts; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Reassemble each part into a two-column frame.
train_df = pd.DataFrame({'Вопрос': X_train, 'Класс': y_train})
test_df = pd.DataFrame({'Вопрос': X_test, 'Класс': y_test})

# Display the training part.
train_df

In [None]:
# Display the held-out test part of the split.
test_df

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes a text column once, up front, and serves one dict of tensors per row."""

    def __init__(self, dataframe, question_column, class_column, tokenizer, max_length):
        """
        Tokenize every text and store the labels as a tensor.

        Args:
            dataframe (pandas.DataFrame): Table holding the text and label columns.
            question_column (str): Name of the column with the texts (questions).
            class_column (str): Name of the column with the target classes.
            tokenizer: Callable invoked on the list of texts with
                ``truncation=True, padding=True, max_length=max_length``; its
                result must provide ``.items()`` whose values are indexable by row.
            max_length (int): Maximum tokenized length handed to the tokenizer.
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # All texts are encoded in a single tokenizer call.
        texts = list(dataframe[question_column])
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

        targets = dataframe[class_column].values
        self.labels = torch.tensor(targets)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus the row's ``labels`` entry."""
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample
    
# Wrap the training frame in a TextDataset (tokenizer max_length of 128),
# then show the sample at index 1 as a quick check of the encoding.
train_dataset = TextDataset(
    dataframe=train_df,
    question_column='Вопрос',
    class_column='Класс',
    tokenizer=tokenizer,
    max_length=128,
)
train_dataset[1]

In [None]:
# Wrap the test frame with the same columns, tokenizer and max_length as the
# training dataset, then show the sample at index 1.
test_dataset = TextDataset(
    dataframe=test_df,
    question_column='Вопрос',
    class_column='Класс',
    tokenizer=tokenizer,
    max_length=128,
)
test_dataset[1]

In [None]:
# Batches of 16 samples; the training data is reshuffled on every pass.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation gains nothing from shuffling: a fixed order keeps the collected
# predictions aligned with the rows of test_df and makes runs repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Check whether PyTorch can see a CUDA device (displayed as the cell output).
torch.cuda.is_available()

In [None]:
# Ask PyTorch's caching allocator to release GPU memory it holds but no longer uses.
torch.cuda.empty_cache()

In [None]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model to the chosen device before creating the optimizer over its
# parameters.
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# Fine-tune for 3 epochs, timing every epoch and the run as a whole.
start_time = time.time()

for epoch in range(3):
    epoch_start_time = time.time()
    # from_pretrained() hands the model back in evaluation mode, and the
    # evaluation cell calls model.eval() as well. Switch to training mode
    # explicitly so dropout is active every time this cell is (re)run.
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Drop gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
    epoch_end_time = time.time()
    epoch_time = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch+1} ended. Time taken: {epoch_time:.2f} seconds")

end_time = time.time()
total_time = end_time - start_time
print(f"\nTotal training time: {total_time:.2f} seconds")
        

In [None]:
# Evaluate on the test set with dropout disabled and gradient tracking off.
model.eval()
predictions = []             # predicted class index per sample
positive_probabilities = []  # softmax probability of class 1 per sample
labels = []                  # true class per sample

with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device))
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        # Log loss is defined on predicted probabilities, so keep the
        # probability of the positive class next to the hard prediction.
        positive_probabilities.extend(torch.softmax(logits, dim=-1)[:, 1].tolist())
        labels.extend(batch['labels'].tolist())

accuracy = accuracy_score(labels, predictions)
precision = precision_score(labels, predictions, average='binary')  # For binary classification
recall = recall_score(labels, predictions, average='binary')  # For binary classification
# The result goes into its own name: assigning it to `log_loss` would replace
# the imported function and make this cell fail when it is run a second time.
# `labels=[0, 1]` keeps the metric defined even if a batch of test data happens
# to contain a single class.
logloss_value = log_loss(labels, positive_probabilities, labels=[0, 1])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Logloss: {logloss_value}")

In [None]:
def predict(text):
    """
    Classify one text with the module-level ``model`` and ``tokenizer``.

    Args:
        text (str): The text to classify.

    Returns:
        int: Index of the class with the highest logit.
    """
    # Inference must not depend on the mode an earlier cell left the model in;
    # evaluation mode disables dropout so the result is deterministic.
    model.eval()

    # Tokenize the input text and move the tensors to the model's device.
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Forward pass without gradient tracking; pick the highest-scoring class.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

In [None]:
# Try the classifier on a sample query; the predicted class index is the cell output.
predict("адрес преподов")

In [None]:
# Write the tokenizer first and then the model into the same output directory.
for artifact in (tokenizer, model):
    artifact.save_pretrained('./guardian/ruBERT/')