# Relevant

In [4]:
import pandas as pd
import torch
import numpy as np

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
from tqdm import tqdm

# Data loading: dataset that tokenizes each text when an item is requested
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, pandas Series, ...). Every item is
            passed through ``str()`` before tokenization.
        labels: Iterable of integer class labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that accepts the keyword
            arguments used in ``__getitem__``.
        max_len: Value forwarded as ``max_length`` to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize both inputs as plain lists so that __getitem__ indexes
        # by POSITION. A pandas Series indexes by label with ``[]``; a Series
        # whose index is not 0..n-1 (e.g. after filtering or a split) would
        # otherwise raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Evaluation of the checkpoint stored in "models/relevant": run it over the
# CSV below and print the micro-averaged F1 of the argmax predictions.
model_path = "models/relevant"
batch_size = 8
max_len = 128

# NOTE(review): the variables are named test_* but the file read here is
# train_data.csv — confirm this is a held-out split, not the training data.
test_data = pd.read_csv("train_Feedback/train_data.csv")
test_labels = test_data['is_relevant']

# The four answer columns are concatenated left to right with no separator.
# NOTE(review): a missing value in any column would make the whole
# concatenation NaN for that row — confirm the columns contain no NaN.
question_columns = ['question_2', 'question_3', 'question_4', 'question_5']
combined = test_data[question_columns[0]]
for column in question_columns[1:]:
    combined = combined + test_data[column]
test_texts = combined.to_list()

# NOTE(review): the tokenizer is loaded by the identifier below rather than
# from model_path — presumably the same vocabulary; confirm.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_len)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

model_relevant = BertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model_relevant.to(device)

# Validation
model_relevant.eval()
val_preds = []
val_labels = []
# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].numpy()

        outputs = model_relevant(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().cpu().numpy().argmax(axis=1)

        val_preds.extend(preds)
        val_labels.extend(labels)

val_f1 = f1_score(val_labels, val_preds, average='micro')
print(f'Test F1-Score: {val_f1}')

100%|██████████| 13/13 [00:03<00:00,  3.61it/s]

Test F1-Score: 0.8932038834951457





# Object

In [2]:
import pandas as pd
import torch
import numpy as np

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
from tqdm import tqdm

# Data loading: dataset that tokenizes each text when an item is requested
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, pandas Series, ...). Every item is
            passed through ``str()`` before tokenization.
        labels: Iterable of integer class labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that accepts the keyword
            arguments used in ``__getitem__``.
        max_len: Value forwarded as ``max_length`` to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize both inputs as plain lists so that __getitem__ indexes
        # by POSITION. A pandas Series indexes by label with ``[]``; a Series
        # whose index is not 0..n-1 (e.g. after filtering or a split) would
        # otherwise raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Evaluation of the checkpoint stored in "models/object": run it over the CSV
# below and print the micro-averaged F1 of the argmax predictions.
model_path = "models/object"
batch_size = 8
max_len = 128

# NOTE(review): the variables are named test_* but the file read here is
# train_data.csv — confirm this is a held-out split, not the training data.
test_data = pd.read_csv("train_data.csv")
test_labels = test_data['object']

# The four answer columns are concatenated left to right with no separator.
# NOTE(review): a missing value in any column would make the whole
# concatenation NaN for that row — confirm the columns contain no NaN.
question_columns = ['question_2', 'question_3', 'question_4', 'question_5']
combined = test_data[question_columns[0]]
for column in question_columns[1:]:
    combined = combined + test_data[column]
test_texts = combined.to_list()

# Tokenizer and model weights are both read from the same checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(model_path)

test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_len)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# The variable keeps the name `model_relevant` even though it holds the
# "object" checkpoint; the name is left unchanged in case other cells use it.
model_relevant = BertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model_relevant.to(device)

# Validation
model_relevant.eval()
val_preds = []
val_labels = []
# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].numpy()

        outputs = model_relevant(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().cpu().numpy().argmax(axis=1)

        val_preds.extend(preds)
        val_labels.extend(labels)

val_f1 = f1_score(val_labels, val_preds, average='micro')
print(f'Test F1-Score: {val_f1}')

100%|██████████| 13/13 [00:03<00:00,  3.98it/s]

Test F1-Score: 0.9611650485436893





# Positive

In [4]:
import pandas as pd
import torch
import numpy as np

from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import f1_score
from tqdm import tqdm

# Data loading: dataset that tokenizes each text when an item is requested
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Iterable of raw texts (list, pandas Series, ...). Every item is
            passed through ``str()`` before tokenization.
        labels: Iterable of integer class labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` that accepts the keyword
            arguments used in ``__getitem__``.
        max_len: Value forwarded as ``max_length`` to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize both inputs as plain lists so that __getitem__ indexes
        # by POSITION. A pandas Series indexes by label with ``[]``; a Series
        # whose index is not 0..n-1 (e.g. after filtering or a split) would
        # otherwise raise KeyError or silently return a different row.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the ``idx``-th text and return it with its label.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer outputs flattened to 1-D) and ``'label'`` (a
            ``torch.long`` scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Evaluation of the checkpoint stored in "models/positive": run it over the
# CSV below and print the micro-averaged F1 of the argmax predictions.
model_path = "models/positive"
batch_size = 8
max_len = 128

# NOTE(review): the variables are named test_* but the file read here is
# train_data.csv — confirm this is a held-out split, not the training data.
test_data = pd.read_csv("train_data.csv")
test_labels = test_data['is_positive']

# The four answer columns are concatenated left to right with no separator.
# NOTE(review): a missing value in any column would make the whole
# concatenation NaN for that row — confirm the columns contain no NaN.
question_columns = ['question_2', 'question_3', 'question_4', 'question_5']
combined = test_data[question_columns[0]]
for column in question_columns[1:]:
    combined = combined + test_data[column]
test_texts = combined.to_list()

# Tokenizer and model weights are both read from the same checkpoint directory.
tokenizer = BertTokenizer.from_pretrained(model_path)

test_dataset = CustomDataset(test_texts, test_labels, tokenizer, max_len)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# The variable keeps the name `model_relevant` even though it holds the
# "positive" checkpoint; the name is left unchanged in case other cells use it.
model_relevant = BertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model_relevant.to(device)

# Validation
model_relevant.eval()
val_preds = []
val_labels = []
# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(test_dataloader, total=len(test_dataloader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].numpy()

        outputs = model_relevant(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Predicted class = index of the largest logit in each row.
        preds = logits.detach().cpu().numpy().argmax(axis=1)

        val_preds.extend(preds)
        val_labels.extend(labels)

val_f1 = f1_score(val_labels, val_preds, average='micro')
print(f'Test F1-Score: {val_f1}')

100%|██████████| 13/13 [00:03<00:00,  3.89it/s]

Test F1-Score: 0.970873786407767



