In [1]:
import pandas as pd

data = pd.read_csv("comments_data.csv")
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,Reviews
0,Потрясающе полезная и интересная лекция
1,Потрясающая лекция!
2,Замечательные лекции! Спасибо за возможность у...
3,"Обалдеть, качество подачи материала, именно пр..."
4,"Лучшее объяснение функций, что я видел!"


In [2]:
data = data.dropna()
data['Reviews'] = data['Reviews'].astype(str)

In [3]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()
    
def predict_zero_shot(text, label_texts, model, tokenizer, label='entailment', normalize=True):
    label_texts
    tokens = tokenizer([text] * len(label_texts), label_texts, truncation=True, return_tensors='pt', padding=True)
    with torch.inference_mode():
        result = torch.softmax(model(**tokens.to(model.device)).logits, -1)
    proba = result[:, model.config.label2id[label]].cpu().numpy()
    if normalize:
        proba /= sum(proba)
    return proba

print(data['Reviews'].to_list()[0])
classes = ['Лекция', 'Курс', 'Лектор']
print(predict_zero_shot(data['Reviews'].to_list()[0], classes, model, tokenizer))

tqdm.pandas(desc="Processing")
data['object'] = data['Reviews'].progress_apply(lambda x: np.array(predict_zero_shot(x, classes, model, tokenizer)).argmax())
data['object'] = data['object'].astype(int)

  from .autonotebook import tqdm as notebook_tqdm
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM

Потрясающе полезная и интересная лекция
[0.3620666  0.31604737 0.321886  ]


=(true | false)
Processing: 100%|██████████| 1281/1281 [01:26<00:00, 14.87it/s]


In [4]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()
    
def predict_zero_shot(text, label_texts, model, tokenizer, label='entailment', normalize=True):
    label_texts
    tokens = tokenizer([text] * len(label_texts), label_texts, truncation=True, return_tensors='pt', padding=True)
    with torch.inference_mode():
        result = torch.softmax(model(**tokens.to(model.device)).logits, -1)
    proba = result[:, model.config.label2id[label]].cpu().numpy()
    if normalize:
        proba /= sum(proba)
    return proba

print(data['Reviews'].to_list()[0])
classes = ['Неинформативный', 'Информативный']
print(predict_zero_shot(data['Reviews'].to_list()[0], classes, model, tokenizer))

tqdm.pandas(desc="Processing")
data['is_relevant'] = data['Reviews'].progress_apply(lambda x: np.array(predict_zero_shot(x, classes, model, tokenizer)).argmax())
data['is_relevant'] = data['is_relevant'].astype(int)

Потрясающе полезная и интересная лекция
[0.00357977 0.99642026]


Processing: 100%|██████████| 1281/1281 [01:13<00:00, 17.35it/s]


In [5]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

model_checkpoint = 'cointegrated/rubert-base-cased-nli-threeway'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()
    
def predict_zero_shot(text, label_texts, model, tokenizer, label='entailment', normalize=True):
    label_texts
    tokens = tokenizer([text] * len(label_texts), label_texts, truncation=True, return_tensors='pt', padding=True)
    with torch.inference_mode():
        result = torch.softmax(model(**tokens.to(model.device)).logits, -1)
    proba = result[:, model.config.label2id[label]].cpu().numpy()
    if normalize:
        proba /= sum(proba)
    return proba

print(data['Reviews'].to_list()[0])
classes = ['Негативный', 'Позитивный']
print(predict_zero_shot(data['Reviews'].to_list()[0], classes, model, tokenizer))

tqdm.pandas(desc="Processing")
data['is_positive'] = data['Reviews'].progress_apply(lambda x: np.array(predict_zero_shot(x, classes, model, tokenizer)).argmax())
data['is_positive'] = data['is_positive'].astype(int)

Потрясающе полезная и интересная лекция
[8.2251843e-04 9.9917746e-01]


Processing: 100%|██████████| 1281/1281 [01:15<00:00, 16.98it/s]


In [6]:
data['is_relevant'] = data['is_relevant'] * data['Reviews'].apply(lambda x: len(x.split()) >= 8)

In [7]:
file_path = 'comments_classes.csv'

# Сохраняем DataFrame в CSV файл
data.to_csv(file_path, index=False)

In [8]:
data['is_positive'].value_counts()

is_positive
1    663
0    618
Name: count, dtype: int64

In [9]:
data['object'].value_counts()

object
1    655
2    438
0    188
Name: count, dtype: int64

In [10]:
data['is_relevant'].value_counts()

is_relevant
1    686
0    595
Name: count, dtype: int64

In [11]:
data = data[data['object'] != 1]

In [12]:
data['object'].value_counts()

object
2    438
0    188
Name: count, dtype: int64

In [13]:
data['is_relevant'].value_counts()

is_relevant
1    363
0    263
Name: count, dtype: int64

In [14]:
data['is_positive'].value_counts()

is_positive
1    349
0    277
Name: count, dtype: int64

In [15]:
df2 = pd.read_csv("stepik_data.csv")
all_data = pd.concat([df2, data], axis=0)

In [16]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6345 entries, 0 to 1280
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Reviews      6345 non-null   object
 1   is_relevant  6345 non-null   int64 
 2   is_positive  6345 non-null   int64 
 3   object       6345 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 247.9+ KB


In [17]:
all_data.tail()

Unnamed: 0,Reviews,is_relevant,is_positive,object
1274,Спасибо за лекцию! Кстати было бы интересно по...,1,1,0
1276,Здравствуйте! Спасибо за замечательную лекцию ...,1,1,0
1277,1:14:07 действительно ли scoped_lock тяжелее l...,1,0,0
1278,https://youtu.be/xTpAJWe7ZD4?t=4484 Константин...,1,1,0
1280,"очень интересно, спасибо огромное за материал!...",0,1,0


In [18]:
all_data.to_csv("all_data.csv")

In [19]:
tr_data = pd.read_csv("train_data.csv")
tr_data['Reviews'] = tr_data['question_2'] + tr_data['question_3'] + tr_data['question_4'] + tr_data['question_5']
tr_data = tr_data.drop(columns=['timestamp', 'question_1', 'question_2', 'question_3', 'question_4', 'question_5'])

In [21]:
all_all_data = pd.concat([all_data, tr_data], axis=0)
all_all_data.to_csv("finall_data.csv")