In [1]:
import pandas as pd

In [2]:
# Single place to repoint the notebook at another copy of the dataset.
DATA_DIR = "/Users/shu/training_python_datascince/kaggle_sentimental_analysis/Sentiment_dataset"

# One CSV per topic; the checks below show each has a `Text` and a `Label` column.
education = pd.read_csv(f"{DATA_DIR}/Education.csv")
finance = pd.read_csv(f"{DATA_DIR}/Finance.csv")
politics = pd.read_csv(f"{DATA_DIR}/Politics.csv")
sports = pd.read_csv(f"{DATA_DIR}/Sports.csv")

In [3]:
# Class balance and per-column missing-value flags for the education subset.
print(education['Label'].value_counts(), '\n\n', education.isna().any())

positive    26
negative    26
Name: Label, dtype: int64 

 Text     False
Label    False
dtype: bool


In [4]:
# Class balance and per-column missing-value flags for the finance subset.
print(finance['Label'].value_counts(), '\n\n', finance.isna().any())

positive    34
negative    14
Name: Label, dtype: int64 

 Text     False
Label    False
dtype: bool


In [5]:
# Class balance and per-column missing-value flags for the politics subset.
print(politics['Label'].value_counts(), '\n\n', politics.isna().any())

negative    28
positive    25
Name: Label, dtype: int64 

 Text     False
Label    False
dtype: bool


In [6]:
# Class balance and per-column missing-value flags for the sports subset.
print(sports['Label'].value_counts(), '\n\n', sports.isna().any())

positive    28
negative    28
Name: Label, dtype: int64 

 Text     False
Label    False
dtype: bool


In [7]:
# Stack the four topic subsets into one frame; each subset keeps its own row index.
topic_frames = [education, politics, sports, finance]
df = pd.concat(topic_frames)

In [8]:
# Shuffle the rows. The fixed seed keeps the order (and everything derived
# from it downstream) identical across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [9]:
# Summarize entry count, column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 209 entries, 25 to 46
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    209 non-null    object
 1   Label   209 non-null    object
dtypes: object(2)
memory usage: 4.9+ KB


In [10]:
# True if at least one row repeats an earlier row across all columns.
df.duplicated().any()

False

### Обучаем модель с помощью torch. Подход несильно отличается от предыдущей модели

In [11]:
import torch
import evaluate
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler


In [12]:
# Preview the first rows of the combined, shuffled frame.
df.head()

Unnamed: 0,Text,Label
25,Political campaigns rely on fear-mongering tac...,negative
41,Financial engineering techniques allow firms t...,positive
19,The media plays a crucial role in holding poli...,positive
0,The financial markets are influenced by a myri...,positive
15,The impact of political decisions extends far ...,positive


In [13]:
# Our target: encode the sentiment label as an integer (positive -> 1, otherwise 0).
# The dtype guard makes the cell idempotent: comparing already-encoded integers
# with 'positive' would silently turn every label into 0 on a second run.
if not pd.api.types.is_integer_dtype(df['Label']):
    df['Label'] = (df['Label'] == 'positive').astype('int64')


In [14]:
# Preview the frame again to confirm the Label column now holds the integer encoding.
df.head()

Unnamed: 0,Text,Label
25,Political campaigns rely on fear-mongering tac...,0
41,Financial engineering techniques allow firms t...,1
19,The media plays a crucial role in holding poli...,1
0,The financial markets are influenced by a myri...,1
15,The impact of political decisions extends far ...,1


In [15]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# reproducible, and stratifying on the label keeps the class ratio the same
# in both parts (the evaluation set is small, so an unlucky split would skew F1).
train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Label']
)

# Convert the pandas frames into Hugging Face Datasets.
train = Dataset.from_pandas(train_df)
test = Dataset.from_pandas(test_df)


In [16]:
# Text preprocessing: load the tokenizer that belongs to the pretrained checkpoint.
# NOTE(review): the checkpoint name suggests a Russian toxicity classifier, while the
# rows previewed in this notebook are English sentiment text — confirm this is the
# intended base model.
tokenizer_checkpoint = 'SkolkovoInstitute/russian_toxicity_classifier'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


In [17]:
def tokenize_func(examples):
    """Tokenize the ``Text`` field of ``examples`` with the notebook-level ``tokenizer``.

    The call requests truncation and ``'max_length'`` padding and returns
    whatever the tokenizer produces for that input.
    """
    texts = examples['Text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

def ds_preproc(ds):
    """Tokenize a dataset and shape it for a PyTorch training loop.

    Steps: tokenize the ``Text`` column via ``tokenize_func``, drop the raw
    text and the pandas index helper column, rename ``Label`` to ``labels``
    and switch the output format to torch tensors.

    Args:
        ds: a Hugging Face ``Dataset`` with ``Text`` and ``Label`` columns.

    Returns:
        The processed dataset.
    """
    # Batched mapping hands the tokenizer many texts per call instead of one.
    ds = ds.map(tokenize_func, batched=True)

    # '__index_level_0__' is only present when the source DataFrame carried a
    # non-default index, so drop just the helper columns that actually exist
    # rather than failing on a missing one.
    drop_cols = [col for col in ('Text', '__index_level_0__') if col in ds.column_names]
    ds = ds.remove_columns(drop_cols)

    ds = ds.rename_column('Label', 'labels')
    ds.set_format('torch')
    return ds


In [18]:
# Run the same preprocessing over both splits (train first, then test).
tokenized_train, tokenized_test = ds_preproc(train), ds_preproc(test)


Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

In [19]:
# Build the data loaders: batches of 8, with shuffling enabled for training only.
train_dataloader = DataLoader(dataset=tokenized_train, batch_size=8, shuffle=True)
test_dataloader = DataLoader(dataset=tokenized_test, batch_size=8)


In [20]:
# Load the pretrained sequence classifier and set the number of output classes to 2.
model = AutoModelForSequenceClassification.from_pretrained(
    'SkolkovoInstitute/russian_toxicity_classifier', num_labels=2
)


In [21]:
# Training length: the scheduler is stepped once per batch, so the total number
# of steps is epochs * batches-per-epoch.
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=5e-6)

# 'linear' learning-rate schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# No device placement is done here. If the model is ever moved to an accelerator,
# every batch in the training/evaluation loop must be moved to the same device.


In [23]:
# Run the training / evaluation loop.

# Load the F1 metric once: compute() clears the batches accumulated so far,
# so a single instance can be reused every epoch instead of being reloaded.
metric = evaluate.load('f1')

for epoch in tqdm(range(num_epochs)):

    # ... training pass
    model.train()

    for batch in tqdm(train_dataloader, leave=False):
        # Batches are fed as-is; if the model lives on an accelerator,
        # move the batch tensors to the same device here.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

    # ... evaluation pass
    model.eval()

    for batch in tqdm(test_dataloader, leave=False):
        # Gradients are not needed for scoring the held-out split.
        with torch.no_grad():
            outputs = model(**batch)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)

        metric.add_batch(predictions=predictions, references=batch['labels'])

    print(f'epoch {epoch} -', metric.compute())


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

epoch 0 - {'f1': 0.6779661016949152}


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

epoch 1 - {'f1': 0.43902439024390244}


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

epoch 2 - {'f1': 0.5714285714285714}


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

epoch 3 - {'f1': 0.5833333333333334}


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

epoch 4 - {'f1': 0.5833333333333334}
