# **Задание**

Используя ноутбук занятия (также размещен в папке Materials) и данные fakenews, 3 раза разными способами получить на задаче классификации значение f1 выше 0.91 для методов на sklearn и выше 0.52 для методов на pytorch.

# Модели на базе библиотеки sklearn.

In [None]:
!wget https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv

--2023-10-05 00:00:25--  https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1253562 (1.2M) [text/plain]
Saving to: ‘Constraint_Train.csv’


2023-10-05 00:00:26 (16.1 MB/s) - ‘Constraint_Train.csv’ saved [1253562/1253562]



In [None]:
import pandas as pd
import numpy as np
import tqdm

In [None]:
df = pd.read_csv('Constraint_Train.csv')

In [None]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [None]:
df.shape

(6420, 3)

Выполним предобработку текстов новостей.

In [None]:
import re

In [None]:
# Функция предобработки текстов датафрейма:

def cleaning(text):

  text_ = re.sub(r'http\S+', '', text) # удаление ссылок
  text_ = re.sub('-', ' ', text_) # замена дефисов на пробелы
  text_ = text_.lower() # приведение к нижнему регистру
  text_ = re.sub('\s+', ' ', text_) # удаление лишних пробельных символов
  text_ = re.sub('[^a-zA-Z\s]', '', text_) # удаление всего, что не является токенами из латинских букв, в т.ч. знаков пунктуации

  return text_

In [None]:
clean_texts = [cleaning(text) for text in df.tweet]

In [None]:
clean_texts[:3]

['the cdc currently reports  deaths in general the discrepancies in death counts between different sources are small and explicable the death toll stands at roughly  people today',
 'states reported  deaths a small rise from last tuesday southern states reported  of those deaths ',
 'politically correct woman almost uses pandemic as excuse not to reuse plastic bag coronavirus nashville']

In [None]:
# Обновленный датафрейм:
df_clean = pd.DataFrame({'tweet':clean_texts, 'label':df.label})

In [None]:
df_clean.head()

Unnamed: 0,tweet,label
0,the cdc currently reports deaths in general t...,real
1,states reported deaths a small rise from last...,real
2,politically correct woman almost uses pandemic...,fake
3,indiafightscorona we have covid testing labor...,real
4,populous states can generate large case counts...,real


In [None]:
df_clean.label.unique()

array(['real', 'fake'], dtype=object)

In [None]:
# Замена строчных значений таргета на числовые:
df_clean['label'] = df_clean['label'].replace('real', 1)
df_clean['label'] = df_clean['label'].replace('fake', 0)

## Модель №1: CountVectorizer + LogisticRegression.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# На тестовое множество отведем 20% записей датафрейма:
X_train, X_test, y_train, y_test = train_test_split(df_clean['tweet'], df_clean['label'], test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score

In [None]:
count_vect_1 = CountVectorizer(ngram_range=(1, 1))
bow_1 = count_vect_1.fit_transform(X_train)

In [None]:
%%time
model_1 = LogisticRegression(max_iter=500, random_state=42)
model_1.fit(bow_1, y_train)

CPU times: user 269 ms, sys: 283 ms, total: 552 ms
Wall time: 297 ms


In [None]:
pred_1 = model_1.predict(count_vect_1.transform(X_test))

In [None]:
# accuracy_1 = accuracy_score(y_test, pred_1).round(4)
f1_1 = f1_score(y_test, pred_1).round(4)
# accuracy_1,
f1_1

0.9196

## Модель №2: TfidfVectorizer + SGDClassifier.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier

In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
tfidf_vect = TfidfVectorizer(analyzer='word', tokenizer=word_tokenize, stop_words='english')

In [None]:
X_tfidf = tfidf_vect.fit_transform(df_clean['tweet'])



In [None]:
X_tfidf.shape

(6420, 12485)

In [None]:
X_train_tfidf, X_test_tfidf = train_test_split(X_tfidf, test_size=0.2, random_state=42)

In [None]:
%%time

model_2 = SGDClassifier()
model_2.fit(X_train_tfidf, y_train)

CPU times: user 17.5 ms, sys: 9.09 ms, total: 26.6 ms
Wall time: 17.5 ms


In [None]:
pred_2 = model_2.predict(X_test_tfidf)

In [None]:
f1_2 = f1_score(y_test, pred_2).round(4)
f1_2

0.9236

## Модель №3: лемматизация + TfidfVectorizer + RidgeClassifier

In [None]:
from sklearn.linear_model import RidgeClassifier

Лемматизацию выполним с помощью библиотеки spacy.

In [None]:
!pip install spaCy



In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# Формирование списка текстов новостей после выполнения лемматизации входящих в тексты слов:
clean_texts_3 = []

for text in tqdm(clean_texts):
  lem_text = nlp(text)
  new_lem_text = ' '.join([token.lemma_ for token in lem_text])
  new_lem_text = re.sub('\s+', ' ', new_lem_text) # удаление лишних пробельных символов
  clean_texts_3.append(new_lem_text)

100%|██████████| 6420/6420 [01:16<00:00, 83.95it/s] 


In [None]:
clean_texts_3[:3]

['the cdc currently report death in general the discrepancy in death count between different source be small and explicable the death toll stand at roughly people today',
 'state report death a small rise from last tuesday southern state report of those death',
 'politically correct woman almost use pandemic as excuse not to reuse plastic bag coronaviru nashville']

In [None]:
# Обновленный датафрейм для модели №3:
df_clean_3 = pd.DataFrame({'tweet':clean_texts_3, 'label':df_clean.label})

In [None]:
df_clean_3.head()

Unnamed: 0,tweet,label
0,the cdc currently report death in general the ...,1
1,state report death a small rise from last tues...,1
2,politically correct woman almost use pandemic ...,0
3,indiafightscorona we have covid testing labora...,1
4,populous state can generate large case count b...,1


In [None]:
tfidf_vect_3 = TfidfVectorizer(tokenizer=word_tokenize, analyzer='word')

In [None]:
X_3 =  tfidf_vect_3.fit_transform(df_clean_3['tweet'])



In [None]:
X_train_3, X_test_3 = train_test_split(X_3, test_size=0.2, random_state=42)

In [None]:
model_3 = RidgeClassifier(tol=1e-2, solver="sparse_cg", random_state=42)
model_3.fit(X_train_3, y_train)

In [None]:
pred_3 = model_3.predict(X_test_3)

In [None]:
f1_3 = f1_score(y_test, pred_3).round(4)
f1_3

0.9287

Датафрейм с результатами:

In [None]:
df_res = pd.DataFrame({'Model':['CountVectorizer + LogisticRegression', 'TfidfVectorizer + SGDClassifier', 'лемматизация + TfidfVectorizer + RidgeClassifier'],
                       'f1':[f1_1, f1_2, f1_3]})
df_res

Unnamed: 0,Model,f1
0,CountVectorizer + LogisticRegression,0.9196
1,TfidfVectorizer + SGDClassifier,0.9236
2,лемматизация + TfidfVectorizer + RidgeClassifier,0.9287


# Модели на базе библиотеки pytorch.

Для всех моделей эмбеддинги будем формировать из предобработанных текстов (из датафрейма df_clean).

Модель №1 - LSTM.

Изменения:

1) параметры сети LSTM: количество фичей скрытого слоя сети LSTM уменьшено до 50;

2) в качестве оптимизатора выбран Adagrad, в качестве лосс-функции - кроссэнтропия CrossEntropyLoss.

In [None]:
df_clean.head()

Unnamed: 0,tweet,label
0,the cdc currently reports deaths in general t...,real
1,states reported deaths a small rise from last...,real
2,politically correct woman almost uses pandemic...,fake
3,indiafightscorona we have covid testing labor...,real
4,populous states can generate large case counts...,real


In [None]:
labels = (df.label == 'real').astype(int).to_list()

In [None]:
token_lists = [word_tokenize(text) for text in tqdm(df_clean.tweet)]

100%|██████████| 6420/6420 [00:01<00:00, 5375.71it/s]


In [None]:
from gensim.models.word2vec import Word2Vec
model_tweets = Word2Vec(token_lists, workers=4, vector_size=300, min_count=3, window=5)

In [None]:
max_len = len(max(token_lists, key=len))

In [None]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [None]:
features = np.array([get_word_embedding(text, 100) for text in token_lists])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
x = torch.FloatTensor(3, 4)
x.zero_()

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])

In [None]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 50)
        self.out = nn.Linear(50, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
net.cuda()
print(net)

Net(
  (lstm): LSTM(300, 50)
  (out): Linear(in_features=50, out_features=1, bias=True)
)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [None]:
in_data.shape, targets.shape

(torch.Size([5136, 100, 300]), torch.Size([5136]))

In [None]:
optimizer = optim.Adagrad(net.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].cuda()
        batch_y = targets[i:i + batch_size].cuda()

        # batch_x = in_data[i:i + batch_size]
        # batch_y = targets[i:i + batch_size]

        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
net.train()

Net(
  (lstm): LSTM(300, 50)
  (out): Linear(in_features=50, out_features=1, bias=True)
)

In [None]:
for i in range(40):
  train_one_epoch(in_data, targets)

100%|██████████| 321/321 [00:01<00:00, 232.95it/s]


tensor(13.8629, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 271.80it/s]


tensor(13.8629, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 349.20it/s]


tensor(13.8629, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 363.31it/s]


tensor(13.7853, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.99it/s]


tensor(12.0663, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 359.04it/s]


tensor(11.4382, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 360.61it/s]


tensor(11.5015, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 360.74it/s]


tensor(11.4358, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.02it/s]


tensor(11.4302, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 359.31it/s]


tensor(11.4681, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.98it/s]


tensor(11.4039, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 357.70it/s]


tensor(11.4949, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 333.85it/s]


tensor(11.4138, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 273.39it/s]


tensor(11.4100, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 289.13it/s]


tensor(11.3766, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 352.37it/s]


tensor(11.3912, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.52it/s]


tensor(11.3806, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 363.69it/s]


tensor(11.3824, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 361.50it/s]


tensor(11.3815, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 355.48it/s]


tensor(11.3846, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 355.43it/s]


tensor(11.3794, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 352.41it/s]


tensor(11.3747, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 358.19it/s]


tensor(11.3748, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 359.79it/s]


tensor(11.3747, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 366.71it/s]


tensor(11.3817, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 324.62it/s]


tensor(11.3871, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 271.94it/s]


tensor(11.3625, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 296.61it/s]


tensor(11.3657, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 354.15it/s]


tensor(11.3642, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 349.52it/s]


tensor(11.3652, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 360.57it/s]


tensor(11.3634, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 347.35it/s]


tensor(11.3634, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 354.03it/s]


tensor(11.3573, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 348.17it/s]


tensor(11.3569, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.76it/s]


tensor(11.3551, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 356.85it/s]


tensor(11.3547, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 360.25it/s]


tensor(11.3987, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:00<00:00, 361.11it/s]


tensor(11.3509, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 303.59it/s]


tensor(11.3240, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 321/321 [00:01<00:00, 270.22it/s]

tensor(11.3677, device='cuda:0', grad_fn=<DivBackward1>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test.cuda()).reshape(-1)

In [None]:
output

tensor([0.9998, 0.3380, 0.9973,  ..., 0.0353, 0.9987, 0.9399], device='cuda:0')

In [None]:
max(torch.nn.functional.softmax(output))

  max(torch.nn.functional.softmax(output))


tensor(0.0011, device='cuda:0')

In [None]:
targets_test

tensor([1., 0., 1.,  ..., 1., 1., 1.])

Для подсчета метрики f1 воспользуемся библиотекой torcheval.

In [None]:
!pip install torcheval

Collecting torcheval
  Downloading torcheval-0.0.7-py3-none-any.whl (179 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m112.6/179.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.2/179.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torcheval
Successfully installed torcheval-0.0.7


In [None]:
from torcheval.metrics.functional import binary_f1_score

 Метрика f1 модели №1 на pytorch:

In [None]:
input = output
target = targets_test.cuda()

f1_nn1 = binary_f1_score(input, target, threshold=0.5)
f1_nn1

tensor(0.8535, device='cuda:0')

Модель №2 - CNN.

Изменения:

1) эмбеддинги сформированы из предварительно предобработанных текстов (из датафрейма df_clean);

2) сверточная сеть с функцией активации relu и линейным классификатором, обучение 30 эпох.

3) оптимизатор - Adagrad, лосс-функция - кроссэнтропия CrossEntropyLoss.

In [None]:
class CNNModel(nn.Module):
    def __init__(self):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv1d(300, 50, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(50, 1)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)
        features = self.cnn(x)
        prediction = self.cl(features)
        return prediction


net2 = CNNModel()
net2.cuda()
print(net2)

CNNModel(
  (cnn): Sequential(
    (0): Conv1d(300, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): AdaptiveMaxPool1d(output_size=1)
    (4): Flatten(start_dim=1, end_dim=-1)
  )
  (cl): Sequential(
    (0): Linear(in_features=50, out_features=1, bias=True)
  )
)


In [None]:
optimizer = optim.Adagrad(net2.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=15):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].cuda()
        batch_y = targets[i:i + batch_size].cuda()

        # batch_x = in_data[i:i + batch_size]
        # batch_y = targets[i:i + batch_size]

        optimizer.zero_grad()
        output = net2(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
net2.train()

CNNModel(
  (cnn): Sequential(
    (0): Conv1d(300, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): AdaptiveMaxPool1d(output_size=1)
    (4): Flatten(start_dim=1, end_dim=-1)
  )
  (cl): Sequential(
    (0): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [None]:
for i in range(30):
  train_one_epoch(in_data, targets)

100%|██████████| 343/343 [00:02<00:00, 161.93it/s]


tensor(2.4670, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:01<00:00, 332.26it/s]


tensor(2.4354, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 380.00it/s]


tensor(2.4641, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 419.87it/s]


tensor(2.4415, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 422.49it/s]


tensor(2.4430, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 424.62it/s]


tensor(2.5103, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 423.34it/s]


tensor(2.4503, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 408.90it/s]


tensor(2.3857, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 424.74it/s]


tensor(2.4032, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 424.63it/s]


tensor(2.3944, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 423.32it/s]


tensor(2.4061, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 425.94it/s]


tensor(2.3930, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 427.83it/s]


tensor(2.3833, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 428.68it/s]


tensor(2.3953, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 383.43it/s]


tensor(2.4102, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 347.12it/s]


tensor(2.4158, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 358.09it/s]


tensor(2.4128, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 420.92it/s]


tensor(2.3889, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 429.33it/s]


tensor(2.3891, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 403.17it/s]


tensor(2.3825, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 345.87it/s]


tensor(2.3607, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 426.21it/s]


tensor(2.3659, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 422.03it/s]


tensor(2.3537, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 422.62it/s]


tensor(2.3449, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 416.68it/s]


tensor(2.3409, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 425.70it/s]


tensor(2.3387, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 428.27it/s]


tensor(2.3495, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 424.83it/s]


tensor(2.3639, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:00<00:00, 386.48it/s]


tensor(2.3428, device='cuda:0', grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:01<00:00, 335.85it/s]

tensor(2.3334, device='cuda:0', grad_fn=<DivBackward1>)





In [None]:
with torch.no_grad():
    output2 = net2(in_data_test.cuda()).reshape(-1)

In [None]:
output2

tensor([ 1.9430, -0.0234,  1.4686,  ...,  1.0928,  0.9559,  1.0025],
       device='cuda:0')

In [None]:
targets_test

tensor([1., 0., 1.,  ..., 1., 1., 1.])

Метрика f1 модели №2 на pytorch:

In [None]:
input = output2
target = targets_test.cuda()

f1_nn2 = binary_f1_score(input, target, threshold=0.5)
f1_nn2

tensor(0.8182, device='cuda:0')

Модель №3.

Изменения:

1) эмбеддинги сформированы из предварительно предобработанных текстов (из датафрейма df_clean);

2) CNN (предыдущая модель) с большим количеством слоев, обучение 10 эпох.

3) оптимизатор - Adagrad, лосс-функция - кроссэнтропия CrossEntropyLoss.

In [None]:
class CNNModel3(nn.Module):
    def __init__(self):
        super().__init__()

        self.cnn = nn.Sequential(
            nn.Conv1d(300, 50, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(50),
            nn.ReLU(),
            nn.Conv1d(50, 50, kernel_size=3, padding=1, stride=2),
            nn.BatchNorm1d(50),
            nn.ReLU(),

            nn.AdaptiveMaxPool1d(1),
            nn.Flatten(),
        )
        self.cl = nn.Sequential(
            nn.Linear(50, 1)
        )

    def forward(self, x):
        x = x.permute(0, 2, 1)
        features = self.cnn(x)
        prediction = self.cl(features)
        return prediction


net3 = CNNModel3()

#net3.cuda()
print(net3)

CNNModel3(
  (cnn): Sequential(
    (0): Conv1d(300, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(50, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): AdaptiveMaxPool1d(output_size=1)
    (7): Flatten(start_dim=1, end_dim=-1)
  )
  (cl): Sequential(
    (0): Linear(in_features=50, out_features=1, bias=True)
  )
)


In [None]:
optimizer = optim.Adagrad(net3.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [None]:
net3.train()

CNNModel3(
  (cnn): Sequential(
    (0): Conv1d(300, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (1): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Conv1d(50, 50, kernel_size=(3,), stride=(2,), padding=(1,))
    (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU()
    (6): AdaptiveMaxPool1d(output_size=1)
    (7): Flatten(start_dim=1, end_dim=-1)
  )
  (cl): Sequential(
    (0): Linear(in_features=50, out_features=1, bias=True)
  )
)

In [None]:
def train_one_epoch(in_data, targets, batch_size=15):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        # batch_x = in_data[i:i + batch_size].cuda()
        # batch_y = targets[i:i + batch_size].cuda()

        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]

        optimizer.zero_grad()
        output = net3(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
for i in range(10):
  train_one_epoch(in_data, targets)

100%|██████████| 343/343 [00:06<00:00, 56.81it/s]


tensor(7.9559, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:05<00:00, 68.34it/s]


tensor(7.8925, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:04<00:00, 85.68it/s]


tensor(7.8507, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:03<00:00, 95.84it/s]


tensor(7.7080, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:05<00:00, 68.54it/s]


tensor(7.5515, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:03<00:00, 91.10it/s]


tensor(7.5612, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:03<00:00, 96.74it/s]


tensor(7.4416, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:04<00:00, 79.05it/s]


tensor(7.3804, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:05<00:00, 62.09it/s]


tensor(7.3775, grad_fn=<DivBackward1>)


100%|██████████| 343/343 [00:03<00:00, 97.27it/s]

tensor(7.3147, grad_fn=<DivBackward1>)





In [None]:
with torch.no_grad():
    # output3 = net3(in_data_test.cuda()).reshape(-1)
    output3 = net3(in_data_test).reshape(-1)

In [None]:
output3

tensor([-0.7803,  1.4176,  1.5761,  ..., -0.2520,  0.1729,  2.0088])

In [None]:
targets_test

tensor([1., 0., 1.,  ..., 1., 1., 1.])

Метрика f1 модели №3 на pytorch:

In [None]:
input = output3
#target = targets_test.cuda()
target = targets_test

f1_nn3 = binary_f1_score(input, target, threshold=0.5)
f1_nn3

tensor(0.7892)