<a href="https://colab.research.google.com/github/Sergey-Kit/itmo_dl_nlp_course/blob/hw_4/itmo_dl_nlp_course_dz_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Подготовка текста под задачу классификации

##### 1. Установка зависимостей

In [48]:
import numpy as np
import pandas as pd
import re
import os
import gdown
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import csv

In [49]:
pd.set_option('max_colwidth', 100)

In [50]:
RANDOM_STATE = 42
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x79c47404ec50>

##### 2. Загрузка датасета

In [61]:
folder_path = "/content/archive"
os.makedirs(folder_path, exist_ok=True)

In [62]:
true_csv_url = "https://drive.google.com/uc?id=1e4FyQH7iKOZ4HStxnHyZGc9bObcvCUql"
fake_csv_url = "https://drive.google.com/uc?id=1sLsgqklL7ULXOqb3R4pv7XASEay7t-i2"
folder_path = "/content/archive"
gdown.download(true_csv_url, f"{folder_path}/True.csv", quiet=False)
gdown.download(fake_csv_url, f"{folder_path}/Fake.csv", quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1e4FyQH7iKOZ4HStxnHyZGc9bObcvCUql
To: /content/archive/True.csv
100%|██████████| 53.6M/53.6M [00:00<00:00, 183MB/s]
Downloading...
From: https://drive.google.com/uc?id=1sLsgqklL7ULXOqb3R4pv7XASEay7t-i2
To: /content/archive/Fake.csv
100%|██████████| 62.8M/62.8M [00:00<00:00, 216MB/s]


'/content/archive/Fake.csv'

In [63]:
true_data = pd.read_csv('archive/True.csv')
true_data['text'] = true_data['text'].str.split('-', n=1).str[1].str.strip()
with open('archive/Fake.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    false_data = pd.DataFrame(reader)
false_data['label'] = 0
true_data['label'] = 1
data = pd.concat([false_data, true_data], ignore_index=True)
data['full_text'] = data['title'] + ' ' + data['text']
data = data.drop(columns=['title', 'text', 'subject', 'date'])

##### 3. Чистим данные

In [64]:
len(data['full_text'])

44898

In [65]:
%%time
def clean_and_tokenize(text):
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    return tokens

data['tokenized_text'] = data['full_text'].apply(clean_and_tokenize)

CPU times: user 1min 13s, sys: 1.81 s, total: 1min 15s
Wall time: 1min 16s


In [66]:
data.sample(5)

Unnamed: 0,label,full_text,tokenized_text
13285,0,"INSANE MAN IS SCALING The Trump Tower Right Now…Has A Message For Trump [Video] Haha! No, it s n...","[insane, man, is, scaling, the, trump, tower, right, nowhas, a, message, for, trump, video, haha..."
20794,0,LIST OF U.S. STATES With Most Illegal Aliens Wow these are some pretty eye-opening statistics Re...,"[list, of, us, states, with, most, illegal, aliens, wow, these, are, some, pretty, eyeopening, s..."
31565,1,White House presses Congress on bill allowing 9/11 families to sue Saudi Arabia The White House ...,"[white, house, presses, congress, on, bill, allowing, 911, families, to, sue, saudi, arabia, the..."
38494,1,"Hariri indicates possibility of revoking his resignation Saad al-Hariri, who announced on Nov. 4...","[hariri, indicates, possibility, of, revoking, his, resignation, saad, alhariri, who, announced,..."
17669,0,JUST IN: TRUMP’S LONGTIME Bodyguard Tells Congress What REALLY Happened When Russians Offered To...,"[just, in, trumps, longtime, bodyguard, tells, congress, what, really, happened, when, russians,..."


##### 4. Делим выборку

In [67]:
X_train, X_test, y_train, y_test = train_test_split(data['tokenized_text'],
                                                    data['label'],
                                                    random_state=RANDOM_STATE,
                                                    test_size=0.20
                                                    )

##### 5. Получаем эмбендинги для обучения NN

In [9]:
word2vec_model = Word2Vec(sentences=X_train,
                          vector_size=100,
                          window=5,
                          min_count=1,
                          workers=1
                          )

In [10]:
def text_to_sequence(tokenized_text, max_length):
    sequence = np.zeros((max_length, 100))
    for i, word in enumerate(tokenized_text[:max_length]):
        if word in word2vec_model.wv:
            sequence[i] = word2vec_model.wv[word]
    return sequence

max_length = 100
X_train_seq = np.array([text_to_sequence(text, max_length) for text in X_train])
X_test_seq = np.array([text_to_sequence(text, max_length) for text in X_test])

y_train_seq = y_train.values
y_test_seq = y_test.values

## Построение моделей

##### 1. Получаем даталоадер

In [13]:
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_seq, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

##### 2. CNN

In [15]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(100, 128, 5)
        self.pool = nn.MaxPool1d(5)
        self.fc1 = nn.Linear(128 * ((max_length - 5 + 1) // 5), 1)

    def forward(self, x):
        x = self.conv1(x)
        x = torch.relu(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = torch.sigmoid(x)
        return x


model_cnn = CNN()
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001)
criterion = nn.BCELoss()

epochs = 5

for epoch in range(epochs):
    epoch_loss = 0
    for batch_X, batch_y in tqdm(train_dataloader,
                                 desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        outputs = model_cnn(batch_X.permute(0, 2, 1))
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f'Avg Loss: {epoch_loss / len(train_dataloader)}')

Epoch 1/5: 100%|██████████| 900/900 [00:32<00:00, 27.42it/s]


Avg Loss: 0.08647220988227572


Epoch 2/5: 100%|██████████| 900/900 [00:41<00:00, 21.72it/s]


Avg Loss: 0.026611947608010573


Epoch 3/5: 100%|██████████| 900/900 [00:25<00:00, 35.38it/s]


Avg Loss: 0.008401938945575643


Epoch 4/5: 100%|██████████| 900/900 [00:25<00:00, 35.54it/s]


Avg Loss: 0.009696940349521064


Epoch 5/5: 100%|██████████| 900/900 [00:29<00:00, 30.65it/s]

Avg Loss: 0.012770473037081264





##### 3. LSTM

In [16]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(100, 128, batch_first=True)
        self.fc1 = nn.Linear(128, 1)

    def forward(self, x):
        h_0 = torch.zeros(1, x.size(0), 128)
        c_0 = torch.zeros(1, x.size(0), 128)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc1(out[:, -1, :])
        out = torch.sigmoid(out)
        return out


model_lstm = LSTM()
optimizer = optim.Adam(model_lstm.parameters(), lr=0.001)
criterion = nn.BCELoss()

epochs = 5

for epoch in range(epochs):
    epoch_loss = 0
    for batch_X, batch_y in tqdm(train_dataloader,
                                 desc=f"Epoch {epoch + 1}/{epochs}"):
        optimizer.zero_grad()
        outputs = model_lstm(batch_X)
        loss = criterion(outputs.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    print(f'Avg Loss: {epoch_loss / len(train_dataloader)}')

Epoch 1/5: 100%|██████████| 900/900 [01:19<00:00, 11.37it/s]


Avg Loss: 0.27754983050955667


Epoch 2/5: 100%|██████████| 900/900 [01:35<00:00,  9.44it/s]


Avg Loss: 0.09618883562222537


Epoch 3/5: 100%|██████████| 900/900 [01:19<00:00, 11.30it/s]


Avg Loss: 0.056765588032495644


Epoch 4/5: 100%|██████████| 900/900 [01:18<00:00, 11.48it/s]


Avg Loss: 0.04363251645751815


Epoch 5/5: 100%|██████████| 900/900 [01:32<00:00,  9.75it/s]

Avg Loss: 0.03511797357444367





##### 4. Рассчет

In [17]:
def evaluate_model(model, X_test_tensor, y_test_tensor):
    with torch.no_grad():
        outputs = model(X_test_tensor.permute(0, 2, 1) if
                        isinstance(model, CNN) else X_test_tensor)
        predictions = (outputs.squeeze() > 0.5).float()
    return f1_score(y_test_tensor, predictions)


f1_cnn = evaluate_model(model_cnn, X_test_tensor, y_test_tensor)
print(f"CNN - F1: {f1_cnn}")
f1_lstm = evaluate_model(model_lstm, X_test_tensor, y_test_tensor)
print(f"LSTM - F1: {f1_lstm}")

CNN - F1: 0.9862566969485208
LSTM - F1: 0.9859351389050331
