In [None]:
import pandas as pd
import nltk
import torch
import numpy as np
import gensim
import torch.nn.functional as F
from nltk import word_tokenize
from collections import Counter
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch import nn
from tqdm.auto import tqdm
from torchmetrics.functional import f1, recall, accuracy

nltk.download('punkt')

In [None]:
!pip install torchmetrics

In [20]:
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

Посмотрим на данные.

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NNmethods/Fake.csv')
df.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
df.value_counts('subject')

subject
News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
dtype: int64

In [None]:
df['subjectnum'] = df.subject.astype('category').cat.codes

Выше мы видим классы новостей. Они, к сожалению, очень неравноценные.

Препроцессинг: приводим тексты к нижнему регистру и токенизируем.

In [None]:
def preprocess(text):
    return ' '.join(word_tokenize(text.lower()))

In [None]:
df['processed'] = df['text'].apply(preprocess)

Уберем из датасета дубли.

In [None]:
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)

Поделим датасет на тестовую и обучающую выборки.

In [None]:
train_data, val_data = train_test_split(df, test_size=0.2, shuffle=True)

Делаем word2id словарь.

In [None]:
vocab = Counter()
for text in df['processed']:
    vocab.update(text.split())
print('уникальных слов:', len(vocab))

уникальных слов: 181384


In [None]:
filtered_vocab = set()
for word in vocab:
    if vocab[word] > 2:
        filtered_vocab.add(word)
print('уникальных слов, втретившихся больше 2 раз:', len(filtered_vocab))

уникальных слов, втретившихся больше 2 раз: 59638


In [None]:
word2id = {'UNK': 0}
for word in filtered_vocab:
    word2id[word] = len(word2id)
id2word = {i:word for word, i in word2id.items()}

W2V модель

In [None]:
! wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-12-29 20:27:20--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 54.231.133.248
Connecting to s3.amazonaws.com (s3.amazonaws.com)|54.231.133.248|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



In [17]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

In [18]:
weights = np.zeros((len(word2id), 300))

for word, i in word2id.items():
    try:
        weights[i] = w2v_model[word]
    except KeyError:
        weights[i] = torch.FloatTensor((300,)).uniform_(-0.25, 0.25)

weights = torch.FloatTensor(weights)

In [19]:
del w2v_model

Датасет

In [21]:
class DataSet(Dataset):
    def __init__(self, dataset, word2id, max_len, device):
        self.dataset = dataset['processed'].values
        self.word2id = word2id
        self.length = dataset.shape[0]
        self.device = device
        self.max_len = max_len
        self.target = torch.Tensor(dataset['subjectnum'].values)
    
    def __len__(self):
        return self.length

    def __getitem__(self, index): 
        tokens = self.dataset[index].split()
        ids = torch.LongTensor([self.word2id[token] if token in self.word2id else self.word2id['UNK'] for token in tokens][:self.max_len])
        y = [self.target[index]]
        return ids, y

    def collate_fn(self, batch):
      ids, y = list(zip(*batch))
      padded_ids = torch.vstack([F.pad(seq, pad=(0, self.max_len - seq.shape[0]), mode='constant', value=0) for seq in ids])
      padded_ids = pad_sequence(ids, batch_first=True).to(self.device)
      y = torch.LongTensor(y).to(self.device)
      return padded_ids, y.T[0]

In [22]:
max_len = max(train_data['processed'].str.split().apply(len))
max_len

8886

Такой max_len - слишком большой для моей gpu... Придется, вопреки статье, взять меньше.

In [23]:
max_len = 500

In [24]:
train_dataset = DataSet(dataset=train_data, word2id=word2id, max_len=max_len, device=DEVICE)
train_sampler = RandomSampler(train_dataset)
train_iterator = DataLoader(train_dataset, collate_fn = train_dataset.collate_fn, sampler=train_sampler, batch_size=128)

In [25]:
val_dataset = DataSet(dataset=val_data, word2id=word2id, max_len=max_len, device=DEVICE)
val_sampler = SequentialSampler(val_dataset)
val_iterator = DataLoader(val_dataset, collate_fn=val_dataset.collate_fn, sampler=val_sampler, batch_size=128)

Модель

In [45]:
class C_LSTM(nn.Module):

    def __init__(self, weights=weights, 
                 vocab_size=len(word2id),
                 embedding_dim=300,
                 max_length=max_len,
                 output_dim=6,
                 filter_sizes=[2, 3, 4],
                 num_filters=150,
                 lstm_layers=1,
                 memory_dim=150,
                 dropout_input=False,
                 dropout_lstm=True,
                 dropout_rate=0.5):

        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding.from_pretrained(weights)
        self.max_length = max_length
        self.filter_sizes = filter_sizes
        self.cnn_layers = len(filter_sizes)
        self.max_features = max_length - max(filter_sizes) + 1
        self.dropout_input = dropout_input
        self.dropout_lstm = dropout_lstm
        self.dropout = nn.Dropout(p=dropout_rate)
        windows = []
        for filter_size in filter_sizes:
            conv = nn.Conv1d(in_channels=embedding_dim,
                             out_channels=num_filters,
                             kernel_size=filter_size, 
                             padding='valid')
            windows.append(conv)
        self.windows = nn.ModuleList(windows)
        self.lstm = nn.LSTM(input_size=len(filter_sizes) * num_filters,
                            hidden_size=memory_dim,
                            num_layers=lstm_layers,
                            batch_first=True)
        self.relu = nn.ReLU()
        self.linnear = nn.Linear(in_features=memory_dim,
                                out_features=output_dim)
        self.softmax = nn.Softmax(dim=1)

def forward(self, word):

        embedded = self.embedding(word)
        embedded = embedded.transpose(1, 2)
        if self.dropout_input:
            embedded = self.dropout(embedded)

        feature_maps = []
        for conv in self.conv_layers:
            feature_map = self.relu(conv(embedded))[:, :, :self.max_features]
            feature_maps.append(feature_map)

        if self.cnn_layers > 1:
            rnn_input = torch.cat(feature_maps, 1).transpose(1, 2)
        else:
            rnn_input = feature_map.transpose(1, 2)

        _, (hidden_state, _) = self.lstm(rnn_input)
        if self.dropout_lstm:
            embedded = self.dropout(hidden_state)
        logits = self.out(self.hidden(torch.squeeze(hidden_state, 0)))

        return logits

In [27]:
def train(model, iterator, optimizer, criterion, metric, n_epoch):

    progress_bar = tqdm(total=len(train_iterator.dataset),
                        desc='Epoch {}'.format(n_epoch + 1))

    epoch_losses = []  
    epoch_metrics = []
    model.train() 

    for i, (texts, ys) in enumerate(iterator):

        optimizer.zero_grad()
        preds = model(texts) 
        loss = criterion(preds, ys)  
        loss.backward() 
        optimizer.step() 
        epoch_losses.append(loss.item())
        batch_metric = metric(preds.argmax(1).long(), ys.long(),
                              average='weighted', num_classes=6,
                              ignore_index=0)
        epoch_metrics.append(batch_metric.cpu().numpy())

        progress_bar.update(texts.shape[0])

    progress_bar.close()
    
    return epoch_losses, epoch_metrics

In [28]:
def evaluate(model, iterator, criterion, n_epoch, metric):

    epoch_losses = []
    epoch_metrics = []
    model.eval()

    with torch.no_grad():

        for i, (texts, ys) in enumerate(iterator):
            preds = model(texts)
            loss = criterion(preds, ys)

            epoch_losses.append(loss.item())
            batch_metric = metric(preds.argmax(1).long(), ys.long(),
                                  average='weighted', num_classes=6,
                                  ignore_index=0)
            epoch_metrics.append(batch_metric.cpu().numpy())

    return epoch_losses, epoch_metrics

In [32]:
def train_and_evaluate(model, train_iterator, val_iterator,
                       optimizer, criterion, metric, epochs):
    train_losses = []
    train_metrics = []
    eval_losses = []
    eval_metrics = []

    for n_epoch in range(epochs):

        print('\nTraining...')
        train_loss, train_metric = train(model, train_iterator,
                                            optimizer, criterion,
                                            metric, n_epoch)
        train_losses.append(train_loss)
        train_metrics.append(train_metric)

        print('\nValidating...\n')
        eval_loss, eval_metric = evaluate(model, val_iterator,
                                             criterion, n_epoch,
                                             metric)
        eval_losses.append(eval_loss)
        eval_metrics.append(eval_metric)

    return train_losses, eval_losses, train_metrics, eval_metrics

In [46]:
c_lstm = C_LSTM()
optimizer = torch.optim.RMSprop(c_lstm.parameters(), lr=0.001,
                                weight_decay=0.0001)
criterion = nn.CrossEntropyLoss()

c_lstm = c_lstm.to(DEVICE)
criterion = criterion.to(DEVICE)

In [48]:
losses_train, losses_eval, metrics_train, metrics_eval = train_and_evaluate(c_lstm, train_iterator, val_iterator, optimizer, criterion, accuracy, epochs=3)


Training...


Epoch 1:   0%|          | 0/18782 [00:00<?, ?it/s]

NotImplementedError: ignored

In [None]:
def plot(train_loss, train_metric, val_loss, val_metric):
    plt.figure(figsize=(8, 6))
    plt.grid()
    
    plt.plot(train_loss)
    plt.plot(train_metric)
    
    plt.title('Training')
    plt.xlabel('Step')
    plt.ylabel('Score')
    
    plt.legend(['loss', 'accuracy'], loc='upper right')
    plt.show()

    plt.figure(figsize=(8, 6))
    plt.grid()
    
    plt.plot(val_loss)
    plt.plot(val_metric)
    plt.title('Validation')
    plt.xlabel('Step')
    plt.ylabel('Score')
    
    
    plt.legend(['loss', 'accuracy'], loc='upper right')
    plt.show()

In [None]:
plot_train_and_eval(losses_train, losses_eval, metrics_train, metrics_eval)