# Task

https://github.com/netology-ds-team/nlp-homeworks/tree/main/7_Classification_in_AOT

Используя ноутбук занятия (также размещен в папке Materials) и данные fakenews, 3 раза разными способами получить на задаче классификации значение f1 выше 0.91 для методов на sklearn и выше 0.52 для методов на pytorch.

# Data Load

In [None]:
# Download the training CSV into the working directory.
# NOTE(review): when Constraint_Train.csv already exists, wget stores the new download under a
# numbered name (the saved log shows `Constraint_Train.csv.2`), while the loading cell reads
# the un-numbered file — consider `wget -nc` or `-O Constraint_Train.csv`.
!wget https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv

--2022-09-13 07:01:08--  https://raw.githubusercontent.com/diptamath/covid_fake_news/main/data/Constraint_Train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1253562 (1.2M) [text/plain]
Saving to: ‘Constraint_Train.csv.2’


2022-09-13 07:01:08 (111 MB/s) - ‘Constraint_Train.csv.2’ saved [1253562/1253562]



In [None]:
import pandas as pd

# Load the downloaded training split and preview its first rows.
df = pd.read_csv('Constraint_Train.csv')
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


# Data preparation

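Токенизируем корпус. Список стоп-слов и пунктуации (`noise`) формируется, но фильтрация по нему отключена: все токены сохраняются как есть.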

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from tqdm import tqdm

# Stop words plus punctuation marks. The list is built here but is NOT applied
# to the token stream below: every token is kept.
noise = [*stopwords.words('english'), *punctuation]

# One lower-cased token list per tweet, in the original row order.
sentences = [list(word_tokenize(tweet.lower())) for tweet in df['tweet']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Peek at the first five tokens of the first tokenized tweet.
sentences[0][:5]

['the', 'cdc', 'currently', 'reports', '99031']

In [None]:
from gensim.models.word2vec import Word2Vec
# Train 300-dimensional Word2Vec embeddings on the tokenized tweets; %time reports the cost.
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names (gensim 4 uses
# `vector_size=` / `epochs=`) — confirm the installed gensim version before re-running.
%time model_tweets = Word2Vec(sentences, workers=4, size=300, min_count=3, window=5, iter=15)

CPU times: user 8.82 s, sys: 909 ms, total: 9.73 s
Wall time: 5.97 s


In [None]:
# from gensim.models import FastText

# model_tweets = FastText(size=500, window=3, min_count=1, sentences=sentences, iter=10)

In [None]:
# Sanity check: nearest neighbours of "france" in the learned embedding space.
model_tweets.wv.most_similar('france')

[('2015', 0.9495022296905518),
 ('tower', 0.9424166679382324),
 ('corpses', 0.9334820508956909),
 ('migrants', 0.9311167597770691),
 ('aid', 0.9284852147102356),
 ('impeachment', 0.9263078570365906),
 ('front', 0.9258139133453369),
 ('section', 0.9253641963005066),
 ('rai', 0.9251426458358765),
 ('chicken', 0.9246896505355835)]

In [None]:
# NOTE(review): presumably precomputes L2-normalised vectors (gensim 3.x behaviour); the raw
# vectors read through `model_tweets.wv[word]` later look unaffected — confirm, and note that
# gensim 4 deprecates this call.
model_tweets.init_sims()

In [None]:
import numpy as np

In [None]:
def get_text_embedding(text, wv=None, tokenizer=None):
    """Embed a text as the sum of the word vectors of its in-vocabulary tokens.

    Args:
        text: Raw text; it is lower-cased before tokenization.
        wv: Word-vector lookup supporting ``word in wv``, ``wv[word]`` and a
            ``vector_size`` attribute. Defaults to ``model_tweets.wv``.
        tokenizer: Callable mapping a string to an iterable of tokens.
            Defaults to ``word_tokenize``.

    Returns:
        A 1-D array of length ``wv.vector_size``: the element-wise sum of the
        vectors of all tokens found in ``wv``, or all zeros when none is found.
    """
    if wv is None:
        wv = model_tweets.wv
    if tokenizer is None:
        tokenizer = word_tokenize

    vectors = [wv[word] for word in tokenizer(text.lower()) if word in wv]
    if not vectors:
        # Size the fallback from the model itself instead of a hard-coded 300,
        # so it cannot drift from the embedding dimensionality.
        return np.zeros(wv.vector_size)
    return np.sum(vectors, axis=0)

In [None]:
# Build one fixed-size feature vector per tweet (tqdm shows progress).
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:02<00:00, 2397.09it/s]


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the tweets for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.33, random_state=42)

# Model LR

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# With the default iteration cap the solver stopped before converging on these
# unscaled embedding features (scikit-learn reported "TOTAL NO. of ITERATIONS
# REACHED LIMIT"), so give it more iterations.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predict labels for the held-out tweets with the logistic regression model.
predicted = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.89      0.91      0.90      1004
        real       0.92      0.90      0.91      1115

    accuracy                           0.91      2119
   macro avg       0.91      0.91      0.91      2119
weighted avg       0.91      0.91      0.91      2119



# Model RF

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters and a fixed seed, fitted on the training split.
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [None]:
# Predict labels for the held-out tweets with the current `clf`.
predicted = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the current `predicted` labels.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.93      0.90      0.92      1004
        real       0.92      0.94      0.93      1115

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



# Model LGBM

In [None]:
from lightgbm import LGBMClassifier

# LightGBM classifier with default hyper-parameters and a fixed seed; rebinds `clf`.
clf = LGBMClassifier(random_state=0)
clf.fit(X_train, y_train)

LGBMClassifier(random_state=0)

In [None]:
# Predict labels for the held-out tweets with the current `clf`.
predicted = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the current `predicted` labels.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.94      0.92      0.93      1004
        real       0.93      0.95      0.94      1115

    accuracy                           0.93      2119
   macro avg       0.93      0.93      0.93      2119
weighted avg       0.93      0.93      0.93      2119



# Model SVM

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features, then fit an SVC; the pipeline applies the same scaling at predict time.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', random_state=0))
clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', random_state=0))])

In [None]:
# Predict labels for the held-out tweets with the current `clf`.
predicted = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the current `predicted` labels.
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.92      0.91      0.91      1004
        real       0.92      0.93      0.92      1115

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



# Torch model

In [None]:
# Binary targets as a plain list: 1 where the label is 'real', 0 otherwise.
labels = (df.label == 'real').astype(int).to_list()

Нужно заранее задать размер для максимальной длины предложений.

In [None]:
# Lower-case and tokenize every tweet, then report the longest token count.
token_lists = list(map(lambda text: word_tokenize(text.lower()), df.tweet))
max_len = max(len(tokens) for tokens in token_lists)
max_len

1592

In [None]:
def get_word_embedding(tokens, max_len, wv=None):
    """Turn a token list into a fixed-length sequence of word vectors.

    Args:
        tokens: Sequence of (already lower-cased) tokens of one text.
        max_len: Number of positions in the output; longer inputs are
            truncated, shorter ones are padded with zero vectors at the end.
        wv: Word-vector lookup supporting ``word in wv``, ``wv[word]`` and a
            ``vector_size`` attribute. Defaults to ``model_tweets.wv``.

    Returns:
        A list of exactly ``max_len`` 1-D arrays of length ``wv.vector_size``.
        Out-of-vocabulary tokens and padding positions are zero vectors.
    """
    if wv is None:
        wv = model_tweets.wv
    # Take the dimensionality from the model instead of a hard-coded 300.
    dim = wv.vector_size

    result = []
    for i in range(max_len):
        if i < len(tokens) and tokens[i] in wv:
            result.append(wv[tokens[i]])
        else:
            # Either padding past the end of the text or an unknown token.
            result.append(np.zeros(dim))
    return result

In [None]:
# Override the corpus-derived max_len with a fixed cap of 300 positions per tweet:
# get_word_embedding emits exactly max_len vectors, so longer tweets are truncated
# and shorter ones are zero-padded. Rebinds `features` to per-token sequences.
max_len = 300
features = [get_word_embedding(text, max_len=max_len) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:03<00:00, 1721.29it/s]


In [None]:
# Same 67/33 split and seed as before, now over the sequence features and the 0/1 labels.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class Net(nn.Module):
    """Single-layer LSTM followed by a linear + sigmoid binary classifier.

    Args:
        input_size: Dimensionality of each input word vector (default 300).
        hidden_size: Size of the LSTM hidden/cell state (default 200).
    """

    def __init__(self, input_size=300, hidden_size=200):
        super().__init__()
        # No `dropout=` here: nn.LSTM applies dropout only between stacked
        # layers, so on a single-layer LSTM it has no effect and only
        # triggers a UserWarning.
        self.lstm1 = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the probability of class 1 for every sequence in the batch.

        Args:
            x: Float tensor of shape (batch, seq_len, input_size).

        Returns:
            Tensor of shape (1, batch, 1) with values in (0, 1).
        """
        # nn.LSTM expects (seq_len, batch, input_size) by default.
        # It returns (output, (h_n, c_n)): h_n is the final hidden state and
        # c_n the final cell state. The classifier reads the cell state.
        _, (_hidden_state, cell_state) = self.lstm1(x.transpose(0, 1))
        return torch.sigmoid(self.out(cell_state))


# Instantiate the network on the selected device and print its layer summary.
net = Net().to(device)
print(net)

Net(
  (lstm1): LSTM(300, 200, dropout=0.1)
  (out): Linear(in_features=200, out_features=1, bias=True)
)


  "num_layers={}".format(dropout, num_layers))


In [None]:
# Move the training features and targets to the device as float32 tensors, then show their shapes.
# NOTE(review): X_train is a Python list of per-token arrays, so torch.tensor() presumably
# converts it element by element — consider stacking into one NumPy array first if this is slow.
in_data = torch.tensor(X_train).float().to(device)
targets = torch.tensor(y_train).float().to(device)
in_data.shape, targets.shape

(torch.Size([4301, 300, 300]), torch.Size([4301]))

In [None]:
# Adam over all network parameters; BCELoss expects probabilities, which matches the
# sigmoid applied inside Net.forward.
optimizer = optim.Adam(net.parameters(), lr=3e-4)
criterion = nn.BCELoss()

In [None]:
# Held-out features and targets as float32 tensors on the same device as the model.
in_data_test = torch.tensor(X_test).float().to(device)
targets_test = torch.tensor(y_test).float().to(device)

In [None]:
epochs = 10
batch_size = 64

# Mini-batch training over the training tensors in their fixed order.
for epoch in range(1, epochs + 1):
    running_loss = .0
    running_corrects = 0
    net.train()
    for i in range(0, in_data.shape[0], batch_size):
        batch_x = in_data[i:i + batch_size]
        batch_y = targets[i:i + batch_size]
        optimizer.zero_grad()
        # reshape(-1) rather than squeeze(): a final batch holding a single sample
        # must stay 1-D so its shape still matches batch_y.
        preds = net(batch_x).reshape(-1)
        loss = criterion(preds, batch_y)
        loss.backward()
        optimizer.step()
        # BCELoss averages over the batch, so weight by batch size to get a per-sample sum.
        running_loss += loss.item() * batch_y.shape[0]
        running_corrects += ((preds > 0.5).float() == batch_y).sum().item()
    # Report per-epoch training loss and accuracy so convergence is visible.
    print(f'Epoch {epoch}/{epochs} - '
          f'loss: {running_loss / in_data.shape[0]:.4f} - '
          f'acc: {running_corrects / in_data.shape[0]:.4f}')

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def print_metric(model, X_tst=in_data_test, y_tst=targets_test, batch_size=batch_size):
    """Evaluate a binary classifier batch by batch and print its metrics.

    Args:
        model: Module whose forward pass returns class-1 probabilities
            (one value per sample) for a batch of inputs.
        X_tst: Input tensor, indexed by sample along the first dimension.
        y_tst: Tensor of 0/1 targets aligned with ``X_tst``.
        batch_size: Number of samples per forward pass.

    Returns:
        NumPy array of 0./1. predictions, one per sample, thresholded at 0.5.

    Side effects:
        Switches ``model`` to eval mode and prints accuracy, precision,
        recall and F1.
    """
    n_samples = len(X_tst)
    y_true = np.zeros(n_samples)
    y_pred = np.zeros(n_samples)
    model.eval()
    with torch.no_grad():
        # Iterate over the data that was passed in, not over a global tensor.
        # Slicing clamps at the end, so the short final batch needs no special case.
        for start in range(0, n_samples, batch_size):
            stop = start + batch_size
            # The model already outputs probabilities. Exponentiating them
            # would push every value above the 0.5 threshold and force every
            # prediction to class 1.
            probs = model(X_tst[start:stop])
            y_true[start:stop] = y_tst[start:stop].cpu().numpy()
            y_pred[start:stop] = probs.cpu().numpy().flatten() > 0.5

    print(f'Accuracy: {accuracy_score(y_true, y_pred):.2f}')
    print(f'Precision: {precision_score(y_true, y_pred):.2f}')
    print(f'Recall: {recall_score(y_true, y_pred):.2f}')
    print(f'F1: {f1_score(y_true, y_pred):.2f}')
    return y_pred

# Evaluate the trained network on the held-out tensors (the function's defaults) and keep the predictions.
y_pred = print_metric(net)

Accuracy: 0.53
Precision: 0.53
Recall: 1.00
F1: 0.69


In [None]:
# Sanity check: distinct predicted labels vs. distinct true labels. A single predicted
# value would mean the classifier has collapsed to one class.
np.unique(np.array(y_pred)), np.unique(np.array(y_test))

(array([1.]), array([0, 1]))