The goal of the project is to achieve an F1 score above 0.91 for classification tasks using sklearn methods and above 0.52 for methods using PyTorch, employing three different approaches each.

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('Constraint_Train.csv')

In [None]:
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:02<00:00, 2192.80it/s]


In [None]:
from gensim.models.word2vec import Word2Vec
%time model_tweets = Word2Vec(sentences, workers=4, vector_size=300, min_count=3, window=5, epochs=15)

CPU times: user 11 s, sys: 84.4 ms, total: 11 s
Wall time: 6.82 s


In [None]:
model_tweets.wv.most_similar('vaccine')

[('cure', 0.7684659957885742),
 ('developed', 0.751139223575592),
 ('drug', 0.7404034733772278),
 ('fight', 0.7348445653915405),
 ('scientists', 0.7217168807983398),
 ('pandemic', 0.7215371131896973),
 ('novel', 0.713915228843689),
 ('remedy', 0.7021166086196899),
 ('combat', 0.6999064087867737),
 ('against', 0.6963312029838562)]

In [None]:
model_tweets.init_sims()

  model_tweets.init_sims()


In [None]:
import numpy as np

In [None]:
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.average(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [None]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:03<00:00, 1710.61it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.33)

In [None]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import classification_report

In [None]:
predicted = model.predict(X_test)

In [None]:
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.89      0.90      0.90       977
        real       0.91      0.90      0.91      1142

    accuracy                           0.90      2119
   macro avg       0.90      0.90      0.90      2119
weighted avg       0.90      0.90      0.90      2119



1.CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vec = CountVectorizer()

In [None]:
bow = vec.fit_transform(df.tweet)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.33)
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.92      0.92      0.92      1037
        real       0.93      0.92      0.93      1082

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



2.Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf = TfidfVectorizer()

In [None]:
bow = tfidf.fit_transform(df.tweet)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.33)
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.89      0.93      0.91       994
        real       0.94      0.90      0.92      1125

    accuracy                           0.91      2119
   macro avg       0.91      0.92      0.91      2119
weighted avg       0.92      0.91      0.91      2119



3.Tf-Idf bigrams

In [None]:
tfidf_bi = TfidfVectorizer(ngram_range = (2, 2))

In [None]:
bow = tfidf_bi.fit_transform(df.tweet)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.33)
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.86      0.90      0.88       992
        real       0.91      0.87      0.89      1127

    accuracy                           0.88      2119
   macro avg       0.88      0.88      0.88      2119
weighted avg       0.88      0.88      0.88      2119



3.Tf-Idf n-grams

In [None]:
tfidf_ng = TfidfVectorizer(ngram_range = (1, 3))

In [None]:
bow = tfidf_ng.fit_transform(df.tweet)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(bow, df.label, test_size=0.33)
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.89      0.94      0.91      1003
        real       0.94      0.90      0.92      1116

    accuracy                           0.92      2119
   macro avg       0.92      0.92      0.92      2119
weighted avg       0.92      0.92      0.92      2119



PyTorch + LSTM

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [None]:
token_lists = [word_tokenize(text.lower()) for text in df.tweet]

In [None]:
features = [get_word_embedding(text, 100) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:02<00:00, 2181.55it/s]


In [None]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
labels = (df.label == 'real').astype(int).to_list()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(X_train).float()


In [None]:
in_data.shape

torch.Size([4815, 100, 300])

In [None]:
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].to(device)
        batch_y = targets[i:i + batch_size].to(device)
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
net.to(device)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:25<00:00, 11.76it/s]


tensor(0.6931, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output.cpu() > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.4660436137071651

Let's try a different optimizer.

In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].to(device)
        batch_y = targets[i:i + batch_size].to(device)
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:15<00:00, 19.42it/s]

tensor(53.3333, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.4766355140186916


Let's try to make the sentences longer.

In [None]:
features = [get_word_embedding(text, 200) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:07<00:00, 861.51it/s] 


In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [None]:
in_data.shape

torch.Size([4301, 200, 300])

In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].to(device)
        batch_y = targets[i:i + batch_size].to(device)
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:33<00:00,  7.93it/s]

tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.4874941009910335

In [None]:
for i in range(20):
  train_one_epoch(in_data, targets)

100%|██████████| 269/269 [00:27<00:00,  9.66it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.32it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:28<00:00,  9.48it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.02it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:24<00:00, 10.86it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.25it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.36it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.36it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.31it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:24<00:00, 10.76it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.68it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.25it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:27<00:00,  9.70it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.51it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:24<00:00, 11.05it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:28<00:00,  9.43it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.17it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.51it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:26<00:00, 10.34it/s]


tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)


100%|██████████| 269/269 [00:25<00:00, 10.59it/s]

tensor(69.2308, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output.cpu() > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.4874941009910335

Let's try to further lengthen the sentences.

In [None]:
features = [get_word_embedding(text, 300) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:05<00:00, 1146.42it/s]


In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(X_train).float()


In [None]:
in_data.shape

torch.Size([4301, 300, 300])

In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCELoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size].to(device)
        batch_y = targets[i:i + batch_size].to(device)
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
train_one_epoch(in_data, targets)

100%|██████████| 269/269 [06:30<00:00,  1.45s/it]

tensor(0.7124, grad_fn=<BinaryCrossEntropyBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    output = net(in_data_test).reshape(-1)

In [None]:
result = (output > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.5247758376592733