In [None]:
import pandas as pd
df = pd.read_csv('Constraint_Train.csv')
df.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import nltk
nltk.download('punkt')

sentences = [word_tokenize(text.lower()) for text in tqdm(df.tweet)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
100%|██████████| 6420/6420 [00:03<00:00, 1659.03it/s]


In [None]:
print(sentences[:3])

[['the', 'cdc', 'currently', 'reports', '99031', 'deaths', '.', 'in', 'general', 'the', 'discrepancies', 'in', 'death', 'counts', 'between', 'different', 'sources', 'are', 'small', 'and', 'explicable', '.', 'the', 'death', 'toll', 'stands', 'at', 'roughly', '100000', 'people', 'today', '.'], ['states', 'reported', '1121', 'deaths', 'a', 'small', 'rise', 'from', 'last', 'tuesday', '.', 'southern', 'states', 'reported', '640', 'of', 'those', 'deaths', '.', 'https', ':', '//t.co/yasgrtt4ux'], ['politically', 'correct', 'woman', '(', 'almost', ')', 'uses', 'pandemic', 'as', 'excuse', 'not', 'to', 'reuse', 'plastic', 'bag', 'https', ':', '//t.co/thf8gunfpe', '#', 'coronavirus', '#', 'nashville']]


In [None]:
from gensim.models.word2vec import Word2Vec
%time model_tweets = Word2Vec(sentences, workers=4, vector_size=300, min_count=3, window=5, epochs=30)

CPU times: user 21.8 s, sys: 353 ms, total: 22.1 s
Wall time: 14.7 s


In [None]:
import numpy as np

In [None]:
def get_text_embedding(text):
    result = []
    for word in word_tokenize(text.lower()):
        if word in model_tweets.wv:
            result.append(model_tweets.wv[word])

    if len(result):
        result = np.average(result, axis=0)
    else:
        result = np.zeros(300)
    return result

In [None]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:04<00:00, 1318.73it/s]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, df.label, test_size=0.2)

In [None]:
#Sklearn_one

In [None]:
from sklearn.metrics import classification_report
model = LogisticRegression()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.92      0.89      0.91       618
        real       0.90      0.93      0.92       666

    accuracy                           0.91      1284
   macro avg       0.91      0.91      0.91      1284
weighted avg       0.91      0.91      0.91      1284



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
#Sklearn_two

In [None]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.94      0.92      0.93       618
        real       0.93      0.94      0.93       666

    accuracy                           0.93      1284
   macro avg       0.93      0.93      0.93      1284
weighted avg       0.93      0.93      0.93      1284



In [None]:
#Sklearn_three

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer

clf = Pipeline([
    ('vect', CountVectorizer(ngram_range = (1,1))),
    ('lda', LatentDirichletAllocation(n_components=10, learning_method='batch',max_iter=10, batch_size=128, random_state=42)),
    ('lr', LogisticRegression(random_state = 42, solver = 'liblinear'))
])

model.fit(X_train, y_train)
predicted = model.predict(X_test)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

        fake       0.93      0.92      0.92       618
        real       0.92      0.94      0.93       666

    accuracy                           0.93      1284
   macro avg       0.93      0.93      0.93      1284
weighted avg       0.93      0.93      0.93      1284



In [None]:
#pytorch one

In [None]:
labels = (df.label == 'real').astype(int).to_list()

In [None]:
token_lists = [word_tokenize(text.lower()) for text in df.tweet]
max_len = len(max(token_lists, key=len))

In [None]:
max_len

1592

In [None]:
from collections import Counter
fd = Counter([len(tokens) for tokens in token_lists])

In [None]:
fd.most_common(10)

[(20, 178),
 (25, 174),
 (22, 170),
 (18, 170),
 (19, 168),
 (21, 168),
 (16, 163),
 (17, 162),
 (15, 160),
 (23, 156)]

In [None]:
def get_word_embedding(tokens, max_len):
    result = []
    for i in range(max_len):
        if i < len(tokens):
            word = tokens[i]
            if word in model_tweets.wv:
                result.append(model_tweets.wv[word])
            else:
                result.append(np.zeros(300))
        else:
            result.append(np.zeros(300))
    return result

In [None]:
features = [get_word_embedding(text, 100) for text in tqdm(token_lists)]

100%|██████████| 6420/6420 [00:02<00:00, 2884.45it/s]


In [None]:
!pip install torch


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

Torch one

In [None]:

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.lstm = nn.LSTM(300, 100)
        self.out = nn.Linear(100, 1)

    def forward(self, x):
        embeddings, (shortterm, longterm) = self.lstm(x.transpose(0, 1))
        prediction = torch.sigmoid(self.out(longterm))
        return prediction


net = Net()
#net.cuda()
print(net)

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

  in_data = torch.tensor(X_train).float()


In [None]:
in_data.shape

torch.Size([4815, 100, 300])

In [None]:
optimizer = optim.Adam(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]#.cuda()
        batch_y = targets[i:i + batch_size]#.cuda()
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.reshape(-1), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
net.train()

Net(
  (lstm): LSTM(300, 100)
  (out): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
for i in range(20):
  train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:16<00:00, 17.73it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.81it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.81it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 17.95it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:18<00:00, 16.06it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.45it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:15<00:00, 18.87it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.56it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.61it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.75it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:15<00:00, 18.84it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.11it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.17it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.66it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:15<00:00, 18.85it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.79it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:15<00:00, 19.11it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:15<00:00, 18.82it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:18<00:00, 16.45it/s]


tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:16<00:00, 18.22it/s]

tensor(0.7412, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
    #output = net(in_data_test.cuda()).reshape(-1)
    output = net(in_data_test).reshape(-1)

In [None]:
max(torch.nn.functional.softmax(output))

  max(torch.nn.functional.softmax(output))


tensor(0.0010)

In [None]:
targets_test

tensor([1., 0., 1.,  ..., 1., 1., 1.])

In [None]:
result = (output.cpu() > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.6504672897196262

In [None]:
features = [get_text_embedding(text) for text in tqdm(df.tweet)]

100%|██████████| 6420/6420 [00:03<00:00, 1732.12it/s]


Torch two

In [None]:
class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.out = nn.Linear(300, 1)

    def forward(self, x):
        return torch.sigmoid(self.out(x))


net = Net()
#net.cuda()
print(net)

Net(
  (out): Linear(in_features=300, out_features=1, bias=True)
)


In [None]:
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25)

In [None]:
in_data = torch.tensor(X_train).float()
targets = torch.tensor(y_train).float()

In [None]:
def train_one_epoch(in_data, targets, batch_size=16):
    for i in tqdm(range(0, in_data.shape[0], batch_size)):
        batch_x = in_data[i:i + batch_size]#.cuda()
        batch_y = targets[i:i + batch_size]#.cuda()
        optimizer.zero_grad()
        output = net(batch_x)
        loss = criterion(output.squeeze(), batch_y)
        loss.backward()
        optimizer.step()
    print(loss)

In [None]:
for i in range(20):
  train_one_epoch(in_data, targets)

100%|██████████| 301/301 [00:00<00:00, 621.61it/s]


tensor(0.6928, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 518.96it/s]


tensor(0.6667, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 390.98it/s]


tensor(0.6519, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 730.54it/s]


tensor(0.6421, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1012.98it/s]


tensor(0.6351, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1116.93it/s]


tensor(0.6297, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1175.93it/s]


tensor(0.6254, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 932.03it/s]


tensor(0.6219, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 878.38it/s]


tensor(0.6190, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1017.15it/s]


tensor(0.6165, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 997.96it/s]


tensor(0.6144, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1051.89it/s]


tensor(0.6125, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 998.11it/s]


tensor(0.6108, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1017.30it/s]


tensor(0.6093, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1064.47it/s]


tensor(0.6079, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 923.41it/s]


tensor(0.6067, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 953.52it/s]


tensor(0.6056, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1456.38it/s]


tensor(0.6045, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1264.22it/s]


tensor(0.6036, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


100%|██████████| 301/301 [00:00<00:00, 1279.29it/s]

tensor(0.6027, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)





In [None]:
in_data_test = torch.tensor(X_test).float()
targets_test = torch.tensor(y_test).float()

In [None]:
with torch.no_grad():
#    output = net(in_data_test.cuda()).squeeze(1) #havent got cuda. lets use suda)
     output = net(in_data_test).squeeze(1)

In [None]:
result = (output.cpu() > 0.5) == targets_test

In [None]:
result.sum().item() / len(result)

0.8741433021806854