In [1]:
import pandas as pd
import sys
import json
from google.colab import drive
import nltk
import re
from tqdm.notebook import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
# split the data into train and validation
from sklearn.model_selection import train_test_split
tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Mounted at /content/drive


In [5]:
path = '/content/drive/MyDrive/COVID19 Fake News Detection in English/input/'
train_df = pd.read_csv(f"{path}train_data.csv")
test_df = pd.read_csv(f"{path}test_data.csv")

In [None]:
#tokenize the sentences
df = train_df['clean_tweet'].fillna('_##_').values
tokenizer = Tokenizer(num_words=12000)
tokenizer.fit_on_texts(list(df))
word_index = tokenizer.word_index
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df[['target']])
train_df, val_df = train_df.reset_index(drop=True), val_df.reset_index(drop=True)

train_df.shape, val_df.shape

((5136, 4), (1284, 4))

In [None]:
val_df.iloc[2]

tweet          New WHO guidelines recommend encouraging child...
label                                                       fake
target                                                         1
clean_tweet    new guidelines recommend encouraging children ...
Name: 2, dtype: object

In [None]:
PAD = 0
UNK = 1
from torch.utils.data import Dataset, DataLoader
import spacy
from collections import Counter
class SentimentDataset(Dataset):
    """Define the pytorch Dataset to process the tweets
       This class can be used for both training and validation dataset
       Run it for training data and pass the word2idx and idx2word when running
       for validation data
    """
    
    def __init__(self, df, word2idx=None, idx2word=None, max_vocab_size=50000):
        print('Processing Data')
        self.df = df
        print('Removing white space...')
        self.df.clean_tweet = self.df.clean_tweet.progress_apply(lambda x: x.strip())
        self.nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])
        if word2idx is None:
            print('Building Counter...')
            word_counter = self.build_counter()
            print('Building Vocab...')
            self.word2idx, self.idx2word = self.build_vocab(word_counter, max_vocab_size)
        else:
            self.word2idx, self.idx2word = word2idx, idx2word
        print('*'*100)
        print('Dataset info:')
        print(f'Number of Tweets: {self.df.shape[0]}')
        print(self.word2idx)
        print(f'Vocab Size: {len(self.word2idx)}')
        print('*'*100)
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        sent = self.df.clean_tweet[idx]
        tokens = [w.text.lower() for w in self.nlp(self.tweet_clean(sent))]
        vec = self.vectorize(tokens, self.word2idx)
        return vec, self.df.target[idx]
    
    def tweet_clean(self, text):
        """Very basic text cleaning. This function can be built upon for
           better preprocessing
        """
        text = re.sub(r'[\s]+', ' ', text) # replace multiple white spaces with single space
#         text = re.sub(r'@[A-Za-z0-9]+', ' ', text) # remove @ mentions
        text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
        text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
        return text.strip()
    
    def build_counter(self):
        """Tokenize the tweets using spacy and build vocabulary
        """
        words_counter = Counter()
        for sent in tqdm(self.df.clean_tweet.values):
            words_counter.update(w.text.lower() for w in self.nlp(self.tweet_clean(sent)))
        return words_counter
    
    def build_vocab(self, words_counter, max_vocab_size):
        """Add pad and unk tokens and build word2idx and idx2word dictionaries
        """
        word2idx = {'<PAD>': PAD, '<UNK>': UNK}
        word2idx.update({word:i+2 for i, (word, count) in tqdm(enumerate(words_counter.most_common(max_vocab_size)))})
        idx2word = {idx: word for word, idx in tqdm(word2idx.items())}
        return word2idx, idx2word
    
    def vectorize(self, tokens, word2idx):
        """Convert tweet to vector
        """
        vec = [word2idx.get(token, UNK) for token in tokens]
        return vec

In [None]:
train_ds = SentimentDataset(train_df, max_vocab_size=10000)

In [None]:
for i,val in enumerate(train_ds):
  print(val)
  if i == 2:
    break

([15, 33, 346, 952, 2185], 1)
([64, 76, 28, 2567, 5759, 3990, 5760, 254, 4], 1)
([43, 222, 185, 2568, 2, 3, 20, 55, 1285, 369, 134, 23, 1931, 1031, 624, 1286, 3991, 209, 3992, 200, 441, 1406, 5761], 0)


In [None]:
class SentimentDataset(Dataset):
  def __init__(self, df):
    self.df = df
    self.text = df['clean_tweet'].fillna('_##_').values
    self.text_seq = tokenizer.texts_to_sequences(self.text)
    print('*'*100)
    print('Dataset info:')
    print(f'Number of Tweets: {self.df.shape[0]}')
    print(word_index)
    print(f'Vocab Size: {len(word_index)}')
    print('*'*100)

  def __len__(self):
        return self.df.shape[0]  

  def __getitem__(self, idx):
        return self.text_seq[idx], self.df.target[idx] #return text seq and target

train = SentimentDataset(train_df)
valid = SentimentDataset(val_df)
valid[0]

****************************************************************************************************
Dataset info:
Number of Tweets: 5136
Vocab Size: 14619
****************************************************************************************************
****************************************************************************************************
Dataset info:
Number of Tweets: 1284
Vocab Size: 14619
****************************************************************************************************


([271,
  331,
  583,
  34,
  1105,
  84,
  1110,
  40,
  287,
  213,
  673,
  78,
  22,
  1028,
  2619,
  5428],
 0)

In [None]:
#Function to pad and transpose data (to be used in Dataloader)
def collate_fn(data):
    """This function will be used to pad the tweets to max length
       in the batch and transpose the batch from 
       batch_size x max_seq_len to max_seq_len x batch_size.
       It will return padded vectors, labels and lengths of each tweets (before padding)
       It will be used in the Dataloader
    """
    data.sort(key=lambda x: len(x[0]), reverse=True)
    print("data = ",data[0],len(data[0][0]))
    lens = [len(sent) for sent, label in data]
    labels = []
    padded_sents = torch.zeros(len(data), max(lens)).long()
    for i, (sent, label) in enumerate(data):
        padded_sents[i,:lens[i]] = torch.LongTensor(sent)
        labels.append(label)
    
    padded_sents = padded_sents.transpose(0,1)
    return padded_sents, torch.FloatTensor(labels), lens

In [None]:
train_loader = DataLoader(train, batch_size=256, shuffle=True, collate_fn=collate_fn)

In [None]:
# import torch
# for i,val in enumerate(train):
#   print(val)
#   print("----------------------------------------------")
#   if i == 0:
#     break

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    def __init__(self,embedding_matrix):        
        super(LSTMClassifier, self).__init__()
        self.embedding_size = 300
        self.max_features = 120000

        self.embedding = nn.Embedding(self.max_features, self.embedding_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = nn.Dropout2d(0.1)

        
        self.hidden_size = 60
        self.num_layers = 1
        self.reduction_size = 16
        self.ih2h = nn.LSTM(
                        self.embedding_size, 
                        self.hidden_size,
                        self.num_layers,
                        bidirectional=True, 
                        batch_first=True
                        )
        self.h2r = nn.Linear(2 * self.hidden_size, self.reduction_size)
        self.r2o = nn.Linear(self.reduction_size, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, x,lens):

        h_embedding = self.embedding(x)
        h_embedding = torch.squeeze(self.embedding_dropout(torch.unsqueeze(h_embedding, 0))) #(bs x sq * embed_size)
        h_embedding = h_embedding.transpose(0, 1).squeeze()
        print("h_embedding = ",h_embedding.shape)
        packed_embedded = pack_padded_sequence(h_embedding, lens, batch_first=True) 
        print("packed_embedded = ",h_embedding.shape)
        o, (h_n, c_n) = self.ih2h(packed_embedded) # o = bs x sq x 2*hidden, o[-1] = sq x 2*hidden
        cat = torch.cat((h_n[-2, :, :], h_n[-1, :, :]), dim=1)
        rel = self.relu(cat)
        print("rel = ", rel.shape)
        dense1 = self.h2r(rel)
        print("dense1 = ",dense1.shape)
        drop = self.dropout(dense1)
        preds = self.r2o(drop)
        return preds

In [None]:
def loss_fn(y_pred, y_true):
    return nn.BCEWithLogitsLoss()(y_pred, y_true.view(-1,1))
device = torch.device("cpu")
model = LSTMClassifier(embedding_matrix)
model.to(device)
def train_fn(data_loader, model, optimizer, device, n_examples):
    model.train()
    tk0 = tqdm(data_loader, total=len(data_loader))

    train_losses = []
    correct_predictions = 0
    for bi, (x_batch, y_batch,lens) in enumerate(tk0):
        optimizer.zero_grad()
        print("length = ",len(x_batch))
        y_pred = model(x_batch,lens)

        loss = loss_fn(y_pred, y_batch)
        preds = torch.round(nn.Sigmoid()(y_pred)).squeeze()
        correct_predictions += torch.sum(preds == y_batch)
        train_losses.append(loss.item())

        train_f1 = utils.f1_score(y_pred, y_batch)
        
        f1 = np.round(train_f1.item(), 3)
      
        if (bi % 10 == 0 and bi != 0) or (bi == len(data_loader) - 1) :
            print(f'bi={bi}, Train F1={f1},Train loss={loss}, time={0.00}')
        
        loss.backward() # Calculate gradients based on loss
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step() # Adjust weights based on calculated gradients
        losses.update(loss.item(),x_batch.size(0))
        tk0.set_postfix(loss = losses.avg)
    return correct_predictions.double() / n_examples, np.mean(train_losses)

def eval_fn(data_loader, model, device, n_examples):
    model.eval()
    start = time.time()
    losses = utils.AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))
    val_losses = []
    correct_predictions = 0
    with torch.no_grad():
        for bi, (x_batch, y_batch) in enumerate(tk0):
            
            y_pred = model(x_batch).detach()
            loss = loss_fn(y_pred, y_batch)
            preds = torch.round(nn.Sigmoid()(y_pred)).squeeze()
            correct_predictions += torch.sum(preds == y_batch)
            val_losses.append(loss.item())
            losses.update(loss.item(),x_batch.size(0))
            tk0.set_postfix(loss=losses.avg)
        # model.train()
    return correct_predictions.double() / n_examples, np.mean(val_losses)


In [None]:
import numpy as np
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import os
best_accuracy = 0
optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.0001, weight_decay=0.005)
path = '/content/drive/My Drive/COVID19 Fake News Detection in English/neural-network/resources'
embedding_matrix = np.load(os.path.join(path,'globe_embedding.npy'))
print(embedding_matrix.shape)
for epoch in range(1):
    train_acc, train_loss = train_fn(train, model, optimizer, device, len(train_df))
    logger.info(f'Train loss {train_loss} accuracy {train_acc}')
    val_acc, val_loss = eval_fn(valid_loader, model, device, len(df_valid))
    logger.info(f'Val loss {val_loss} accuracy {val_acc}')

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

(14620, 300)


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))

data =  ([137, 3264, 3838, 6227, 3597, 1821, 234, 684, 293, 1077, 432, 751, 1, 1011, 43, 685, 277, 181, 1997, 1632, 42, 3598, 134, 3538, 137, 190, 27, 27, 216, 2217, 3598, 134, 6228, 1034, 137, 545, 2517, 535, 74, 43, 433, 3599, 1, 584, 782, 2149, 87, 137, 305, 6229, 4557, 1095, 736], 1) 53
length =  53
h_embedding =  torch.Size([256, 53, 300])
packed_embedded =  torch.Size([256, 53, 300])
rel =  torch.Size([256, 120])
dense1 =  torch.Size([256, 16])


NameError: ignored

In [None]:
len(word_index)

14619

In [22]:
sub_df = pd.read_csv("/content/drive/MyDrive/COVID19 Fake News Detection in English/roberta-large/output/roerta_large_four_train_16_3e5.csv")
sub_df = sub_df[['id','y_pred']]
for i in range(len(sub_df)):
  if sub_df['y_pred'][i] == 1:
    sub_df.at[i,'label'] =  'fake'
  else:
    sub_df.at[i,'label'] =  'real'
sub_df = sub_df[['id','label']]
new_path = path+'answer.txt'
sub_df.to_csv(new_path, index=None, sep=',', mode='a')

In [20]:
sub_df.y_pred[0] == 0

True

In [None]:
import pandas as pd

df = pd.DataFrame(columns = ["id", "target"])
df["id"] = [1,2,3,4,5]
df["target"] = [1,0,0,0,1]
df

Unnamed: 0,id,target
0,1,1
1,2,0
2,3,0
3,4,0
4,5,1


In [None]:
df1 = pd.DataFrame(columns = ["id", "target"])
df1["id"] = [1,2,3,4,5]
df1["target"] = [1,1,1,1,1]
df1

Unnamed: 0,id,target
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


In [None]:
df2 = pd.DataFrame(columns = ["id", "target"])
df2["id"] = [1,2,3,4,5]
df2["target"] = [0,0,0,0,0]
df2

Unnamed: 0,id,target
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [None]:
index = []
t1  = []
t2 = []
t3 = []
for ind in range(len(df)):
  index.append(df['id'][ind])
  t1.append(df['target'][ind])
  t2.append(df1['target'][ind])
  t3.append(df2['target'][ind])


In [None]:
result = pd.DataFrame(columns = ["id"])
result['id'] = index
result['t1'] = t1
result['t2'] = t2
result['t3'] = t3
result

Unnamed: 0,id,t1,t2,t3
0,1,1,1,0
1,2,0,1,0
2,3,0,1,0
3,4,0,1,0
4,5,1,1,0


In [None]:
from collections import Counter
preds = []
for index in range(len(result)):
  array = result.iloc[index].values[1:]
  print(array)
  val = Counter(array)
  preds.append(list(val.keys())[0])
result['preds'] = preds

[1 1 0 2]
[0 1 0 2]
[0 1 0 2]
[0 1 0 2]
[1 1 0 2]


In [None]:
result

Unnamed: 0,id,t1,t2,t3,preds
0,1,1,1,0,1
1,2,0,1,0,0
2,3,0,1,0,0
3,4,0,1,0,0
4,5,1,1,0,1
