# Fake News Spreader Classifier combined with Random Forest

In [None]:
import torch

# Pick the compute device once; later cells move models and batches onto it.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only) so later cells can read
# the label CSV and dataset files stored there.
drive.mount('/content/drive')

In [None]:
!ls "/content/drive/MyDrive/Colab Notebooks"

In [None]:
from numpy.random import seed
from numpy.random import randint
import torch
import pandas as pd
import numpy as np
# Seed NumPy's global random generator so runs are repeatable.
seed(1)

# Read the per-user label table from Drive and expose its 'label' column
# both as a NumPy array (`values`) and as a torch tensor (`labels`).
labels_csv_path = "/content/drive/MyDrive/Colab Notebooks/user_label_clean.csv"
dlab = pd.read_csv(labels_csv_path)
# For a quick debug run, truncate the table here, e.g. dlab = dlab[:50]
values = np.asarray(dlab['label'])
labels = torch.from_numpy(values)

In [None]:
# Directory layout on the mounted Drive. The root is absolute (matching the
# Drive mount point and the label CSV path used earlier in the notebook) so
# these paths do not depend on the notebook's current working directory.
pRoot = "/content/drive/MyDrive/Colab Notebooks/"
pData = pRoot + "dataset/"
pModels = pRoot + "models/"
pTweet = pData + "tweet_drive/"

# One id per labelled user; the ids are used to build per-user file names.
users_id = dlab['user_id']

# Sanity check: both numbers are the row count of the label table.
print(len(users_id))
print(dlab.shape[0])

In [None]:
from transformers import BertTokenizer
# Cased multilingual BERT tokenizer; lower-casing is disabled to match the
# cased checkpoint.
# NOTE(review): output_hidden_states is normally a model option rather than a
# tokenizer option; it is forwarded unchanged here - confirm it is needed.
tokenizer_options = {
    'do_lower_case': False,
    'output_hidden_states': True,
}
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-multilingual-cased', **tokenizer_options)

In [None]:
def preprocess_sentences(input_ids, attention_masks, sentences, max_len):
  """Tokenize sentences into fixed-width tensors using the module-level tokenizer.

  Every sentence is padded to ``max_len`` tokens and truncated when it is
  longer, so all encoded rows have the same width and can be concatenated.

  Args:
    input_ids: list that receives one encoded id tensor per sentence
      (appended in place, as before).
    attention_masks: list that receives the matching attention-mask tensors
      (appended in place, as before).
    sentences: iterable of strings to encode.
    max_len: number of tokens per encoded sentence.

  Returns:
    Tuple ``(input_ids, attention_masks)`` where each accumulated list has
    been concatenated along dim 0 into a single tensor. When there is nothing
    to concatenate, two empty ``(0, max_len)`` integer tensors are returned.
  """
  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        # `pad_to_max_length` is deprecated; this is its replacement.
        padding='max_length',
        # Explicit truncation keeps over-long sentences at max_len so the
        # torch.cat below never sees rows of different widths.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  # torch.cat rejects an empty list, so give callers well-formed empty tensors.
  if not input_ids:
    empty = torch.empty((0, max_len), dtype=torch.long)
    return empty, empty.clone()

  return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [None]:
from transformers import BertModel
# Load the pretrained multilingual BERT encoder and place it on the device
# selected at the top of the notebook (GPU when available, otherwise CPU)
# instead of unconditionally requiring CUDA.
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
bert.to(device)

Read the first n tweets of each user and transform them into tensors

In [None]:
import pandas as pd
# Settings for the per-user embedding extraction loop, which is currently
# disabled (kept inside the triple-quoted string that follows).
# Token length passed to preprocess_sentences by that loop.
max_len = 128 # the closest power of two exceeding max len found
# Progress counter printed by the extraction loop; starts at 1.
count = 1
#users_id = users_id[100:500]
# NOTE(review): nothing visible appends to this list - presumably intended to
# collect users that have no tweets; confirm or remove.
user_no_tweet = []
'''
for user in users_id:#[416:]:
  total = []
  print(str(count) + "/" + str(len(users_id)))
  count = count + 1 

  df = pd.read_csv(pTweet + str(user) + ".csv")
  # print(df.head())
  start = 0
  print(df.shape[0])
  while start < df.shape[0]:
    input_ids = []
    attention_masks = []
    dfc = df[start:start+32]
    start = start+32
    dfc['text']= dfc['text'].astype('str')
    sentences  = dfc.text.values
    #print(sentences)

    input_ids, attention_masks = preprocess_sentences(
        input_ids, attention_masks, sentences, max_len)

    #print(input_ids)
    input_ids = input_ids.to(device)
    outputs = bert(input_ids)
    #print(pt_output)
    #print(outputs.last_hidden_state.shape)
    last_layer = outputs.last_hidden_state.permute(1,0,2) 
    #print(last_layer[0].size()) #cls
    result = last_layer[0].cpu().detach().numpy()
    res = np.asarray(result)
    #print(res)
    #print(res.shape)
    res = np.amax(res, axis=0)
    #print(res)
    #print(res.shape)
    total.append(res)
#print(len(total))
  total = np.asarray(total)
  total = np.amax(total, axis=0)
  np.savetxt(pData+"user_tensor/"+str(user)+"_max_cls.csv", total, delimiter=",", fmt='%5.5f')
#print(total.shape)
print("done")
'''


In [None]:
'''count = 0
for user in users_id:
  df = pd.read_csv(pData+"user_tensor/"+str(user)+"_max_cls.csv")
  print(str(count))
  count += 1'''

Prepare the data as input tensors

In [None]:
# debug
'''
slicedf = dlab[:10]
values = np.asarray(slicedf['label'])
labels = torch.from_numpy(values)
'''

In [None]:
# Load each user's saved `_max_cls.csv` vector and assemble them into one
# matrix with one column per user (features on dim 0, users on dim 1).
user_columns = []
for user in users_id:
  user_vector = torch.from_numpy(
      np.loadtxt(pData + "user_tensor/" + str(user) + "_max_cls.csv",
                 delimiter=","))
  # Shape every vector as a (features, 1) column so they line up on dim 1.
  user_columns.append(
      torch.reshape(user_vector, (user_vector.size()[0], 1)))

# Concatenate once at the end: growing the matrix with torch.cat inside the
# loop re-copies all previously loaded columns on every iteration.
max_cls = torch.cat(user_columns, dim=1)

print(max_cls.shape)


In [None]:
# Swap the two axes so that each row holds one user's feature vector.
# Running this cell twice swaps them back, so execute it exactly once.
max_cls = max_cls.transpose(0, 1)
print(max_cls.shape)

Configure the architecture

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

class FNSC(nn.Module):
  """Linear classification head over a per-user feature vector.

  Applies a LeakyReLU followed by a single linear layer that maps each
  ``in_features``-dimensional input to one logit. No sigmoid is applied
  here; callers apply it themselves.

  Args:
    in_features: size of the last dimension of the input (default 768).
  """

  def __init__(self, in_features=768):
    super().__init__()
    self.shrink_cls = nn.Sequential(
        nn.LeakyReLU(), nn.Linear(in_features, 1))

  def forward(self, cls):
    """Return one logit per input row.

    Args:
      cls: float tensor whose last dimension is ``in_features``.

    Returns:
      The layer output with its trailing size-1 dimension removed, e.g.
      shape ``(batch,)`` for a ``(batch, in_features)`` input.
    """
    scls = self.shrink_cls(cls)
    # Squeeze only the last dimension. A bare squeeze() would also collapse
    # a batch of one into a 0-d scalar, which no longer matches the shape
    # of the label tensor when the loss is computed.
    return scls.squeeze(-1)
  

Train and test the model

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    The value is rounded to the nearest whole second first, so the result
    looks like ``H:MM:SS`` (with a leading day count for long durations).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Build the classifier and move it to the device chosen at the top of the
# notebook, so this cell also runs when no GPU is present.
model = FNSC()
model.to(device)
# y = model.forward(max_cls.float().to(device), user_info)
# y_pred = torch.sigmoid(y)
# y_pred_tag = torch.round(y_pred)
# print(y_pred, y_pred_tag)

In [None]:
import random
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score


# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# all CUDA devices) with the same value.
seed_val = 1
for seed_rng in (random.seed, np.random.seed,
                 torch.manual_seed, torch.cuda.manual_seed_all):
  seed_rng(seed_val)

# Run-level bookkeeping: a stats list and the wall-clock start time.
training_stats = []
total_t0 = time.time()

def model_fit(epochs, model,loss_fn, optimizer, scheduler, train_dataloader):
  """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

  Each batch is moved to the module-level ``device``. The model output is
  passed through a sigmoid before ``loss_fn`` is applied, gradients are
  clipped to a norm of 1.0, and both ``optimizer`` and ``scheduler`` are
  stepped once per batch. The average loss and the elapsed time are printed
  after every epoch.

  Args:
    epochs: number of passes over the training data.
    model: module to train; it is updated in place.
    loss_fn: callable taking ``(probabilities, labels)`` and returning a
      scalar loss tensor.
    optimizer: optimizer bound to ``model``'s parameters.
    scheduler: learning-rate scheduler stepped after every batch.
    train_dataloader: iterable of ``(features, labels)`` batches.

  Returns:
    The same ``model`` object after training.
  """
  for _ in range(epochs):
    print('Training...')
    t0 = time.time()
    total_train_loss = 0.0
    model.train()
    for batch in train_dataloader:
      b_cls = batch[0].to(device)
      b_labels = batch[1].to(device)

      model.zero_grad()
      y_pred = torch.sigmoid(model(b_cls))
      loss = loss_fn(y_pred, b_labels)
      total_train_loss += loss.item()

      # Backpropagate on the loss where it lives; casting it to a CPU
      # FloatTensor first only adds a device transfer.
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()
      scheduler.step()

    # Guard against an empty dataloader so the average never divides by zero.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    training_time = format_time(time.time() - t0)

    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
  return model

def model_test(model, validation_dataloader):
    """Evaluate ``model`` on ``validation_dataloader`` and print the metrics.

    Batches are moved to the module-level ``device`` and the loss is computed
    with the module-level ``loss_fn``. Predictions are the sigmoid of the
    model output rounded to 0/1. Precision/recall/F-score are computed
    separately for every batch, and the reported accuracy is the unweighted
    mean of the per-batch accuracies.

    Args:
        model: module to evaluate; it is switched to eval mode.
        validation_dataloader: iterable of ``(features, labels)`` batches.

    Returns:
        Tuple ``(eval_micro, eval_macro, accuracy)``: two lists holding one
        ``precision_recall_fscore_support`` result per batch (micro and macro
        averaged), and the mean per-batch accuracy (0.0 when there are no
        batches).
    """
    print("Model Test...")
    model.eval()

    total_eval_loss = 0.0
    total_eval_accuracy = 0.0
    nb_eval_steps = 0
    eval_micro = []
    eval_macro = []

    for batch in validation_dataloader:
        b_cls = batch[0].to(device)
        b_labels = batch[1].to(device)

        with torch.no_grad():
            outputs = model(b_cls)

        # Flatten so that a batch of one still lines up with its labels.
        y_pred = torch.reshape(torch.sigmoid(outputs), (-1,))
        y_pred_tag = torch.round(y_pred)

        # The loss is taken on the probabilities; on the rounded 0/1
        # predictions it can only be zero or the clamped maximum.
        total_eval_loss += loss_fn(y_pred, b_labels).item()

        pred_flat = y_pred_tag.detach().cpu().numpy().flatten()
        labels_flat = b_labels.to('cpu').numpy().flatten()
        eval_micro.append(precision_recall_fscore_support(
            labels_flat, pred_flat, average='micro'))
        eval_macro.append(precision_recall_fscore_support(
            labels_flat, pred_flat, average='macro'))
        total_eval_accuracy += accuracy_score(labels_flat, pred_flat)
        nb_eval_steps += 1

    # Guard against an empty dataloader so the averages never divide by zero.
    steps = max(nb_eval_steps, 1)
    avg_val_accuracy = total_eval_accuracy / steps
    avg_eval_loss = total_eval_loss / steps

    print(F'\n\tValidation micro: {eval_micro}')
    print(F'\n\tValidation macro: {eval_macro}')
    print(F'\n\tValidation accuracy: {avg_val_accuracy}')
    print("  Average validation loss: {0:.8f}".format(avg_eval_loss))

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.5f}".format(avg_val_accuracy))

    return eval_micro, eval_macro, avg_val_accuracy


In [None]:
from torch.utils.data import TensorDataset
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup 

# 10-fold cross-validation of the FNSC head on the per-user feature matrix.
epochs = 1
batch_size = 32
n_splits = 10
loss_fn = nn.BCELoss()
kf = KFold(n_splits=n_splits)

# One entry per fold in each list; eval_acc accumulates the fold accuracies.
eval_micro = []
eval_macro = []
eval_acc = 0.0
for train_index, test_index in kf.split(max_cls):
  print("Run")
  max_cls_train, max_cls_test = max_cls[train_index], max_cls[test_index]
  labels_train, labels_test = labels[train_index], labels[test_index]

  # Start every fold from a freshly initialised model and optimizer.
  # Reusing them across folds would validate on samples the model has
  # already been trained on in an earlier fold (data leakage).
  # Increase `epochs` above if a single pass per fold is not enough.
  model = FNSC()
  model.to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5, eps = 1e-8)

  train_ds = TensorDataset(max_cls_train.float(),
                           labels_train.float())
  train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

  # Linear decay of the learning rate over all training steps of this fold.
  total_steps = len(train_dl) * epochs
  scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
  )
  model_fitted = model_fit(epochs, model, loss_fn, optimizer, scheduler, train_dl)

  validation_ds = TensorDataset(max_cls_test.float(),
                          labels_test.float())
  # Evaluation does not need shuffling; a fixed order keeps the per-batch
  # metrics reproducible.
  validation_dl = DataLoader(validation_ds, batch_size=batch_size, shuffle=False)
  emi, ema, eacc =  model_test(model_fitted, validation_dl)
  eval_micro.append(emi)
  eval_macro.append(ema)
  eval_acc += eacc

# Mean accuracy over the folds.
eval_acc = eval_acc/n_splits

In [None]:
# Show the cross-validation results: per-fold micro metrics, per-fold macro
# metrics, then the accuracy averaged over the folds.
for summary in (eval_micro, eval_macro, eval_acc):
  print(summary)

In [None]:
# Shape of the nested results: number of folds, number of batches in the
# first fold, and number of entries in one per-batch metric tuple.
print(len(eval_macro))
first_fold = eval_macro[0]
print(len(first_fold))
print(len(first_fold[0]))

# Average the first entry of every per-batch macro tuple (the precision
# returned by precision_recall_fscore_support) over all folds and batches.
count = 0
sum_metric = 0.0
for batch_metrics in (metrics for fold in eval_macro for metrics in fold):
  sum_metric += float(batch_metrics[0])
  count += 1

print(count)
print(sum_metric)
print(str(sum_metric/count))