# Reliable Users Neural Network

In [None]:
import torch

# Select the compute device once; later cells move tensors to `device`.
_gpu_ok = torch.cuda.is_available()
device = torch.device("cuda" if _gpu_ok else "cpu")
if not _gpu_ok:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive so the dataset and model folders
# referenced by later cells are reachable. This only works inside Colab.
# NOTE(review): later paths are relative ("drive/MyDrive/..."), so they
# presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# !ls "/content/drive/MyDrive/Colab Notebooks"

In [None]:
from numpy.random import seed
from numpy.random import randint
import torch
import numpy as np
# Number of users for which a label is generated (and, later, loaded).
how_many_users = 5
# Fix NumPy's global RNG so the same labels are drawn on every run.
seed(1)
# One random binary label (0 or 1) per user, wrapped as a torch tensor.
# NOTE(review): these look like placeholder targets - confirm before
# interpreting any training metric.
values = randint(low=0, high=2, size=how_many_users)
labels = torch.from_numpy(values)

In [None]:
# Folder layout (relative to the notebook's working directory).
pRoot = "drive/MyDrive/Colab Notebooks/"
pData = pRoot + "dataset/"
pModels = pRoot + "models/"
pTweet = pData + "tweet/"

# Read one user id per line. The context manager guarantees the file is
# closed even if reading fails (the handle used to be left open).
with open(pData + "users_fake_news.txt", "r") as fin:
    users_id = [line.rstrip("\n") for line in fin]

# Skip the first line and keep the next ten ids.
# NOTE(review): the first line is presumably a header - confirm.
users_id = users_id[1:11]
print(len(users_id))

In [None]:
from transformers import DistilBertTokenizerFast
# Load the fast DistilBERT tokenizer for the multilingual cased checkpoint.
# do_lower_case=False is passed because the checkpoint is a cased one.
# This `tokenizer` global is used by preprocess_sentences below.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-multilingual-cased',
    do_lower_case=False
)

In [None]:
def preprocess_sentences(input_ids, attention_masks, sentences, max_length=None):
  """Tokenize `sentences` and stack the results into two 2-D tensors.

  Args:
    input_ids: list that receives one (1, max_length) id tensor per
      sentence; it is appended to in place (callers pass an empty list).
    attention_masks: list that receives the matching attention masks,
      also appended to in place.
    sentences: iterable of strings to encode with the global `tokenizer`.
    max_length: fixed sequence length used for padding and truncation.
      When omitted, the notebook-level `max_len` is used if it exists,
      otherwise 256.

  Returns:
    (input_ids, attention_masks): tensors of shape (n_sentences, max_length).
    For no sentences at all, two empty (0, max_length) tensors are returned.
  """
  if max_length is None:
    # The function used to read a global `max_len` that is only assigned in
    # a disabled cell; keep honouring it when present, but do not crash
    # with a NameError when it is missing.
    max_length = globals().get("max_len", 256)

  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
        sent,
        add_special_tokens = True,
        max_length = max_length,
        padding = 'max_length',
        # Without truncation an over-long sentence produces a longer tensor
        # and the torch.cat below fails on mismatched widths.
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  if not input_ids:
    # torch.cat rejects an empty list; return well-formed empty tensors.
    empty = torch.empty((0, max_length), dtype=torch.long)
    return empty, empty.clone()

  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  return input_ids, attention_masks

In [None]:
def load_model(modelname):
  """Load a fine-tuned DistilBERT classifier from the models folder.

  Args:
    modelname: name of the saved model directory under `pModels`.

  Returns:
    The model, configured to return all hidden states, moved to the
    notebook-level `device`.
  """
  loaded_model = DistilBertForSequenceClassification.from_pretrained(
    pModels+modelname, output_hidden_states = True)
  # Use the device selected at the top of the notebook; a hard-coded
  # .cuda() crashes on CPU-only runtimes.
  return loaded_model.to(device)

Read the first 50 tweets of each user and transform them into tensors

In [None]:
import pandas as pd
from transformers import DistilBertForSequenceClassification
import gc

# traits_big5 = ['O','C','E','A','N']
# traits_bhv = ['SD', 'ST', 'HE', 'AC', 'PO', 'SE', 'CO', 'TR', 'BE', 'UN']
'''
trait = "UN"
max_len = 256 # the closest power of two exceeding max len found
model = load_model("distil_"+trait)

for user in users_id:   
  input_ids = []
  attention_masks = []
  df = pd.read_csv(pTweet + str(user) + ".csv")
  # print(df.head())
  df = df[:50]
  df['text']= df['text'].astype('str')
  sentences  = df.text.values
  input_ids, attention_masks = preprocess_sentences(
      input_ids, attention_masks, sentences)
  input_ids = input_ids.to(device)
  attention_masks = attention_masks.to(device)
  pt_output = model(input_ids, attention_mask=attention_masks)

  token_embeddings = torch.stack(pt_output.hidden_states, dim=0)
  # print(token_embeddings.size())
  last_layer = token_embeddings[-1]
  last_layer = last_layer.permute(1,0,2) 
  # print(last_layer[0].size())
  result = last_layer[0].cpu().detach().numpy()
  res = np.asarray(result)
  np.savetxt(pData+"user_tensor/"+str(user)+"_"+trait+".csv", res, delimiter=",", fmt='%5.5f')
  del input_ids
  del attention_masks
  del pt_output
  del df
  del last_layer
  del result
  del res
  torch.cuda.empty_cache()
  gc.collect()

print("done")
'''

Prepare data into input tensors

In [None]:
traits_big5 = ['O','C','E','A','N']
traits_bhv = ['SD', 'ST', 'HE', 'AC', 'PO', 'SE', 'CO', 'TR', 'BE', 'UN']


def _load_user_traits(user, traits):
  """Load one user's per-trait CSV matrices and join them column-wise.

  Each file is read from `pData + "user_tensor/<user>_<trait>.csv"`; the
  matrices are concatenated along dim 1 in the order given by `traits`.
  """
  per_trait = [
      torch.from_numpy(
          np.loadtxt(pData+"user_tensor/"+str(user)+"_"+trait+".csv",
          delimiter=","))
      for trait in traits
  ]
  return torch.cat(per_trait, dim=1)


selected_users = users_id[:how_many_users]
if not selected_users:
  raise ValueError("no users selected: users_id is empty or how_many_users is 0")

# Stack the users' matrices row-wise with a single concatenation per group
# instead of re-concatenating the running total on every iteration.
big5_tot = torch.cat(
    [_load_user_traits(user, traits_big5) for user in selected_users], dim=0)
bhv_tot = torch.cat(
    [_load_user_traits(user, traits_bhv) for user in selected_users], dim=0)

# print(big5_tot.size())
# print(bhv_tot.size())

# print(big5_tot.size())
# print(bhv_tot.size())

In [None]:
# Number of tweet rows stored per user in the flat (rows, features) tensors.
tweets_per_user = 50


def _group_by_user(flat, rows_per_user):
  """Reshape a (users*rows_per_user, features) tensor to 3-D.

  Returns a (users, rows_per_user, features) tensor. A tensor that is
  already 3-D is returned unchanged so re-running the cell is harmless.
  """
  if flat.dim() == 3:
    return flat
  rows, width = flat.size()
  if rows % rows_per_user != 0:
    # int(rows / n) used to floor silently and reshape then failed with an
    # opaque size error; fail early with an explicit message instead.
    raise ValueError(
        "expected a multiple of %d rows, got %d" % (rows_per_user, rows))
  return torch.reshape(flat, (rows // rows_per_user, rows_per_user, width))


# Sizes before grouping, kept under their original names.
big5_size = list(big5_tot.size())
bhv_size = list(bhv_tot.size())
big5_tot = _group_by_user(big5_tot, tweets_per_user)
bhv_tot = _group_by_user(bhv_tot, tweets_per_user)
# print(big5_tot.size(), bhv_tot.size())

Configure the RUNN architecture

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F

class RUNN(nn.Module):
  """Binary classifier over per-user Big5 and behaviour embeddings.

  Each input group is projected to `hidden` features, the two projections
  are concatenated on the feature axis, a 1-D convolution collapses the
  `tweets_per_user` axis (used as channels) into one channel, and a final
  linear layer produces one logit per user.

  Args:
    big5_in: feature width of the Big5 input (default 768*5).
    bhv_in: feature width of the behaviour input (default 768*10).
    hidden: output width of each projection (default 768).
    tweets_per_user: size of dim 1 of the inputs; becomes the Conv1d
      channel count (default 50).
    kernel_size: Conv1d kernel size (default 3).
  """

  def __init__(self, big5_in = 768*5, bhv_in = 768*10, hidden = 768,
               tweets_per_user = 50, kernel_size = 3):
    super().__init__()
    self.shrink_big5 = nn.Sequential(
        nn.Linear(big5_in, hidden), nn.LeakyReLU())
    self.shrink_bhv = nn.Sequential(
        nn.Linear(bhv_in, hidden), nn.LeakyReLU())
    self.conv1 = nn.Conv1d(
        in_channels=tweets_per_user, out_channels=1,
        kernel_size=kernel_size, stride=1)
    # Conv1d with stride 1 and no padding shortens the 2*hidden axis by
    # kernel_size - 1; with the defaults this is the former constant 1534.
    self.ll = nn.Linear(2 * hidden - kernel_size + 1, 1)

  def forward(self, big5, bhv):
    """Return one logit per user.

    Args:
      big5: tensor of shape (batch, tweets_per_user, big5_in).
      bhv: tensor of shape (batch, tweets_per_user, bhv_in).

    Returns:
      Tensor of shape (batch,) holding raw logits (no sigmoid applied).
    """
    sb5 = self.shrink_big5(big5)     # (batch, tweets, hidden)
    sbhv = self.shrink_bhv(bhv)      # (batch, tweets, hidden)
    x = torch.cat((sb5, sbhv), dim=2)  # (batch, tweets, 2*hidden)
    x = self.conv1(x)                # (batch, 1, 2*hidden - kernel_size + 1)
    x = self.ll(x)                   # (batch, 1, 1)
    # reshape(-1) keeps the batch axis even for a single-sample batch;
    # torch.squeeze used to collapse it to a 0-dim tensor, which then
    # mismatched the (1,) label tensor in the loss.
    return x.reshape(-1)
  


Train and validate the model

In [None]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
# Open question from the original author: should the labels be cast to
# float here as well? (The training loop currently casts them per batch.)
dataset = TensorDataset(big5_tot.float(), bhv_tot.float(), labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
# A dedicated seeded generator makes the split reproducible; the global
# torch seed is only set in a later cell, after this split has happened.
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(1))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup 

# Build the network on the device selected at the top of the notebook;
# a hard-coded .cuda() fails on CPU-only runtimes.
model = RUNN()
model.to(device)
epochs = 3
batch_size = 10
# BCELoss expects probabilities, so the training loop applies a sigmoid
# to the model's logits before computing the loss.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

# Training batches are drawn in random order, validation batches in order.
train_dataloader = DataLoader(
    train_dataset,
    sampler = RandomSampler(train_dataset),
    batch_size = batch_size
)
validation_dataloader = DataLoader(
    val_dataset,
    sampler = SequentialSampler(val_dataset),
    batch_size = batch_size
)

# Linear learning-rate decay over all optimisation steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader)*epochs
)


In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# y = model.forward(big5_tot[0].float(), bhv_tot[0].float())
# y_pred = torch.sigmoid(y)
# y_pred_tag = torch.round(y_pred)
# print(y_pred, y_pred_tag)

In [None]:
import random
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Seed every RNG used during training.
seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# One dict of summary numbers per epoch.
training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0.0
    model.train()
    for step, batch in enumerate(train_dataloader):
        if step % 5 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print(step, len(train_dataloader), elapsed)

        b_big5 = batch[0].to(device)
        b_bhv = batch[1].to(device)
        b_labels = batch[2].float().to(device)
        model.zero_grad()
        outputs = model(b_big5, b_bhv)
        # reshape(-1) guarantees a (batch,) tensor even if the model
        # collapses the batch axis for a single-sample batch.
        y_pred = torch.sigmoid(outputs).reshape(-1)
        loss = loss_fn(y_pred, b_labels)
        total_train_loss += loss.item()
        # Back-propagate directly; casting the loss to a CPU FloatTensor
        # first only added a device round trip.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    training_time = format_time(time.time() - t0)
    print("  Average training loss: {0:.8f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch,
    # measure our performance on our validation set.
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()
    total_eval_loss = 0.0
    all_preds = []
    all_labels = []
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_big5 = batch[0].to(device)
        b_bhv = batch[1].to(device)
        b_labels = batch[2].float().to(device)
        with torch.no_grad():
            outputs = model(b_big5, b_bhv)
            y_pred = torch.sigmoid(outputs).reshape(-1)
            # The loss must be computed on probabilities; using the rounded
            # 0/1 predictions makes BCE degenerate (zero or saturated).
            loss = loss_fn(y_pred, b_labels)
        # Accumulate a Python float, not a tensor.
        total_eval_loss += loss.item()
        all_preds.append(torch.round(y_pred).cpu().numpy())
        all_labels.append(b_labels.cpu().numpy())

    if all_preds:
        # Score the whole validation set at once: averaging per-batch
        # scores is biased when the last batch is smaller than the others.
        pred_flat = np.concatenate(all_preds)
        labels_flat = np.concatenate(all_labels)
        eval_micro = precision_recall_fscore_support(
            labels_flat, pred_flat, average='micro', zero_division=0)
        eval_macro = precision_recall_fscore_support(
            labels_flat, pred_flat, average='macro', zero_division=0)
        eval_acc = accuracy_score(labels_flat, pred_flat)
        avg_eval_loss = total_eval_loss / len(validation_dataloader)
        print(F'\n\tValidation micro: {eval_micro}')
        print(F'\n\tValidation macro: {eval_macro}')
        print(F'\n\tValidation accuracy: {eval_acc}')
        print("  Average validation loss: {0:.8f}".format(avg_eval_loss))
    else:
        # An empty validation loader used to end in a division by zero.
        eval_micro = eval_macro = eval_acc = avg_eval_loss = None
        print("  Validation set is empty; skipping validation metrics.")

    training_stats.append({
        'epoch': epoch_i + 1,
        'train_loss': avg_train_loss,
        'val_loss': avg_eval_loss,
        'val_accuracy': eval_acc,
        'val_micro': eval_micro,
        'val_macro': eval_macro,
        'training_time': training_time,
        'validation_time': format_time(time.time() - t0),
    })

torch.save(model.state_dict(), "RUNN_model_state_dict")
print("Training complete!")
print("Total {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


