In [None]:
!pip install transformers==3.3.0

## Required Packages

In [None]:
import sys
import gc
import os
import json

import torch
import warnings
import transformers
import torch.nn as nn
import time
import warnings
import nltk, string

import re
import requests

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from google.colab import drive
from transformers import BertTokenizer, BertModel
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import mean_squared_log_error

# Hook tqdm into pandas (enables the `progress_apply` family of methods).
tqdm.pandas()
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt') # if necessary...
# Show up to 255 characters per cell when a DataFrame is displayed.
pd.set_option('display.max_colwidth', 255)
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")
nltk.download('stopwords')
# Colab-only: mount Google Drive; the data and model paths used below live under /content/drive.
drive.mount('/content/drive')

In [3]:
def label_scaling(val):
    """Log-transform and min-max scale regression targets.

    Applies ``log(1 + val)`` and then fits a ``MinMaxScaler`` on the result.

    Args:
        val: Values greater than -1, either as an array-like of shape
            ``(n_samples, n_features)`` or as a 1-D sequence, which is treated
            as a single feature column.

    Returns:
        Tuple ``(scaler, scaled)``: the fitted scaler (to be handed to
        ``label_inverse_scaling``) and the 2-D array of scaled values.
    """
    # Imported locally: the notebook's import cell does not bring MinMaxScaler into scope.
    from sklearn.preprocessing import MinMaxScaler

    val = np.asarray(val, dtype=np.float64)
    if val.ndim == 1:
        # scikit-learn scalers only accept 2-D input.
        val = val.reshape(-1, 1)
    # log1p is the numerically safer spelling of log(val + 1).
    val = np.log1p(val)
    scaler = MinMaxScaler()
    val = scaler.fit_transform(val)
    return scaler, val


def label_inverse_scaling(scaler, val):
    """Map scaled values back to the original target scale.

    Args:
        scaler: Fitted scaler object exposing ``inverse_transform``.
        val: Scaled values accepted by ``scaler.inverse_transform``.

    Returns:
        ``exp(x) - 1`` applied to the values returned by the scaler.
    """
    unscaled = scaler.inverse_transform(val)
    return np.exp(unscaled) - 1


def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), sets ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Imported locally: `random` is not part of the notebook's import cell.
    import random

    # Hash randomisation is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every GPU, not just the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class AverageMeter:
    """Keeps the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero out the last value, the mean, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running mean."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count

In [4]:
# Project folder on the mounted Google Drive (the trailing slash is relied upon below).
path = '/content/drive/MyDrive/twitter-popularity-prediction/'
df = pd.read_csv(path + "data.csv")
# Missing-value count per column, shown as the cell output.
df.isna().sum()

user_id              0
user_name            0
location          3422
description       1600
follower_count       0
friends_count        0
verified             0
tweet_id             0
created_at           0
num_of_likes         0
retweet_count        0
text                 0
user_location     3422
dtype: int64

### Config

In [13]:
# --- Tokenizer / encoder -----------------------------------------------------
# Location the tokenizer is loaded from (presumably a local copy of bert-base-uncased).
pretrained_model_name = '/content/drive/MyDrive/pre_trained_model/bert-base-uncased/'#'bert-base-uncased'
do_lower_case = True
max_len = 128          # token length every tweet is padded/truncated to
bert_hidden = 768      # width of one BERT hidden state
n_meta_features = 3    # number of numeric side features fed to the model

# --- Regularisation ----------------------------------------------------------
dropout = 0.3          # NOTE(review): not referenced by the visible model code, which hard-codes its own rates

# --- Optimisation ------------------------------------------------------------
train_batch_size=16
valid_batch_size=32
epochs = 5
# NOTE(review): the scheduler comment in run() mentioned 3e-5; 1e-3 is unusually high
# for fine-tuning BERT and the logged loss does not decrease - confirm the intended value.
learning_rate = 1e-3
adam_epsilon=1e-8
warmup_steps=0
LOSS = 'MSE'           # default loss name picked up by loss_fn

seed_everything()

## DataLoader

In [14]:
class TweetDataset:
    """Map-style dataset yielding BERT inputs, numeric side features and a target.

    Args:
        dataframe: Frame holding (at least) the ``meta_features`` columns; rows
            are addressed positionally, in the same order as ``tweet``.
        tweet: Indexable sequence of raw tweet texts.
        targets: Indexable sequence of regression targets aligned with ``tweet``.
        meta_features: Column names in ``dataframe`` used as numeric features.
    """

    def __init__(self, dataframe, tweet, targets,meta_features):
        self.df = dataframe
        self.tweet = tweet
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            pretrained_model_name, do_lower_case=do_lower_case
        )
        self.max_length = max_len
        self.targets = targets
        self.meta_features = meta_features

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        """Encode sample ``item`` and return it as a dict of tensors.

        Keys: ``ids``, ``mask``, ``token_type_ids`` (long, length ``max_length``),
        ``targets`` (float scalar) and ``meta`` (float, one value per meta feature).
        """
        # Collapse runs of whitespace (including newlines) into single spaces.
        tweet = " ".join(str(self.tweet[item]).split())

        # `padding`/`truncation` replace the deprecated `pad_to_max_length` and
        # `truncation_strategy` arguments; the resulting encoding is unchanged
        # (pad to max_length, truncate longest-first).
        inputs = self.tokenizer.encode_plus(
            tweet,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        meta = np.array(self.df.iloc[item][self.meta_features].values, dtype=np.float32)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float),
            'meta': torch.tensor(meta, dtype=torch.float)
        }


# Reload the data, this time discarding every row that has a missing field.
df = pd.read_csv(path + "data.csv")
df = df.dropna().reset_index(drop=True)
meta = ['follower_count', 'friends_count', 'retweet_count']

# Smoke test: build the dataset, then show the first raw row and its encoded form.
dset = TweetDataset(df, df.text.values, df.num_of_likes.values, meta)
print(df.iloc[0])
print(dset[0])

user_id                                                                                                                                         813286
user_name                                                                                                                                 Barack Obama
location                                                                                                                                Washington, DC
description                                                                                                          Dad, husband, President, citizen.
follower_count                                                                                                                               129803017
friends_count                                                                                                                                   590251
verified                                                                                      

### Model

In [15]:
class BertBaseUncasedNext(nn.Module) :
    """BERT-based regressor that fuses text features with numeric side features.

    The [CLS] vectors of the last two BERT hidden states are concatenated with
    a 250-dimensional embedding of the meta features and passed through a
    single linear layer producing one value per sample.
    """

    def __init__(self) :
        super().__init__()
        # NOTE(review): the checkpoint name is hard-coded here while the tokenizer is
        # loaded from `pretrained_model_name` - confirm both refer to the same model.
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.drop_out = nn.Dropout(0.1)
        # Small MLP lifting the raw numeric features to 250 dimensions.
        self.meta_features = nn.Sequential(
            nn.Linear(n_meta_features, 500),
            nn.BatchNorm1d(500),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(500, 250),
            nn.BatchNorm1d(250),
            nn.ReLU(),
            nn.Dropout(p=0.2),
        )

        # Two [CLS] vectors plus the 250-d meta embedding.
        self.l0 = nn.Linear(bert_hidden * 2 + 250, 1)
        torch.nn.init.normal_(self.l0.weight, std=0.02)

    def _get_cls_vec(self, vec):
        """Return the first-token ([CLS]) vector of each sequence: (batch, hidden)."""
        return vec[:, 0, :]

    def forward(self,ids,attention_mask,token_type_ids, meta):
        """Compute one regression output per sample.

        Args:
            ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            token_type_ids: Segment ids passed to BERT.
            meta: Float tensor of numeric side features, one row per sample.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        bert_outputs = self.bert(
            ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        # Index instead of tuple-unpacking: position 2 holds the hidden states
        # both for plain tuples and for dict-like ModelOutput objects, whereas
        # unpacking a ModelOutput would yield its keys.
        hidden_states = bert_outputs[2]
        vec1 = self._get_cls_vec(hidden_states[-1])
        vec2 = self._get_cls_vec(hidden_states[-2])
        meta_features = self.meta_features(meta)
        out = torch.cat([vec1, vec2, meta_features], dim=1)
        out = self.drop_out(out)
        logits = self.l0(out)
        return logits


In [19]:
def _log_cosh_loss(y_pred, y_true):
    """Mean log-cosh error, evaluated in an overflow-safe form."""
    import math

    abs_diff = (y_pred - y_true).abs()
    # log(cosh(x)) == |x| + log(1 + exp(-2|x|)) - log(2); avoids computing cosh directly.
    return torch.mean(abs_diff + nn.functional.softplus(-2.0 * abs_diff) - math.log(2.0))


def loss_fn(y_pred, y_true, loss_name=LOSS):
    """Compute the regression loss between predictions and targets.

    Args:
        y_pred: Predictions; targets are reshaped to ``(-1, 1)`` to match them.
        y_true: Targets with any shape that can be viewed as ``(-1, 1)``.
        loss_name: One of ``'MSE'``, ``'MAE'``, ``'Huber'``, ``'LogCosh'``, or a
            callable ``f(y_pred, y_true)`` returning the loss.

    Returns:
        Scalar loss tensor (mean over the batch for the named losses).

    Raises:
        ValueError: If ``loss_name`` is neither callable nor a known name.
    """
    y_true = y_true.view(-1, 1)

    if callable(loss_name):
        return loss_name(y_pred, y_true)

    # 'LogCosh' is implemented here because no LogCoshLoss class is defined in
    # the visible part of the notebook.
    loss_funcs = {
        'MSE': nn.MSELoss(reduction='mean'),
        'MAE': nn.L1Loss(reduction='mean'),
        'Huber': nn.SmoothL1Loss(reduction='mean'),
        'LogCosh': _log_cosh_loss,
    }
    try:
        loss_func = loss_funcs[loss_name]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown loss_name {loss_name!r}; expected one of {sorted(loss_funcs)} or a callable"
        ) from None
    return loss_func(y_pred, y_true)


def train_fn(data_loader, model, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        data_loader: Iterable of batches with keys ``ids``, ``mask``,
            ``token_type_ids``, ``targets`` and ``meta``.
        model: Module called as ``model(ids=..., attention_mask=...,
            token_type_ids=..., meta=...)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Unused; kept for call-site compatibility.

    Returns:
        Unweighted mean of the per-batch losses.
    """
    model.train()
    losses = AverageMeter()
    tk0 = tqdm(data_loader, total=len(data_loader))
    start = time.time()
    train_losses = []
    last_batch = len(data_loader) - 1

    for bi, d in enumerate(tk0):
        ids = d["ids"].to(device, dtype=torch.long)
        mask = d["mask"].to(device, dtype=torch.long)
        token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
        targets = d["targets"].to(device, dtype=torch.float)
        meta_features = d["meta"].to(device, dtype=torch.float)

        # Reset gradients accumulated by the previous step.
        model.zero_grad()

        outputs = model(
            ids=ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            meta=meta_features
        )

        loss = loss_fn(outputs, targets)
        batch_loss = loss.item()
        train_losses.append(batch_loss)

        # Predictions are deliberately not collected here: they were never
        # used, cost a device-to-host copy per step, and the former
        # squeeze()/extend() pair failed on a batch containing one sample.

        if (bi % 50 == 0 and bi != 0) or bi == last_batch:
            print(f'bi={bi},Train loss={batch_loss}, time={time.time()-start}')

        loss.backward()  # gradients of the loss w.r.t. all parameters
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        # Running mean weighted by batch size, shown on the progress bar.
        losses.update(batch_loss, ids.size(0))
        tk0.set_postfix(loss=losses.avg)

    return np.mean(train_losses)

def eval_fn(data_loader, model, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        data_loader: Iterable of batches with keys ``ids``, ``mask``,
            ``token_type_ids``, ``targets`` and ``meta``.
        model: Module called as ``model(ids=..., attention_mask=...,
            token_type_ids=..., meta=...)``.
        device: Device the batch tensors are moved to.
        n_examples: Unused; kept for call-site compatibility.

    Returns:
        Tuple ``(predictions, targets, mean_loss)``. Predictions are the model
        outputs clamped at zero and rounded; both lists are flat Python lists.
        ``mean_loss`` is the unweighted mean of the per-batch losses.
    """
    model.eval()
    val_losses = []
    fin_targets = []
    fin_outputs = []

    with torch.no_grad():
        for d in data_loader:
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            token_type_ids = d["token_type_ids"].to(device, dtype=torch.long)
            targets = d["targets"].to(device, dtype=torch.float)
            meta_features = d["meta"].to(device, dtype=torch.float)

            outputs = model(
                ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
                meta=meta_features
            )
            val_losses.append(loss_fn(outputs, targets).item())

            # Clamp negatives to zero and round to the nearest integer.
            preds = torch.round(torch.relu(outputs))
            # reshape(-1) always yields a 1-D tensor, so a batch holding a
            # single sample needs no special case (squeeze() would return a
            # 0-d tensor there).
            fin_targets.extend(targets.reshape(-1).cpu().tolist())
            fin_outputs.extend(preds.reshape(-1).cpu().tolist())

    return fin_outputs, fin_targets, np.mean(val_losses)

In [20]:
def run():
    """Train and validate the tweet-likes regressor end to end.

    Reads ``data.csv`` from ``path``, drops rows with missing values, holds out
    15% for validation, trains ``BertBaseUncasedNext`` for ``epochs`` epochs and
    prints the training loss, validation loss and validation mean squared log
    error after every epoch. All hyper-parameters come from the config cell.
    """
    dfx = pd.read_csv(f"{path}data.csv").dropna().reset_index(drop=True)
    df_train, df_valid = model_selection.train_test_split(
        dfx,
        test_size=0.15,
        random_state=46,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)
    print("train len - {} valid len - {}".format(len(df_train), len(df_valid)))
    df_train = df_train.sample(frac=1).reset_index(drop=True)
    meta = ['follower_count','friends_count', 'retweet_count']

    # NOTE(review): targets and meta features are fed to the model unscaled;
    # the logged MSE stays around 1e10 across epochs. Consider log/min-max
    # scaling (see label_scaling) - to be confirmed.
    train_dataset = TweetDataset(
        dataframe=df_train,
        tweet=df_train.text.values,
        targets=df_train.num_of_likes.values,
        meta_features=meta
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
        num_workers=4
    )

    valid_dataset = TweetDataset(
        dataframe=df_valid,
        tweet=df_valid.text.values,
        targets=df_valid.num_of_likes.values,
        meta_features=meta
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=valid_batch_size,
        num_workers=2
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertBaseUncasedNext()
    model.to(device)

    # Two parameter groups: biases and LayerNorm parameters are exempt from
    # weight decay, everything else is decayed.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # One scheduler step is taken per batch, so count batches (including the
    # final partial one) instead of dividing the number of rows by the batch
    # size; otherwise the learning rate reaches zero before training ends.
    num_train_steps = len(train_data_loader) * epochs

    optimizer = AdamW(optimizer_parameters, lr=learning_rate, eps=adam_epsilon)
    # Linear schedule: the learning rate rises linearly for `warmup_steps`
    # steps (none when warmup_steps == 0), then decays linearly from
    # `learning_rate` to 0 over the remaining training steps.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=num_train_steps
    )
    print("STARTING TRAINING for ...\n")

    # Lower is better for mean squared log error.
    best_metric = float("inf")

    for epoch in range(epochs):
        print(f'Epoch {epoch + 1}/{epochs}')
        print('-' * 10)

        train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler, len(df_train))
        print(f'Train metric {train_loss}')

        fin_pred, fin_val, val_loss = eval_fn(valid_data_loader, model, device, len(df_valid))

        val_metric = mean_squared_log_error(fin_val, fin_pred)

        print(f'Val loss {val_loss} Val metric {val_metric}')
        if val_metric < best_metric:
            best_metric = val_metric
            #torch.save(model.state_dict(), f"{args.model_path}{args.model_specification}.bin")

    del model, optimizer, scheduler, train_data_loader, valid_data_loader, train_dataset, valid_dataset
    # CUDA housekeeping only makes sense (and only works) when a GPU is present.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
    print("##################################### Task End ############################################")
    print(gc.collect())

run()

train len - 4930 valid len - 870
STARTING TRAINING for ...

Epoch 1/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=43015372800.0, time=17.969333171844482
bi=100,Train loss=35389915136.0, time=35.667757511138916
bi=150,Train loss=18752972800.0, time=53.56834697723389
bi=200,Train loss=48415621120.0, time=71.69771480560303
bi=250,Train loss=21224022016.0, time=89.96910786628723
bi=300,Train loss=9937253376.0, time=108.37483143806458
bi=308,Train loss=11622298624.0, time=111.23375153541565

Train metric 62990162914.17476
Val loss 45107832173.71429 Val metric 25.090447253264625
Epoch 2/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=47243001856.0, time=19.12983512878418
bi=100,Train loss=28619872256.0, time=37.913737058639526
bi=150,Train loss=40816582656.0, time=56.841330766677856
bi=200,Train loss=227275653120.0, time=75.75071573257446
bi=250,Train loss=47364546560.0, time=94.72855877876282
bi=300,Train loss=64976257024.0, time=113.81535863876343
bi=308,Train loss=52349403136.0, time=116.73825812339783

Train metric 63041314935.30097
Val loss 45094971684.57143 Val metric 23.944905226671096
Epoch 3/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=44745478144.0, time=19.51006245613098
bi=100,Train loss=31886157824.0, time=38.577704668045044
bi=150,Train loss=72624390144.0, time=57.8161563873291
bi=200,Train loss=59376812032.0, time=77.05199670791626
bi=250,Train loss=95470878720.0, time=96.2939555644989
bi=300,Train loss=6433909760.0, time=115.50739169120789
bi=308,Train loss=54960697344.0, time=118.48761749267578

Train metric 63027680252.68608
Val loss 45069571364.57143 Val metric 22.72761090875658
Epoch 4/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=26775556096.0, time=19.666971445083618
bi=100,Train loss=139892867072.0, time=38.99474358558655
bi=150,Train loss=211961479168.0, time=58.257917404174805
bi=200,Train loss=130016731136.0, time=77.51727509498596
bi=250,Train loss=76339961856.0, time=96.85361361503601
bi=300,Train loss=48550825984.0, time=116.16423678398132
bi=308,Train loss=77727080448.0, time=119.12635636329651

Train metric 63073165854.653725
Val loss 45059963465.14286 Val metric 22.041077717727376
Epoch 5/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=17472126976.0, time=19.650765419006348
bi=100,Train loss=46591688704.0, time=38.90463447570801
bi=150,Train loss=48023941120.0, time=58.29055643081665
bi=200,Train loss=34011516928.0, time=77.56108069419861
bi=250,Train loss=153553403904.0, time=96.88764595985413
bi=300,Train loss=33805602816.0, time=116.25798535346985
bi=308,Train loss=85146083328.0, time=119.22964429855347

Train metric 63082345835.70226
Val loss 45060198107.42857 Val metric 21.85124050994513
##################################### Task End ############################################
3792
