In [None]:
!pip install transformers==3.3.0

In [None]:
import sys
import gc
import os
import json

import torch
import warnings
import transformers
import torch.nn as nn
import time
import warnings
import nltk, string

import re
import requests

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from google.colab import drive
from transformers import BertTokenizer, BertModel
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import mean_squared_log_error

tqdm.pandas()
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt') # if necessary...
pd.set_option('display.max_colwidth', 255)
warnings.filterwarnings("ignore")
nltk.download('stopwords')
drive.mount('/content/drive')

In [3]:
def label_scaling(val):
    val = np.log(val + 1)
    scaler = MinMaxScaler()
    scaler.fit(val)
    val = scaler.transform(val)
    return scaler, val


def label_inverse_scaling(scaler, val):
    val = scaler.inverse_transform(val)
    val = np.exp(val) - 1
    return val


def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)

class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [4]:
path = '/content/drive/MyDrive/twitter-popularity-prediction/'
df = pd.read_csv(f"{path}data.csv")
df.head(1)

Unnamed: 0,user_id,user_name,location,description,follower_count,friends_count,verified,tweet_id,created_at,num_of_likes,retweet_count,text,user_location
0,813286,Barack Obama,"Washington, DC","Dad, husband, President, citizen.",129803017,590251,True,896523232098078720,Sun Aug 13 00:06:09 +0000 2017,4232344,1515265,"""No one is born hating another person because of the color of his skin or his background or his religion..."" https://t.co/InZ58zkoAm","Washington, DC"


In [5]:
pretrained_model_name = 'bert-base-uncased'
do_lower_case = True
max_len = 128
bert_hidden = 768
dropout = 0.3
train_batch_size=16
valid_batch_size=32
epochs = 5
learning_rate = 1e-3
adam_epsilon=1e-8
n_meta_features = 3
bert_hidden=768
warmup_steps=0
LOSS = 'MSE'
seed_everything()

In [6]:
class TweetDataset:
    def __init__(self, tweet, targets):
        self.tweet = tweet
        self.tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_model_name,do_lower_case = do_lower_case)
        self.max_length = max_len
        self.targets = targets

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        
        tweet = str(self.tweet[item])
        tweet = " ".join(tweet.split())

        inputs = self.tokenizer.encode_plus(
            tweet,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation_strategy="longest_first",
            pad_to_max_length=True,
            truncation=True
        )
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float)
        }


df = pd.read_csv(f"{path}data.csv").dropna().reset_index(drop = True)
dset = TweetDataset(
        tweet=df.text.values,
        targets=df.num_of_likes.values
        )
print(df.iloc[0])
print(dset[10])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


user_id                                                                                                                                         813286
user_name                                                                                                                                 Barack Obama
location                                                                                                                                Washington, DC
description                                                                                                          Dad, husband, President, citizen.
follower_count                                                                                                                               129803017
friends_count                                                                                                                                   590251
verified                                                                                     

In [7]:
class BertBaseUncased(nn.Module) :
    def __init__(self) : 
      super(BertBaseUncased,self).__init__() 
      self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True) 
      self.drop_out = nn.Dropout(0.1) 
      self.l0 =  nn.Linear(bert_hidden * 2, 1)
      torch.nn.init.normal_(self.l0.weight, std=0.02)
        
    def _get_cls_vec(self, vec):
      return vec[:,0,:].view(-1, bert_hidden)
    def forward(self,ids,attention_mask,token_type_ids):
      _, _, hidden_states = self.bert(
          ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids
      )
      vec1 = self._get_cls_vec(hidden_states[-1])
      vec2 = self._get_cls_vec(hidden_states[-2])

      out = torch.cat([vec1, vec2], dim=1)
      out = self.drop_out(out)
      logits = self.l0(out)
      return logits


In [8]:
def loss_fn(y_pred, y_true, loss_name=LOSS):
  if type(loss_name) == str:
    if loss_name == 'MSE':
        loss_func = nn.MSELoss(reduction='mean')
    elif loss_name == 'MAE':
        loss_func = nn.L1Loss(reduction='mean')
    elif loss_name == 'Huber':
        loss_func = nn.SmoothL1Loss(reduction='mean')
    elif loss_name == 'LogCosh':
        loss_func = LogCoshLoss()
  return loss_func(y_pred, y_true.view(-1,1))


def train_fn(data_loader, model, optimizer, device, scheduler, n_examples):
  model.train()
  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))
  start = time.time()
  train_losses = []
  fin_targets = []
  fin_outputs = []
  for bi, d in enumerate(tk0):
      ids = d["ids"]
      mask = d["mask"]
      token_type_ids = d["token_type_ids"]
      targets = d["targets"]
      ids = ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.float)
      # Reset gradients
      model.zero_grad()

      outputs = model(
          ids=ids,
          attention_mask=mask,
          token_type_ids = token_type_ids 
      )

      loss = loss_fn(outputs, targets)
      train_losses.append(loss.item())

      outputs = torch.round(nn.ReLU()(outputs)).squeeze()
      #print("outputs = ", outputs)
      targets = targets.squeeze()
      #print("targets = ", targets)
      outputs = outputs.cpu().detach().numpy().tolist()
      targets = targets.cpu().detach().numpy().tolist()

      end = time.time()
      if (bi % 50 == 0 and bi != 0) or (bi == len(data_loader) - 1):
        print(f'bi={bi},Train loss={loss.item()}, time={end-start}')
      
      loss.backward() # Calculate gradients based on loss
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step() # Adjust weights based on calculated gradients
      scheduler.step() # Update scheduler
      losses.update(loss.item(), ids.size(0))
      tk0.set_postfix(loss = losses.avg)
      fin_targets.extend(targets) 
      fin_outputs.extend(outputs)

  return np.mean(train_losses)

def eval_fn(data_loader, model, device, n_examples):
  model.eval()
  start = time.time()
  losses = AverageMeter()
  val_losses = []
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
      #tk0 = tqdm(data_loader, total=len(data_loader))
      for bi, d in enumerate(data_loader):
          ids = d["ids"]
          mask = d["mask"]
          token_type_ids = d["token_type_ids"]
          targets = d["targets"]
          ids = ids.to(device, dtype=torch.long)
          mask = mask.to(device, dtype=torch.long)
          token_type_ids = token_type_ids.to(device, dtype=torch.long)
          targets = targets.to(device, dtype=torch.float)

          outputs = model(
              ids=ids,
              attention_mask=mask,
              token_type_ids = token_type_ids 
          )
          loss = loss_fn(outputs, targets)
          val_losses.append(loss.item())

          targets = targets.squeeze()
          outputs = torch.round(nn.ReLU()(outputs)).squeeze()
          if isinstance(targets.cpu().detach().numpy().tolist(), list) == False:
              fin_targets.append(targets.cpu().detach().numpy().tolist())
              fin_outputs.append(outputs.cpu().detach().numpy().tolist())
          else:
              fin_targets.extend(targets.cpu().detach().numpy().tolist())
              fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
  return fin_outputs,fin_targets, np.mean(val_losses)

In [9]:
def run():
  dfx = pd.read_csv(f"{path}data.csv").dropna().reset_index(drop=True)
  df_train, df_valid = model_selection.train_test_split(
      dfx, 
      test_size=0.15, 
      random_state=46, 
    )

  print("train len - {} valid len - {}".format(len(df_train), len(df_valid)))
  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  df_train = df_train.sample(frac=1).reset_index(drop=True)

  train_dataset = TweetDataset(
      tweet=df_train.text.values,
      targets=df_train.num_of_likes.values
  )

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=train_batch_size,
      shuffle=True,
      num_workers=4
  )

  valid_dataset = TweetDataset(
      tweet=df_valid.text.values,
      targets=df_valid.num_of_likes.values
  )

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=valid_batch_size,
      num_workers=2
  )
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = BertBaseUncased()
  model.to(device)
  

  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  # Define two sets of parameters: those with weight decay, and those without
  optimizer_parameters = [
      {
          "params": [
              p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.001,
      },
      {
          "params": [
              p for n, p in param_optimizer if any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.0,
      },
  ]

  num_train_steps = int(len(df_train) / train_batch_size * epochs)

  optimizer = AdamW(optimizer_parameters, lr=learning_rate, eps=adam_epsilon)
  '''
  Create a scheduler to set the learning rate at each training step
  "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
  Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
  '''
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warmup_steps,
      num_training_steps=num_train_steps
  )
  print("STARTING TRAINING for ...\n")
  #history = defaultdict(list)
  loss_history_epoch = []
  metric_history_epoch = []

  best_accuracy = 999.9

  for epoch in range(epochs):
      print(f'Epoch {epoch + 1}/{epochs}')
      print('-' * 10)
      

      train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler, len(df_train))
      print(f'Train metric {train_loss}')

      fin_pred, fin_val, val_loss = eval_fn(valid_data_loader, model, device, len(df_valid))

      val_metric = mean_squared_log_error(fin_val, fin_pred)

      print(f'Val loss {val_loss} Val metric {val_metric}')
      if val_metric < best_accuracy:
        best_accuracy = val_metric
        #torch.save(model.state_dict(), f"{args.model_path}{args.model_specification}.bin")

  
  del model, optimizer, scheduler, train_data_loader, valid_data_loader, train_dataset, valid_dataset
  torch.cuda.empty_cache()
  torch.cuda.synchronize()
  print("##################################### Task End ############################################")
  print(gc.collect())

run()

train len - 4930 valid len - 870


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


STARTING TRAINING for ...

Epoch 1/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=17803507712.0, time=23.039136171340942
bi=100,Train loss=18747060224.0, time=45.53956699371338
bi=150,Train loss=170775543808.0, time=68.33886003494263
bi=200,Train loss=25684082688.0, time=91.13026571273804
bi=250,Train loss=23146921984.0, time=114.1676332950592
bi=300,Train loss=109330833408.0, time=137.22472071647644
bi=308,Train loss=1595272192.0, time=140.80850481987

Train metric 62957456354.17476
Val loss 45064505344.0 Val metric 21.52588441907388
Epoch 2/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=113806049280.0, time=23.762489080429077
bi=100,Train loss=22267586560.0, time=47.40310835838318
bi=150,Train loss=7630175232.0, time=71.2733006477356
bi=200,Train loss=25463566336.0, time=95.18138384819031
bi=250,Train loss=490258694144.0, time=119.29837679862976
bi=300,Train loss=22731616256.0, time=143.5897560119629
bi=308,Train loss=10565222400.0, time=147.34916257858276

Train metric 62776849967.223305
Val loss 44847659958.85714 Val metric 14.7375508827994
Epoch 3/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=36096647168.0, time=24.74106740951538
bi=100,Train loss=54271438848.0, time=48.992743730545044
bi=150,Train loss=25059174400.0, time=73.21994709968567
bi=200,Train loss=175612280832.0, time=97.69950890541077
bi=250,Train loss=352934690816.0, time=122.18348050117493
bi=300,Train loss=17575735296.0, time=146.59882307052612
bi=308,Train loss=31763118080.0, time=150.35869479179382

Train metric 62614506090.04531
Val loss 44652119917.71429 Val metric 11.921838838845122
Epoch 4/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=64140120064.0, time=25.04524517059326
bi=100,Train loss=103928225792.0, time=49.467461824417114
bi=150,Train loss=84057751552.0, time=73.85966539382935
bi=200,Train loss=126246412288.0, time=98.32161808013916
bi=250,Train loss=8826144768.0, time=122.90908336639404
bi=300,Train loss=13000937472.0, time=147.3056993484497
bi=308,Train loss=3367609856.0, time=151.06805515289307

Train metric 62361751001.47573
Val loss 44522143890.28571 Val metric 10.666399784665185
Epoch 5/5
----------


HBox(children=(FloatProgress(value=0.0, max=309.0), HTML(value='')))

bi=50,Train loss=47313842176.0, time=24.833290100097656
bi=100,Train loss=30012497920.0, time=49.22404193878174
bi=150,Train loss=44455387136.0, time=73.7003607749939
bi=200,Train loss=44933660672.0, time=98.11122536659241
bi=250,Train loss=92620570624.0, time=122.56686091423035
bi=300,Train loss=62402117632.0, time=147.18062114715576
bi=308,Train loss=8721905664.0, time=150.95118474960327

Train metric 62283160239.63754
Val loss 44477619785.14286 Val metric 10.305915213679567
##################################### Task End ############################################
442
