In [1]:
!pip install transformers==3.3.0

Collecting transformers==3.3.0
  Downloading transformers-3.3.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 4.3 MB/s 
Collecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 34.6 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 54.5 MB/s 
Collecting tokenizers==0.8.1.rc2
  Downloading tokenizers-0.8.1rc2-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 18.6 MB/s 
Installing collected packages: tokenizers, sentencepiece, sacremoses, transformers
Successfully installed sacremoses-0.0.45 sentencepiece-0.1.96 tokenizers-0.8.1rc2 transformers-3.3.0


In [20]:
import sys
import os
import json
from google.colab import drive
import pandas as pd
import numpy as np
import re
import requests
from tqdm.notebook import tqdm
import torch
import warnings
import transformers
import torch.nn as nn
import time
from transformers import BertTokenizer, BertModel
from sklearn import model_selection
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

tqdm.pandas()
import warnings
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download('punkt') # if necessary...
pd.set_option('display.max_colwidth', 255)
warnings.filterwarnings("ignore")
nltk.download('stopwords')
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
def label_scaling(val):
    val = np.log(val + 1)
    scaler = MinMaxScaler()
    scaler.fit(val)
    val = scaler.transform(val)
    return scaler, val


def label_inverse_scaling(scaler, val):
    val = scaler.inverse_transform(val)
    val = np.exp(val) - 1
    return val


def seed_everything(seed=42):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    torch.cuda.manual_seed(seed)

class AverageMeter:
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [22]:
path = '/content/drive/MyDrive/twitter-popularity-prediction/'
df = pd.read_csv(f"{path}data.csv")
df.head(1)

Unnamed: 0,user_id,user_name,location,description,follower_count,friends_count,verified,tweet_id,created_at,num_of_likes,retweet_count,text,user_location
0,813286,Barack Obama,"Washington, DC","Dad, husband, President, citizen.",129803017,590251,True,896523232098078720,Sun Aug 13 00:06:09 +0000 2017,4232344,1515265,"""No one is born hating another person because of the color of his skin or his background or his religion..."" https://t.co/InZ58zkoAm","Washington, DC"


In [33]:
df.num_of_likes.values

array([4232344, 3875332, 2790177, ...,  241649,  241647,  239607])

In [23]:
pretrained_model_name = '/content/drive/MyDrive/pre_trained_model/bert-base-uncased/'#'bert-base-uncased'
do_lower_case = True
max_len = 128
bert_hidden = 768
dropout = 0.3
train_batch_size=16
valid_batch_size=32
epochs = 1
learning_rate = 1e-3
adam_epsilon=1e-8
warmup_steps=0.01
seed_everything()

In [35]:
class TweetDataset:
    def __init__(self, tweet, targets):
        self.tweet = tweet
        self.tokenizer = transformers.BertTokenizer.from_pretrained(pretrained_model_name,do_lower_case = do_lower_case)
        self.max_length = max_len
        self.targets = targets

    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        
        tweet = str(self.tweet[item])
        tweet = " ".join(tweet.split())

        inputs = self.tokenizer.encode_plus(
            tweet,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation_strategy="longest_first",
            pad_to_max_length=True,
            truncation=True
        )
        
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[item], dtype=torch.float)
        }


df = pd.read_csv(f"{path}data.csv").dropna().reset_index(drop = True)
dset = TweetDataset(
        tweet=df.text.values,
        targets=df.num_of_likes.values
        )
print(df.iloc[0])
print(dset[10])

user_id                                                                                                                                         813286
user_name                                                                                                                                 Barack Obama
location                                                                                                                                Washington, DC
description                                                                                                          Dad, husband, President, citizen.
follower_count                                                                                                                               129803017
friends_count                                                                                                                                   590251
verified                                                                                      

In [36]:
class BertBaseUncased(nn.Module) :
    def __init__(self) : 
      super(BertBaseUncased,self).__init__() 
      self.bert = transformers.BertModel.from_pretrained('/content/drive/MyDrive/pre_trained_model/bert-base-uncased/', output_hidden_states=True) 
      self.drop_out = nn.Dropout(dropout) 
      self.l0 =  nn.Linear(bert_hidden * 2, 1)
      torch.nn.init.normal_(self.l0.weight, std=0.02)

    def forward(self,ids,attention_mask,token_type_ids):
      out = self.bert(
          ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids
      )
      print("out = ",out[-1])
      out = torch.cat((out[-1], out[-2]), dim=-1)
      #out = self.drop_out(out)
      out = out[:,0,:]
      logits = self.l0(out)
      return logits

class BertBaseUncasedNext(nn.Module) :
    def __init__(self) : 
      super(BertBaseUncasedNext,self).__init__() 
      self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True) 
      self.drop_out = nn.Dropout(0.1) 
      self.l0 =  nn.Linear(768 * 2, 1)
      torch.nn.init.normal_(self.l0.weight, std=0.02)
        
    def _get_cls_vec(self, vec):
      return vec[:,0,:].view(-1, 768)
    def forward(self,ids,attention_mask,token_type_ids):
      _, _, hidden_states = self.bert(
          ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids
      )
      vec1 = self._get_cls_vec(hidden_states[-1])
      vec2 = self._get_cls_vec(hidden_states[-2])

      out = torch.cat([vec1, vec2], dim=1)
      #out = self.drop_out(out)
      logits = self.l0(out)
      return logits


In [47]:
def loss_fn(y_pred, y_true):
  #loss_func = nn.L1Loss(reduction='mean') #MAE
  loss_func = nn.MSELoss(reduction='mean')
  return loss_func(y_pred, y_true.view(-1,1))

def train_fn(data_loader, model, optimizer, device, scheduler, n_examples):
  model.train()
  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))
  start = time.time()
  train_losses = []
  fin_targets = []
  fin_outputs = []
  for bi, d in enumerate(tk0):
      ids = d["ids"]
      mask = d["mask"]
      token_type_ids = d["token_type_ids"]
      targets = d["targets"]
      ids = ids.to(device, dtype=torch.long)
      mask = mask.to(device, dtype=torch.long)
      token_type_ids = token_type_ids.to(device, dtype=torch.long)
      targets = targets.to(device, dtype=torch.float)
      # Reset gradients
      model.zero_grad()

      outputs = model(
          ids=ids,
          attention_mask=mask,
          token_type_ids = token_type_ids 
      )

      loss = loss_fn(outputs, targets)
      train_losses.append(loss.item())

      outputs = torch.round(nn.ReLU()(outputs)).squeeze()
      print("outputs = ", outputs)
      targets = targets.squeeze()
      print("targets = ", targets)
      outputs = outputs.cpu().detach().numpy().tolist()
      targets = targets.cpu().detach().numpy().tolist()

      end = time.time()
      if (bi % 2 == 0 and bi != 0) or (bi == len(data_loader) - 1):
        print(f'bi={bi},Train loss={loss.item()}, time={end-start}')
      
      loss.backward() # Calculate gradients based on loss
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step() # Adjust weights based on calculated gradients
      scheduler.step() # Update scheduler
      losses.update(loss.item(), ids.size(0))
      tk0.set_postfix(loss = losses.avg)
      fin_targets.extend(targets) 
      fin_outputs.extend(outputs)

  return np.mean(train_losses)

def eval_fn(data_loader, model, device, n_examples):
  model.eval()
  start = time.time()
  losses = AverageMeter()
  val_losses = []
  fin_targets = []
  fin_outputs = []
  with torch.no_grad():
      #tk0 = tqdm(data_loader, total=len(data_loader))
      for bi, d in enumerate(data_loader):
          ids = d["ids"]
          mask = d["mask"]
          token_type_ids = d["token_type_ids"]
          targets = d["targets"]
          ids = ids.to(device, dtype=torch.long)
          mask = mask.to(device, dtype=torch.long)
          token_type_ids = token_type_ids.to(device, dtype=torch.long)
          targets = targets.to(device, dtype=torch.float)

          outputs = model(
              ids=ids,
              attention_mask=mask,
              token_type_ids = token_type_ids 
          )
          loss = loss_fn(outputs, targets)
          val_losses.append(loss.item())

          targets = targets.squeeze()
          outputs = torch.round(nn.ReLU()(outputs)).squeeze()
          if isinstance(targets.cpu().detach().numpy().tolist(), list) == False:
              fin_targets.append(targets.cpu().detach().numpy().tolist())
              fin_outputs.append(outputs.cpu().detach().numpy().tolist())
          else:
              fin_targets.extend(targets.cpu().detach().numpy().tolist())
              fin_outputs.extend(outputs.cpu().detach().numpy().tolist())
  return fin_outputs,fin_targets, np.mean(val_losses)

In [48]:
import gc
from sklearn.metrics import mean_squared_log_error

def run():
  dfx = pd.read_csv(f"{path}data.csv").dropna().reset_index(drop=True)
  dfx = dfx[:100]
  df_train, df_valid = model_selection.train_test_split(
      dfx, 
      test_size=0.15, 
      random_state=46, 
    )

  print("train len - {} valid len - {}".format(len(df_train), len(df_valid)))
  df_train = df_train.reset_index(drop=True)
  df_valid = df_valid.reset_index(drop=True)

  df_train = df_train.sample(frac=1).reset_index(drop=True)

  train_dataset = TweetDataset(
      tweet=df_train.text.values,
      targets=df_train.num_of_likes.values
  )

  train_data_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=train_batch_size,
      shuffle=True,
      num_workers=4
  )

  valid_dataset = TweetDataset(
      tweet=df_valid.text.values,
      targets=df_valid.num_of_likes.values
  )

  valid_data_loader = torch.utils.data.DataLoader(
      valid_dataset,
      batch_size=valid_batch_size,
      num_workers=2
  )
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model = BertBaseUncasedNext()#BertBaseUncased()
  model.to(device)
  

  param_optimizer = list(model.named_parameters())
  no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
  # Define two sets of parameters: those with weight decay, and those without
  optimizer_parameters = [
      {
          "params": [
              p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.001,
      },
      {
          "params": [
              p for n, p in param_optimizer if any(nd in n for nd in no_decay)
          ],
          "weight_decay": 0.0,
      },
  ]

  num_train_steps = int(len(df_train) / train_batch_size * epochs)

  optimizer = AdamW(optimizer_parameters, lr=learning_rate, eps=adam_epsilon)
  '''
  Create a scheduler to set the learning rate at each training step
  "Create a schedule with a learning rate that decreases linearly after linearly increasing during a warmup period." (https://pytorch.org/docs/stable/optim.html)
  Since num_warmup_steps = 0, the learning rate starts at 3e-5, and then linearly decreases at each training step
  '''
  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=warmup_steps,
      num_training_steps=num_train_steps
  )
  #es = utils.EarlyStopping(patience=15, mode="max")
  print("STARTING TRAINING for ...\n")
  #history = defaultdict(list)
  loss_history_epoch = []
  metric_history_epoch = []

  best_accuracy = 999.9

  for epoch in range(epochs):
      print(f'Epoch {epoch + 1}/{epochs}')
      print('-' * 10)
      

      train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler, len(df_train))
      print(f'Train metric {train_loss}')

      fin_pred, fin_val, val_loss = eval_fn(valid_data_loader, model, device, len(df_valid))

      val_metric = mean_squared_log_error(fin_val, fin_pred)

      print(f'Val loss {val_loss} Val metric {val_metric}')
      if val_metric < best_accuracy:
        best_accuracy = val_metric
        #torch.save(model.state_dict(), f"{args.model_path}{args.model_specification}.bin")

  
  del model, optimizer, scheduler, train_data_loader, valid_data_loader, train_dataset, valid_dataset
  torch.cuda.empty_cache()
  torch.cuda.synchronize()
  print("##################################### Task End ############################################")
  print(gc.collect())

run()

train len - 85 valid len - 15
STARTING TRAINING for ...

Epoch 1/1
----------


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))

outputs =  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
targets =  tensor([ 654288.,  526896.,  712295.,  728360.,  497719., 1016263.,  535184.,
         637321.,  479570.,  628259.,  667864.,  506084., 1498500., 1339922.,
         461050., 1860006.], device='cuda:0')
outputs =  tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
targets =  tensor([ 667129.,  548434., 1903138., 1378404., 1342905., 1818139., 1324427.,
         610281.,  928345.,  927893., 3875332., 1882187.,  622518., 1523207.,
         829631.,  699043.], device='cuda:0')
outputs =  tensor([6., 6., 6., 6., 6., 6., 6., 6., 6., 4., 6., 6., 6., 6., 6., 5.],
       device='cuda:0', grad_fn=<SqueezeBackward0>)
targets =  tensor([ 546957.,  495856.,  517509.,  996676.,  609899., 1229216., 4232344.,
         495958.,  754652., 1769516.,  493081.,  539133.,  546773.,  725104.,

In [None]:


def main():

    t = Timer()
    with t.timer(f'fix seed RANDOM_STATE:{RANDOM_STATE}'):
        seed_everything(RANDOM_STATE)

    with t.timer(f'read label'):
        y_train = pd.read_csv(f'{INPUT_DIR}/train.solution', header=None).T.values[0].reshape(-1, 1)

    if LABEL_LOG_SCALING is True:
        with t.timer(f'label log scaling (log->mms[0, 1]'):
            scaler, y_train = label_scaling(y_train)

    with t.timer(f'read features'):
        unique_num_dic = {}
        feature_index = {}

        X_train = pd.DataFrame()
        X_valid = pd.DataFrame()
        X_test = pd.DataFrame()
        fidx = 0
        for feat in dense_features:
            logging.info(f'[dense][{feat}] read feature ...')
            feature_index[feat] = fidx
            fidx += 1
            X_train = pd.concat([
                X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
            ], axis=1)
            X_valid = pd.concat([
                X_valid, pd.read_feather(f'{FEATURE_DIR}/{feat}_valid.feather')
            ], axis=1)
            X_test = pd.concat([
                X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
            ], axis=1)
        for feat in sparse_features:
            logging.info(f'[sparse][{feat}] read feature ...')
            feature_index[feat] = fidx
            fidx += 1
            X_train = pd.concat([
                X_train, pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
            ], axis=1)
            X_valid = pd.concat([
                X_valid, pd.read_feather(f'{FEATURE_DIR}/{feat}_valid.feather')
            ], axis=1)
            X_test = pd.concat([
                X_test, pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
            ], axis=1)
            unique_num = pd.concat([
                X_train[feat], X_valid[feat], X_test[feat]
            ]).nunique()
            unique_num_dic[feat] = unique_num
        for feat in varlen_sparse_features:
            logging.info(f'[varlen sparse][{feat}] read feature ...')
            feature_index[feat] = (fidx, fidx + VARLEN_MAX_LEN)
            fidx += VARLEN_MAX_LEN

            train_feat = pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather').values
            varlen_list = [i[0] for i in train_feat]
            varlen_list = pad_sequences(varlen_list, maxlen=VARLEN_MAX_LEN, padding='post', )
            X_train = pd.concat([
                X_train, pd.DataFrame(varlen_list)
            ], axis=1)

            valid_feat = pd.read_feather(f'{FEATURE_DIR}/{feat}_valid.feather').values
            varlen_list = [i[0] for i in valid_feat]
            varlen_list = pad_sequences(varlen_list, maxlen=VARLEN_MAX_LEN, padding='post', )
            X_valid = pd.concat([
                X_valid, pd.DataFrame(varlen_list)
            ], axis=1)

            test_feat = pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather').values
            varlen_list = [i[0] for i in test_feat]
            varlen_list = pad_sequences(varlen_list, maxlen=VARLEN_MAX_LEN, padding='post', )
            X_test = pd.concat([
                X_test, pd.DataFrame(varlen_list)
            ], axis=1)

            tmp = []
            for i in [i[0] for i in train_feat] + [i[0] for i in valid_feat] + [i[0] for i in test_feat]:
                tmp.extend(i)
            unique_num = len(set(tmp))
            unique_num_dic[feat] = unique_num
        X_train = X_train.fillna(0.0)
        X_valid = X_valid.fillna(0.0)
        X_test = X_test.fillna(0.0)

    logging.info('SPARSE FEATURE UNIQUE_NUM')
    print(unique_num_dic)

    with t.timer(f'READ folds'):
        folds = pd.read_csv(f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv')

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    fold_best_scores = {}  # fold_idx:best_cv_score
    for fold_idx in range(FOLD_NUM):

        trn_idx = folds[folds.kfold != fold_idx].index.tolist()
        val_idx = folds[folds.kfold == fold_idx].index.tolist()

        x_trn = X_train.iloc[trn_idx]
        y_trn = y_train[trn_idx]
        x_val = X_train.iloc[val_idx]
        y_val = y_train[val_idx]

        train_loader = SimpleDataLoader(
            [torch.from_numpy(x_trn.values), torch.from_numpy(y_trn)],
            batch_size=BATCH_SIZE,
            shuffle=True
        )

        model = load_model(
            feature_index=feature_index,
            unique_num_dic=unique_num_dic,
        )
        loss_func = set_loss(loss_name=LOSS)
        optim = torch.optim.Adam(model.parameters(), lr=LR)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optim, T_max=EPOCH_NUM)

        loss_history = []

        steps_per_epoch = (len(x_trn) - 1) // BATCH_SIZE + 1
        best_score = 999.9
        for epoch in range(EPOCH_NUM):

            loss_history_epoch = []
            metric_history_epoch = []

            logging.info(f'[{DEVICE}][FOLD:{fold_idx}] EPOCH - {epoch} / {EPOCH_NUM}')
            model = model.train()
            for bi, (bx, by) in tqdm(enumerate(train_loader), total=steps_per_epoch):

                optim.zero_grad()

                bx = bx.to(DEVICE).float()
                by = by.to(DEVICE).float().squeeze()
                y_pred = model(bx).squeeze()

                loss = 0.0
                for loss_f in loss_func:
                    loss += loss_f(y_pred, by)
                loss = loss + model.reg_loss.item()

                loss.backward()
                optim.step()

                y_pred_np = y_pred.cpu().detach().numpy().reshape(-1, 1)
                y_np = by.cpu().detach().numpy().reshape(-1, 1)

                try:
                    if LABEL_LOG_SCALING is True:
                        y_pred_inv = label_inverse_scaling(scaler, y_pred_np)
                        y_inv = label_inverse_scaling(scaler, y_np)
                        mlse = mean_squared_log_error(y_inv, y_pred_inv)
                    else:
                        mlse = mean_squared_log_error(y_np, y_pred_np)
                    loss_history_epoch.append(loss.item())
                    metric_history_epoch.append(mlse)
                except:
                    continue

            scheduler.step()
            trn_loss_epoch = sum(loss_history_epoch) / len(loss_history_epoch)
            trn_metric_epoch = sum(metric_history_epoch) / len(metric_history_epoch)

            preds_val = model.predict(x_val, BATCH_SIZE)
            val_loss = 0.0
            for loss_f in loss_func:
                val_loss += loss_f(torch.from_numpy(preds_val.reshape(-1, 1)), torch.from_numpy(y_val)).item()

            try:
                if LABEL_LOG_SCALING is True:
                    preds_val_inv = label_inverse_scaling(scaler, preds_val.reshape(-1, 1))
                    y_val_inv = label_inverse_scaling(scaler, y_val)
                    val_metric = mean_squared_log_error(y_val_inv, preds_val_inv)
                else:
                    val_metric = mean_squared_log_error(y_val, preds_val)
            except:
                continue

            logging.info(f'Train - Loss: {trn_loss_epoch}, MSLE: {trn_metric_epoch}')
            logging.info(f'Valid - Loss: {val_loss}, MSLE: {val_metric}')
            loss_history.append([
                epoch, trn_loss_epoch, trn_metric_epoch, val_loss, val_metric
            ])

            if val_metric < best_score:
                best_score = val_metric
                weight_path = f'{SAVE_DIR}/model/train_weights_mlflow-{run_id}_fold{fold_idx}.h5'
                torch.save(model.state_dict(), weight_path)
                fold_best_scores[fold_idx] = (best_score, weight_path)
                mlflow.log_artifact(weight_path)

        history_path = f'{SAVE_DIR}/model/loss_history-{run_id}_fold{fold_idx}.csv'
        pd.DataFrame(loss_history, columns=['epoch', 'trn_loss', 'trn_metric', 'val_loss', 'val_metric']).to_csv(history_path)
        mlflow.log_artifact(history_path)

    cv = 0.0
    for fold_idx in range(FOLD_NUM):
        cv += fold_best_scores[fold_idx][0]
    cv /= FOLD_NUM

    preds_train_val = np.zeros(len(X_train))
    for fold_idx in range(FOLD_NUM):

        val_idx = folds[folds.kfold == fold_idx].index.tolist()
        x_val = X_train.iloc[val_idx]

        model = load_model(
            feature_index=feature_index,
            unique_num_dic=unique_num_dic,
        )
        weight_path = fold_best_scores[fold_idx][1]
        model.load_state_dict(torch.load(weight_path))

        preds_train_val_fold = model.predict(x_val, BATCH_SIZE)
        preds_train_val[val_idx] = preds_train_val_fold

        preds_valid = model.predict(X_valid, BATCH_SIZE)
        X_valid[f'preds_{fold_idx}'] = preds_valid

        preds_test = model.predict(X_test, BATCH_SIZE)
        X_test[f'preds_{fold_idx}'] = preds_test

    X_train['preds'] = preds_train_val
    X_valid['preds'] = X_valid[[f'preds_{fold_idx}' for fold_idx in range(FOLD_NUM)]].mean()
    X_test['preds'] = X_test[[f'preds_{fold_idx}' for fold_idx in range(FOLD_NUM)]].mean()

    save_path = f'{SAVE_DIR}/predict/preds_train_val_{run_id}.csv'
    X_train['preds'].to_csv(save_path, index=False, header=None)
    mlflow.log_artifact(save_path)

    save_path = f'{SAVE_DIR}/predict/preds_valid_{run_id}.csv'
    X_valid['preds'].to_csv(save_path, index=False, header=None)
    mlflow.log_artifact(save_path)

    save_path = f'{SAVE_DIR}/predict/preds_test_{run_id}.csv'
    X_test['preds'].to_csv(save_path, index=False, header=None)
    mlflow.log_artifact(save_path)

    save_mlflow(run_id, cv, fold_best_scores)
    mlflow.end_run()


if __name__ == "__main__":

    main()


In [27]:
# example of a normalization
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler
# define data
data = asarray([[112, 0.001],
				[8, 0.05],
				[50, 0.005],
				[88, 0.07],
				[4, 0.1]])
print(data)
# define min max scaler
scaler = MinMaxScaler()
# transform data
scaled = scaler.fit_transform(data)
print(scaled)

scaler = MinMaxScaler()
scaler.fit(data)
val = scaler.transform(data)
print(val)

def label_inverse_scaling(scaler, val):
    val = scaler.inverse_transform(val)
    val = np.exp(val) - 1
    return val


[[1.12e+02 1.00e-03]
 [8.00e+00 5.00e-02]
 [5.00e+01 5.00e-03]
 [8.80e+01 7.00e-02]
 [4.00e+00 1.00e-01]]
[[1.         0.        ]
 [0.03703704 0.49494949]
 [0.42592593 0.04040404]
 [0.77777778 0.6969697 ]
 [0.         1.        ]]
[[1.         0.        ]
 [0.03703704 0.49494949]
 [0.42592593 0.04040404]
 [0.77777778 0.6969697 ]
 [0.         1.        ]]


In [28]:
import numpy as np 
def label_scaling(val):
    val = np.log(val + 1)
    scaler = MinMaxScaler()
    scaler.fit(val)
    val = scaler.transform(val)
    return scaler, val


def label_inverse_scaling(scaler, val):
    val = scaler.inverse_transform(val)
    val = np.exp(val) - 1
    return val

scaler, val = label_scaling(data)

In [29]:
scaler

MinMaxScaler(copy=True, feature_range=(0, 1))

In [30]:
val

array([[1.        , 0.        ],
       [0.18851703, 0.5067365 ],
       [0.74484446, 0.04228621],
       [0.92342678, 0.70680382],
       [0.        , 1.        ]])

In [31]:
da = label_inverse_scaling(scaler,val)
da

array([[1.12e+02, 1.00e-03],
       [8.00e+00, 5.00e-02],
       [5.00e+01, 5.00e-03],
       [8.80e+01, 7.00e-02],
       [4.00e+00, 1.00e-01]])

In [34]:
def set_loss(loss_name):
    if type(loss_name) == str:
        if loss_name == 'MSE':
            loss_func = nn.MSELoss(reduction='mean')
        elif loss_name == 'MAE':
            loss_func = nn.L1Loss(reduction='mean')
        elif loss_name == 'Huber':
            loss_func = nn.SmoothL1Loss(reduction='mean')
        elif loss_name == 'LogCosh':
            loss_func = LogCoshLoss()
        loss_func_list = [loss_func]
    elif type(loss_name) == list:
        loss_func_list = []
        for ln in loss_name:
            if ln == 'MSE':
                loss_func = nn.MSELoss(reduction='mean')
            elif ln == 'MAE':
                loss_func = nn.L1Loss(reduction='mean')
            elif ln == 'Huber':
                loss_func = nn.SmoothL1Loss(reduction='mean')
            elif ln == 'LogCosh':
                loss_func = LogCoshLoss()
            loss_func_list.append(loss_func)
    return loss_func_list
set_loss(loss_name='MSE')

[MSELoss()]