In [1]:
import numpy as np
import pandas as pd
import torch 

from run_bert.bert_dataset import BertDatset
from run_bert.trainer import BertPredictor
from run_bert.optim import increase_head_lr
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification


# Notebook-wide display settings: let pandas show up to 500 columns / rows
# and up to 500 characters per cell before truncating.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 500)

# Print numpy floats with three digits after the decimal point.
np.set_printoptions(precision=3)

In [2]:
# Device on which the model is placed and which is handed to the trainer.
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at `.to(DEVICE)`.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pickled review DataFrame and report its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code stored in the file,
# so this path must only ever point at a trusted, locally produced pickle.
input_pickle_path = '../data/input/300K_yelp_text_df.pickle'
df = pd.read_pickle(input_pickle_path)
print(df.shape)

(300000, 9)


In [3]:
# Tokenizer and pretrained weights both come from the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# num_labels=1 requests a single output unit from the classification head.
# NOTE(review): together with the MAE metric used later this looks like a
# regression setup - confirm against BertPredictor.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=1,
)
# Move the model's parameters onto the selected device.
model = model.to(DEVICE)

### Model

In [4]:
# Optimizer hyperparameters: 'lr' is the learning rate passed to
# increase_head_lr and Adam further down.
model_params = dict(lr=1e-5)

# Settings for the train/validation split: the random seed and the fraction
# of rows kept for training (the remainder becomes the validation set).
kfold_params = dict(
    split_rand_state=42,
    split_ratio=.75,
)

In [5]:
# --- Data settings -----------------------------------------------------------
LEN = 512                 # passed to BertDatset as max_seq_len
text_columns = ['text']   # DataFrame column(s) holding the input text
label_column = 'useful'   # DataFrame column holding the target
batch_size = 4
# One flag drives both loaders.
# NOTE(review): with shuffle=False the training batches come in the same
# order every epoch; consider shuffling the training loader only.
shuffle = False

# Only the first N_FIRST rows of the full frame are used in this run.
N_FIRST = 10_000

# --- Train / validation split ------------------------------------------------
holdout_fraction = 1 - kfold_params['split_ratio']
train_df, val_df = train_test_split(
    df.iloc[:N_FIRST],
    test_size=holdout_fraction,
    random_state=kfold_params['split_rand_state'],
)

# --- Datasets: identical settings, different frames --------------------------
dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_seq_len=LEN,
    label_column=label_column,
    text_columns=text_columns,
)
train_ds = BertDatset(df=train_df, **dataset_kwargs)
val_ds = BertDatset(df=val_df, **dataset_kwargs)

# --- Loaders -----------------------------------------------------------------
loader_kwargs = dict(batch_size=batch_size, shuffle=shuffle)
train_loader = DataLoader(train_ds, **loader_kwargs)
val_loader = DataLoader(val_ds, **loader_kwargs)

# Attach the number of rows behind each loader as an extra `num` attribute.
# NOTE(review): presumably read by BertPredictor - confirm in run_bert.trainer.
train_loader.num = len(train_df)
val_loader.num = len(val_df)

In [6]:
# Base learning rate shared by the parameter grouping and the optimizer.
base_lr = model_params['lr']

# Build the parameter groups from the model, the base learning rate and 50.
# NOTE(review): 50 is presumably the factor by which the head's learning
# rate is raised relative to base_lr - confirm in run_bert.optim.
groupped_params = increase_head_lr(model, base_lr, 50)

# Adam over those groups; `lr` is also passed directly as the optimizer-level
# learning rate.
optimizer = Adam(groupped_params, lr=base_lr)

In [7]:
# Free-text tag for this run, forwarded to the trainer.
description = 'first_try'

# Collect every trainer argument in one place, then unpack into the constructor.
predictor_kwargs = dict(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    split_rand_state=kfold_params['split_rand_state'],
    metric=mean_absolute_error,      # validation metric
    description=description,
    device=DEVICE,
    val_loader=val_loader,
    epochs_count=7,
    result_dir='../data/result/bert_output',
    num_labels=1,                    # matches num_labels used for the model
)
trainer = BertPredictor(**predictor_kwargs)

# Run training; progress bars and per-epoch validation scores are printed.
trainer.fit()

  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

../data/result/bert_output/10-29-17


[1 / 7] Train: Loss = 4.39535: 100%|████████████████████████████████████████████████| 1875/1875 [07:20<00:00,  4.25it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:49<00:00, 12.75it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.041347908973694


[2 / 7] Train: Loss = 4.08905: 100%|████████████████████████████████████████████████| 1875/1875 [07:23<00:00,  4.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.90it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.0036707838654517


[3 / 7] Train: Loss = 3.47362: 100%|████████████████████████████████████████████████| 1875/1875 [07:17<00:00,  4.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.91it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.0379208217345177


[4 / 7] Train: Loss = 2.51195: 100%|████████████████████████████████████████████████| 1875/1875 [07:17<00:00,  4.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.90it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.0716605327554047


[5 / 7] Train: Loss = 1.90419: 100%|████████████████████████████████████████████████| 1875/1875 [07:17<00:00,  4.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.81it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.012587962307781


[6 / 7] Train: Loss = 1.37305: 100%|████████████████████████████████████████████████| 1875/1875 [07:22<00:00,  4.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.78it/s]
  0%|                                                                                          | 0/1875 [00:00<?, ?it/s]

val_metric_score --> 1.10391880736202


[7 / 7] Train: Loss = 1.16408: 100%|████████████████████████████████████████████████| 1875/1875 [07:22<00:00,  4.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 625/625 [00:48<00:00, 12.92it/s]

val_metric_score --> 1.326680448961258





In [11]:
# Pull the validation targets and predictions collected by the trainer's logger.
true_vals, predictions = (
    trainer.logger.np_val_targets,
    trainer.logger.np_val_preds,
)

In [14]:
# Average the logged predictions along axis 2 and score that average with MAE.
# NOTE(review): axis 2 presumably indexes the training epochs, which would make
# this an ensemble over epochs - confirm against the logger.
preds_mean_axis2 = predictions.mean(axis=2)
mean_absolute_error(true_vals, preds_mean_axis2)

0.9865289195212936

In [10]:
# Baseline: MAE of always predicting the mean of the validation targets.
# NOTE(review): the mean minimises squared error; the constant that minimises
# MAE is the median, so a median baseline would be the stricter comparison.
constant_prediction = np.full(true_vals.shape[0], true_vals.mean())
mean_absolute_error(true_vals, constant_prediction)

1.1407366399999999