In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass, field
import os,sys,inspect
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, GlueDataset
from transformers.data.processors.utils import InputFeatures
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Dict, Optional
from sklearn.model_selection import train_test_split
from multiprocessing import Pool

import logging

In [3]:
logger = logging.getLogger(__name__)

In [4]:
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [5]:
import config

In [6]:
MODEL_NAME=config.PRE_TRAINED_MODEL_NAME

### Setup Path and load data

In [7]:
path = Path('/data/yelp/')
assert path.exists()

In [8]:
balanced_simplified_reviews = pd.read_pickle(path/'balanced_simplified_reviews.pkl')
balanced_simplified_reviews['stars'] = balanced_simplified_reviews['stars'] - 1

In [9]:
balanced_simplified_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394411 entries, 31 to 999998
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    394411 non-null  object
 1   stars   394411 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 9.0+ MB


In [10]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

In [11]:
model_args = ModelArguments(
    model_name_or_path=MODEL_NAME
)

In [12]:
training_args = TrainingArguments(
    output_dir=str(path/'simple_output'),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=8,
    per_gpu_eval_batch_size=8,
    num_train_epochs=1,
    logging_steps=20000,
    logging_first_step=True,
    save_steps=40000,
    evaluate_during_training=True,
    fp16=True
)

In [13]:
??TrainingArguments

[0;31mInit signature:[0m
[0mTrainingArguments[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0moutput_dir[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moverwrite_output_dir[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_train[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_eval[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdo_predict[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mevaluate_during_training[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mper_device_train_batch_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mper_device_eval_batch_size[0m[0;34m:[0m [0mint[0m [0;34m=[0m [0;36m8[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [

In [13]:
set_seed(training_args.seed)

In [14]:
num_labels = config.NUM_CLASSES
num_labels

5

#### Prepare Tokenizer and Model

In [15]:
bert_config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    finetuning_task='yelp_simple',
)

In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME
)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=bert_config
)

#### Prepare the Yelp dataset

In [17]:
class YelpDataset(Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets,  self.tokenizer, self.max_len = reviews.to_numpy(), targets.to_numpy(), tokenizer, max_len
        
    def __len__(self):
        return len(self.reviews)
        
    def __getitem__(self, item):
        review = self.reviews[item]
        tokens = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return InputFeatures(input_ids = tokens['input_ids'].flatten().long().numpy().tolist(), 
                             attention_mask=tokens['attention_mask'].flatten().long().numpy().tolist(),
                             label=torch.tensor(self.targets[item]).long().item())
    
    def get_labels(self):
        return ['1', '2', '3', '4', '5']

In [18]:
train_df, val_df = train_test_split(balanced_simplified_reviews, test_size=0.1, random_state=config.RANDOM_SEED, 
                                     stratify=balanced_simplified_reviews.stars.values)

In [19]:
def create_dataset(df, tokenizer, max_length, batch_size):
    return YelpDataset(df['text'], df['stars'], tokenizer, max_length)

In [20]:
train_dataset = create_dataset(train_df, tokenizer, config.MAX_LENGTH, 4)
eval_dataset = create_dataset(val_df, tokenizer, config.MAX_LENGTH, 4)

#### Metrics

In [21]:
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics('mnli', preds, p.label_ids)

#### Trainer and training

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

[34m[1mwandb[0m: Wandb version 0.9.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [23]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44372.0, style=ProgressStyle(description_…

[34m[1mwandb[0m: Wandb version 0.9.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{"loss": 9.630614519119263e-05, "learning_rate": 4.999887316325611e-05, "epoch": 2.2536734877850896e-05, "step": 1}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4931.0, style=ProgressStyle(description_…


{"eval_loss": 1.6934803839605979, "eval_mnli/acc": 0.20587191318898637, "epoch": 2.2536734877850896e-05, "step": 1}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{"loss": 0.8748415733605623, "learning_rate": 2.7463265122149106e-05, "epoch": 0.45073469755701795, "step": 20000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4931.0, style=ProgressStyle(description_…


{"eval_loss": 0.8067837300104294, "eval_mnli/acc": 0.6611733684904416, "epoch": 0.45073469755701795, "step": 20000}
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{"loss": 0.7735337958194315, "learning_rate": 4.926530244298206e-06, "epoch": 0.9014693951140359, "step": 40000}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4931.0, style=ProgressStyle(description_…


{"eval_loss": 0.728941692629781, "eval_mnli/acc": 0.6869580650068455, "epoch": 0.9014693951140359, "step": 40000}




Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0


CPU times: user 3h 3min 43s, sys: 12min 14s, total: 3h 15min 58s
Wall time: 3h 14min 30s


TrainOutput(global_step=44372, training_loss=0.8156205046088119)

#### Evaluate

In [61]:
result = trainer.evaluate()

output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
if trainer.is_world_master():
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key, value in result.items():
            logger.info("  %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4931.0, style=ProgressStyle(description_…


{"eval_loss": 0.7273900160995465, "eval_mnli/acc": 0.6893159576086405, "epoch": 1.0, "step": 44372}


#### Save the model

In [65]:
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_master():
    tokenizer.save_pretrained(training_args.output_dir)

#### Predict

Trying some batch predictions here

In [50]:
sample_df = val_df[:20]

In [51]:
predict_ds = create_dataset(sample_df, tokenizer, config.MAX_LENGTH, 4)

In [52]:
predictions, label_ids, metrics = trainer.predict(predict_ds)

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Prediction', max=3.0, style=ProgressStyle(description_wid…




In [53]:
predictions

array([[-2.9667969 ,  1.0634766 ,  3.4570312 ,  1.1220703 , -2.8164062 ],
       [-4.0039062 , -3.1171875 ,  0.3955078 ,  3.3613281 ,  3.1542969 ],
       [ 0.11706543, -0.33154297, -0.36645508, -0.3203125 , -0.20825195],
       [-4.7851562 , -1.6845703 ,  2.9609375 ,  3.40625   ,  0.24438477],
       [ 4.15625   ,  2.59375   , -0.32104492, -3.15625   , -3.5507812 ],
       [-0.90185547,  4.1757812 ,  1.9990234 , -2.0273438 , -3.5742188 ],
       [ 4.8164062 ,  1.8476562 , -0.71435547, -2.34375   , -2.6230469 ],
       [ 1.5224609 ,  1.3359375 ,  0.18017578, -1.7402344 , -2.6601562 ],
       [-4.6640625 , -2.5976562 ,  1.7802734 ,  4.3984375 ,  1.4365234 ],
       [-4.125     , -2.6210938 ,  1.3828125 ,  3.2070312 ,  1.9951172 ],
       [-1.9296875 ,  2.4257812 ,  3.4785156 , -0.08337402, -3.8183594 ],
       [ 1.1699219 ,  3.2988281 ,  1.4648438 , -2.5351562 , -4.2226562 ],
       [-4.1796875 , -2.3242188 ,  1.7060547 ,  3.3085938 ,  1.5009766 ],
       [-4.125     , -0.8623047 ,  2.8

In [54]:
preds = np.argmax(predictions, axis=1)

In [55]:
sample_df['preds'] =  preds 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [57]:
correct = np.sum((sample_df['stars'] == sample_df['preds']).astype('uint16'))
print(f'{correct} answers correct')
print(f'{correct / len(sample_df)} accuracy')

14 answers correct
0.7 accuracy
