In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from dataclasses import dataclass, field
import os,sys,inspect
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, GlueDataset
from transformers.data.processors.utils import InputFeatures
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Dict, Optional
from sklearn.model_selection import train_test_split
from multiprocessing import Pool

import logging

In [3]:
logger = logging.getLogger(__name__)

In [4]:
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

In [5]:
import config

In [6]:
MODEL_NAME=config.PRE_TRAINED_MODEL_NAME

### Setup Path and load data

In [7]:
path = Path('/data/yelp/')
assert path.exists()

In [8]:
balanced_simplified_reviews = pd.read_pickle(path/'balanced_simplified_reviews_4m.pkl')
balanced_simplified_reviews['stars'] = balanced_simplified_reviews['stars'] - 1

In [9]:
balanced_simplified_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1570016 entries, 31 to 3999998
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1570016 non-null  object
 1   stars   1570016 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 35.9+ MB


In [10]:
len(balanced_simplified_reviews)

1570016

In [11]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )

In [12]:
model_args = ModelArguments(
    model_name_or_path=MODEL_NAME
)

In [13]:
training_args = TrainingArguments(
    output_dir=str(path/'simple_output'),
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=16,
    per_gpu_eval_batch_size=16,
    num_train_epochs=2,
    logging_steps=20000,
    logging_first_step=True,
    save_steps=40000,
    evaluate_during_training=True,
    fp16=True,
    eval_steps=60000
)

In [14]:
# ??TrainingArguments

In [15]:
set_seed(training_args.seed)

In [16]:
num_labels = config.NUM_CLASSES
num_labels

5

#### Prepare Tokenizer and Model

In [17]:
bert_config = AutoConfig.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    finetuning_task='yelp_simple',
)

In [18]:
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME
)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    config=bert_config
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [21]:
# ??tokenizer.encode_plus

#### Prepare the Yelp dataset

In [22]:
class YelpDataset(Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews, self.targets,  self.tokenizer, self.max_len = reviews.to_numpy(), targets.to_numpy(), tokenizer, max_len
        
    def __len__(self):
        return len(self.reviews)
        
    def __getitem__(self, item):
        review = self.reviews[item]
        tokens = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return InputFeatures(input_ids = tokens['input_ids'].flatten().long().numpy().tolist(), 
                             attention_mask=tokens['attention_mask'].flatten().long().numpy().tolist(),
                             label=torch.tensor(self.targets[item]).long().item())
    
    def get_labels(self):
        return ['1', '2', '3', '4', '5']

In [23]:
train_df, val_df = train_test_split(balanced_simplified_reviews, test_size=0.1, random_state=config.RANDOM_SEED, 
                                     stratify=balanced_simplified_reviews.stars.values)

In [24]:
def create_dataset(df, tokenizer, max_length, batch_size):
    return YelpDataset(df['text'], df['stars'], tokenizer, max_length)

In [25]:
train_dataset = create_dataset(train_df, tokenizer, config.MAX_LENGTH, 4)
eval_dataset = create_dataset(val_df, tokenizer, config.MAX_LENGTH, 4)

#### Metrics

In [26]:
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics('mnli', preds, p.label_ids)

#### Trainer and training

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44157.0, style=ProgressStyle(description_…



Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss sc



Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0



HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44157.0, style=ProgressStyle(description_…

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 4096.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 2048.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 1024.0


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=4907.0, style=ProgressStyle(description_…


Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0


#### Evaluate

In [None]:
result = trainer.evaluate()

output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
if trainer.is_world_master():
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key, value in result.items():
            logger.info("  %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))

#### Save the model

In [None]:
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_master():
    tokenizer.save_pretrained(training_args.output_dir)

#### Predict

Trying some batch predictions here

In [None]:
sample_df = val_df[:20]

In [None]:
predict_ds = create_dataset(sample_df, tokenizer, config.MAX_LENGTH, 4)

In [None]:
predictions, label_ids, metrics = trainer.predict(predict_ds)

In [None]:
predictions

In [None]:
preds = np.argmax(predictions, axis=1)

In [None]:
sample_df['preds'] =  preds 

In [None]:
correct = np.sum((sample_df['stars'] == sample_df['preds']).astype('uint16'))
print(f'{correct} answers correct')
print(f'{correct / len(sample_df)} accuracy')