# How to 🤗'Hugging Face' —— A mini and basic template to start with Pre-trained Models
> Using a regression model as the example

## 1. import packages

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

## 2. Choose your favourite pretrained model in huggingface

In [None]:
# Model name or path handed to `from_pretrained` for both the tokenizer and the model;
# for example, a bert-base-cased checkpoint directory:
pretrained_model_path = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'

## 3. Dataset class
1. inherit from torch.utils.data.Dataset
2. implement `__init__`, `__getitem__` and `__len__` methods
3. `__getitem__` method should do these things:
    1. tokenize the sentence
    2. convert tokenized sentence to tensor
    3. return the tensors and the label (if you have one)


In [None]:
class Mydataset(Dataset):
    """Dataset that tokenizes one DataFrame row per item for a regression task.

    Args:
        df: DataFrame holding the text column (and the label column when training).
        pretrained_model_path: name or path passed to `AutoTokenizer.from_pretrained`.
        train: whether the rows carry a label; pass False for unlabeled test data.
        max_length: every sample is padded / truncated to this many tokens.
    """

    def __init__(self, df, pretrained_model_path, train=True, max_length=512):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)
        self.train = train
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx, train=True):
        """Return `(input_ids, attention_mask, label)`, or without `label` in test mode."""
        # A DataLoader only ever calls `dataset[idx]`, so test mode has to be selectable
        # on the dataset itself; the per-call flag is kept for direct callers.
        train = train and self.train

        # `iloc[idx]` yields a Series indexed by column name. Do NOT reset its index:
        # that would replace the column names with 0..n-1 and break the lookups below.
        sample = self.df.iloc[idx]
        context = sample['here is the context']

        if train:  # labels only exist for training data
            # np.float32(...) also accepts a plain Python float, unlike `.astype`
            label = np.float32(sample['here is the label'])

        # call tokenizer to encode the context
        tokens = self.tokenizer(
            context,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
        )

        # the tokenizer returns a leading batch dimension of 1; drop it so the
        # DataLoader can stack samples into [batch_size, seq_len]
        input_ids = tokens['input_ids'].squeeze(0)
        attention_mask = tokens['attention_mask'].squeeze(0)

        if train:
            return input_ids, attention_mask, label
        return input_ids, attention_mask


## 4. Model class
1. inherit from torch.nn.Module
2. implement `__init__`, `forward`
3. `forward` is what gets called when you run `model(input_ids, attention_mask)`


In [None]:
class Model(nn.Module):
    """Pretrained encoder followed by a single-output linear head (regression).

    Args:
        pretrained_model_path: name or path passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, pretrained_model_path):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_path)
        # Size the head from the loaded config so checkpoints whose width is not 768
        # (e.g. bert-large) work too; fall back to 768 if the config lacks `hidden_size`.
        hidden_size = getattr(self.bert.config, 'hidden_size', 768)
        self.down = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one prediction per sample, shaped [batch_size]."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # last hidden state: [batch_size, seq_len, hidden_size]
        last_hidden_states = outputs['last_hidden_state']
        # project every token to one value: [batch_size, seq_len, 1]
        down = self.down(last_hidden_states)
        # keep only the first token -> [batch_size, 1], then drop the trailing
        # dimension so the result matches the label shape [batch_size]
        logits = down[:, 0, :].squeeze(-1)

        return logits



### Attention:
We only use the first token, `logits = down[:, 0, :]`, because the first token, [CLS], stands for the meaning of the whole sentence.

But why? Because [CLS] is the token used for the NSP (Next Sentence Prediction) task **when BERT was pre-trained**,
so the first token "[CLS]" carries more **high-level (sentence-level) information** than the other tokens.
You could also use the second token, e.g. `logits = down[:, 1, :]`, or any other token, or even sum them up.
But the other tokens are seldom used this way, because they were not trained for this kind of task. (They perform better in other tasks such as fill-mask.)

> You can click [here](https://huggingface.co/docs/transformers/main/en/model_doc/bert#transformers.BertModel.forward.returns) for more details about the model output.


## 5. Load data && Split dataset && Set up the training parameters


In [None]:
# load data
df = pd.read_csv('to/you/data/path/train.csv')
# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Model(pretrained_model_path).to(device)
# dataset over the full frame; the loaders below are built from the split frames instead
df_dataset = Mydataset(df, pretrained_model_path)
# train params
EPOCHS = 1
BS = 32
# generate loaders: hold out 20% of the rows for validation (fixed seed for a repeatable split)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Mydataset(train_df, pretrained_model_path)
val_dataset = Mydataset(val_df, pretrained_model_path)
train_dataloader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
# validation order does not matter, so skip shuffling: batches (and therefore the
# batch-averaged validation loss) stay the same from run to run
val_dataloader = DataLoader(val_dataset, batch_size=BS, shuffle=False)

# optimizer && loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.MSELoss()  # mean squared error: this is a regression task


## 6. Pytorch origin style train loop
> You can of course use PyTorch Lightning or the Hugging Face Trainer API to train your model, but I prefer to write my own training loop.
It's more readable, easier to understand, and more flexible.


In [None]:
from tqdm import tqdm

def train(epoch):
    """Fine-tune the module-level `model` on `train_dataloader`.

    Uses the module-level `optimizer`, `criterion` and `device`, and prints the
    average batch loss after every epoch.

    Args:
        epoch: number of passes over the training data (the caller passes EPOCHS).
    """
    # the argument is the epoch count; it must not be shadowed by the loop variable
    num_epochs = epoch
    model.train()
    for epoch_idx in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for batch in tqdm(train_dataloader):
            # get batch from __getitem__ of dataset class
            # and put them to device(CPU or GPU)
            input_ids, attention_mask, label = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)
            # clear gradients left over from the previous step
            optimizer.zero_grad()
            # forward
            logits = model(input_ids, attention_mask)
            # calculate loss && do backward
            loss = criterion(logits, label)
            loss.backward()
            # update parameters
            optimizer.step()
            total_loss += loss.item()  # accumulate loss
            num_batches += 1

        # Average over the number of batches seen, not the last batch index (which is
        # one too small and is 0 for a single batch); max() guards an empty loader.
        print(f'Epoch {epoch_idx + 1}/{num_epochs} loss: {total_loss / max(num_batches, 1)}')


## 7. Pytorch origin style evaluate


In [None]:
def evaluate():
    """Return the mean per-batch loss of the module-level `model` over `val_dataloader`."""
    model.eval()
    running_loss = 0
    # inference only: gradients are not needed
    with torch.no_grad():
        for ids, mask, target in tqdm(val_dataloader):
            # move the batch onto the same device as the model
            ids, mask, target = ids.to(device), mask.to(device), target.to(device)
            predictions = model(ids, mask)
            running_loss += criterion(predictions, target).item()
    # average over the number of batches
    return running_loss / len(val_dataloader)


## 8. Do what you want


In [None]:
def my_process():
    """Train for EPOCHS epochs, then print the validation loss."""
    train(EPOCHS)
    final_val_loss = evaluate()
    print(f'val_loss: {final_val_loss}')

## 9.Main entry

In [None]:
# Call my_process() only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    my_process()