In [None]:
!pip install torch==1.13.1 torchvision==0.14.1

## This notebook was run on an Intel VM instance(20GB diskspace, 4th Gen Xeon Processor with 8 cores), thanks to IDC

In [None]:
!pip install intel_extension_for_pytorch==1.13.100

In [1]:
import torch
import intel_extension_for_pytorch as ipex

In [2]:
from torch.utils.data import Dataset
import json

class StoryData(Dataset):
    def __init__(self, path, tokenizer):
        self.data = open(path, 'r')
        
        self.dataa = []
        count = 0
        curr = ""
        for line in self.data:
            if line == "": break
            if line == "<|endoftext|>\n":
                self.dataa.append(curr)
                curr = ""
                count += 1
                if count == 100: break
                continue
            
            curr += line.strip() + " "

        self.X = []      

        for i in self.dataa:
                self.X.append(i)
        
        for idx, i in enumerate(self.X):
            try:
                self.X[idx] = "<bos> " + i + "<bot>: " + self.X[idx + 1] + " <eos>"
            except:
                break 
        
        self.X = self.X[:1000]
        # print(self.X[0])
        
        self.X_encoded = tokenizer(self.X, max_length=60, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [7]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.optim import Adam
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm as tqdm

def train(storyData, model, optim):
    epochs = 10
    for i in (range(epochs)):
        for X, a in storyData:
            X = X.to(device)
            a = a.to(device)
            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=X).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), "model_state.pt") 

def infer(inp):
    inp = "<bos> " + inp + "<bot>: "
    inp = tokenizer(inp, return_tensors="pt")
    X = inp['input_ids'].to(device)
    a = inp['attention_mask'].to(device)
    output = model.generate(X, attention_mask=a, max_new_tokens=10)
    output = tokenizer.decode(output[0])
    return output

device = "xpu" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({"pad_token": "<pad>",
                              "bos_token": "<bos>",
                              "eos_token": "<eos>"})
tokenizer.add_tokens(["<bot>:"])

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)


storyData = StoryData("data1.txt", tokenizer)
storyData = DataLoader(storyData, batch_size=32)

optim = Adam(model.parameters(), lr=1e-3)

In [4]:
import time

In [5]:
# Without intel oneapi
start = time.perf_counter()

train(storyData, model, optim)

end = time.perf_counter()

end-start

101.20051460200011

In [8]:
# Use intel ipex to optimize model training
model = ipex.optimize(model)

In [9]:
start = time.perf_counter()

train(storyData, model, optim)

end = time.perf_counter()

end-start

78.67482862799989

### As we can see, the time taken to train a GPT2 model with a custom dataset, was significantly faster using Intel's PyTorch Extensions, thanks to OneAPI. One average, IPEX was 25% faster during model training, with the dataset we used.

In [None]:
while True:
    inp = input("You: ")
    print("Bot:", infer(inp))