In [None]:
##  /apps/pytorch/2.0.1/bin/python
## /orange/h.azad/s.saini/.env/bin/activate



from slm.models.llama import ModelArgs
import torch
import numpy as np
import torch
from transformers import AutoTokenizer
from slm.models.lm import LanguageModel
from slm.data.dataparquet import ParquetDataset
from torch.utils.data import DataLoader
# import deepspeed

In [None]:
# Single-process training run: configure the model, load the tokenizer,
# build the data loader and train.
model_args = ModelArgs()
for _field, _value in (
    ('dim', 512),
    ('n_layers', 8),
    ('n_heads', 8),
    ('n_kv_heads', None),
    ('vocab_size', None),          # defined later by tokenizer
    ('multiple_of', 32),           # make SwiGLU hidden layer size multiple of large power of 2
    ('ffn_dim_multiplier', None),
    ('norm_eps', 1e-5),
    ('max_batch_size', None),
    ('max_seq_len', None),
):
    setattr(model_args, _field, _value)

device = 'cuda'
# Autograd anomaly detection is a debugging aid; it stays on for the whole run.
torch.autograd.set_detect_anomaly(True)
# Earlier tokenizer choice, kept for reference:
# tokenizer = Tokenizer('data/vocab/tinystories28000.model')

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# The tokenizer gets an explicit padding token before being handed to the model.
tokenizer.add_special_tokens({"pad_token":"<pad>"})

lm = LanguageModel(
    model_args,
    tokenizer,
    bsz=32,
    max_seq_len=1024,
    device_index=5,
)
dset = ParquetDataset(['data/0.parquet'])
train_loader = DataLoader(dset, batch_size=32, shuffle=True)
lm.train(train_loader, epochs=20)

# LM_train(model_args,tokenizer,'data/alice_in_wonderland.txt',max_sent_len=128,bsz=32,epochs=1)

# model_engine, optimizer, _, _ = deepspeed.initialize(model=lm.model,
#                                                      model_parameters=lm.model.parameters())

In [None]:
import os
import torch.distributed as dist

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group if one is active.

    Does nothing when no group has been initialised, so it is safe to call
    from error-handling paths and to call more than once.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

In [None]:
def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
    """Run one training epoch and report the mean loss across all processes.

    Args:
        args: Unused; kept for interface compatibility.
        model: Module called as ``model(data)``; its output is fed to
            ``nll_loss``.
        rank: Device (or device index) the batches and the loss accumulator
            are moved to; rank 0 prints the epoch summary.
        world_size: Unused; kept for interface compatibility.
        train_loader: Iterable yielding ``(data, target)`` pairs.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, forwarded to ``sampler.set_epoch`` and printed.
        sampler: Optional sampler exposing ``set_epoch``.
    """
    # Imported locally because no visible import cell brings `F` into scope.
    import torch.nn.functional as F

    model.train()
    # ddp_loss[0] accumulates the summed loss, ddp_loss[1] the sample count.
    ddp_loss = torch.zeros(2).to(rank)
    # Compare against None explicitly: sampler truthiness is length-based.
    if sampler is not None:
        sampler.set_epoch(epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(rank), target.to(rank)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target, reduction='sum')
        loss.backward()
        optimizer.step()
        ddp_loss[0] += loss.item()
        ddp_loss[1] += len(data)

    # Sum both counters over every process before computing the mean.
    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
    if rank == 0:
        print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1]))

In [None]:
from torch.utils.data.distributed import DistributedSampler
from torch.distributed.fsdp.wrap import (
    size_based_auto_wrap_policy,
    enable_wrap,
    wrap,
)
import functools


def fsdp_main(rank, world_size, args):
    """Per-process entry point: build the data pipeline and model, then train.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of processes in the group.
        args: Object exposing ``batch_size``, used for the DataLoader.

    The process group created by ``setup`` is torn down even when training
    raises.
    """
    setup(rank, world_size)
    try:
        dset = ParquetDataset(['data/0.parquet'])
        sampler = DistributedSampler(dset, rank=rank, num_replicas=world_size, shuffle=True)

        # Shuffling is delegated to the sampler, so the loader must not shuffle.
        train_kwargs = {
            'batch_size': args.batch_size,
            'sampler': sampler,
            'num_workers': 2,
            'pin_memory': True,
            'shuffle': False,
        }
        train_loader = DataLoader(dset, **train_kwargs)

        torch.cuda.set_device(rank)

        # Timing events bracket the training call; the elapsed time is not
        # reported anywhere yet.
        init_start_event = torch.cuda.Event(enable_timing=True)
        init_end_event = torch.cuda.Event(enable_timing=True)

        model_args = ModelArgs()
        model_args.dim = 512
        model_args.n_layers = 8
        model_args.n_heads = 8
        model_args.n_kv_heads = None
        model_args.vocab_size = None  # defined later by tokenizer
        model_args.multiple_of = 32  # make SwiGLU hidden layer size multiple of large power of 2
        model_args.ffn_dim_multiplier = None
        model_args.norm_eps = 1e-5
        model_args.max_batch_size = None
        model_args.max_seq_len = None

        torch.autograd.set_detect_anomaly(True)

        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
        tokenizer.add_special_tokens({"pad_token":"<pad>"})

        # NOTE(review): bsz=32 is independent of args.batch_size used by the
        # loader, and device_index=5 is passed alongside device=rank — confirm
        # both are intended.
        lm = LanguageModel(model_args,tokenizer,bsz=32,max_seq_len=1024,device=rank,device_index=5)
        lm.fsdp()

        init_start_event.record()
        lm.train(train_loader,sampler=sampler,epochs=20)
        init_end_event.record()
    finally:
        # Always release the process group, including on failure.
        cleanup()

In [None]:
from torch.utils.data import Dataset
import pandas as pd
from torch.utils.data import DataLoader


class ParquetDataset(Dataset):
    """Map-style dataset serving one text string per row of a set of parquet files.

    Note: this notebook also imports a ``ParquetDataset`` from
    ``slm.data.dataparquet``; once this cell runs, this definition shadows it.
    """

    def __init__(self, files, text_col='text',shuffle=True):
        """Load every file and keep the requested column in memory.

        Args:
            files: Iterable of parquet file paths, concatenated in order.
            text_col: Name of the column holding the text.
            shuffle: Unused; kept for interface compatibility.

        Raises:
            ValueError: If ``files`` is empty.
        """
        print('Preparing data.....')

        frames = [pd.read_parquet(filename) for filename in files]
        if not frames:
            raise ValueError('ParquetDataset requires at least one parquet file')

        df = pd.concat(frames)
        # Honour the caller's column choice instead of a hard-coded name.
        self.data = df[text_col].to_list()

    def __len__(self):
        """Return the number of loaded rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.data[idx]

In [None]:
# Dataset over the first shard only; the remaining shards were commented out:
# 'data/1.parquet', 'data/2.parquet', 'data/3.parquet'
dset = ParquetDataset(
    ['data/0.parquet'],
)

# Shuffled loader yielding batches of 64 texts.
dloader = DataLoader(
    dset,
    batch_size=64,
    shuffle=True,
)

In [None]:
def get_sents_from_parquets(directory,bsz=10,max_sent_len=10):
    """Yield shuffled batches of texts drawn from a fixed set of parquet files.

    Args:
        directory: Unused; the file paths are hard-coded below.
        bsz: Number of texts per yielded list (the last one may be shorter).
        max_sent_len: Unused.

    Yields:
        Lists of values from the ``text`` column, in an order shuffled with
        NumPy's global random state.
    """
    print('Preparing data.....')

    whole_file_paths = ['data/0.parquet','data/1.parquet','data/2.parquet','data/3.parquet']
    text_only_paths = ['data/dolly/1.parquet','data/squadv2/1.parquet']

    frames = [pd.read_parquet(path) for path in whole_file_paths]
    # These two sources are read with only their text column.
    frames.extend(pd.read_parquet(path, columns=['text']) for path in text_only_paths)

    combined = pd.concat(frames)
    total = len(combined)
    order = list(range(total))
    np.random.shuffle(order)

    texts = combined['text']
    for start in range(0, total, bsz):
        batch_positions = order[start:start + bsz]
        yield texts.iloc[batch_positions].to_list()

In [None]:
# Scratch check: evaluates to True only when `x` is exactly a list (subclasses
# of list give False).
# NOTE(review): `x` is not defined in any visible cell — confirm where it
# comes from, or drop this cell.
type(x) is list