# Process Alpaca English and Translated Thai

In [1]:
import json
import pandas as pd
from datasets import (
    Dataset,
    load_dataset
)
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    HfArgumentParser,
    AdamW,
    DataCollatorForLanguageModeling,
)
from deepspeed.runtime.lr_schedules import WarmupDecayLR
from typing import Optional, Union, List, Dict, Any
import evaluate
from dataclasses import dataclass, field
import torch.nn as nn
import numpy as np
import wandb
import multiprocessing
import copy
# Number of CPUs reported by the operating system.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
cpu_cores = multiprocessing.cpu_count()



2023-03-27 13:11:59.616957: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-27 13:12:00.626429: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64
2023-03-27 13:12:00.626545: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.7/lib64


In [None]:
# Read the raw Alpaca records from disk.
# JSON text is UTF-8; pin the encoding so the read does not fall back to the
# platform's locale default (which can fail or mis-decode non-ASCII text).
with open('../data_large/alpaca_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Display how many records were loaded.
len(data)

In [None]:
# Show the first record to inspect the raw fields.
data[0]

In [None]:
# Wrap the loaded records in a Hugging Face Dataset so they can be mapped and filtered.
dset = Dataset.from_list(data)

In [None]:
# Tokenizer of the XGLM-564M checkpoint; preprocess_sft uses it to count tokens per example.
tokenizer = AutoTokenizer.from_pretrained('facebook/xglm-564M')
# Quick sanity check: display the encoding of a short string.
tokenizer('hey')

In [None]:
def preprocess_sft(example):
    """Turn one Alpaca record into a single supervised-fine-tuning string.

    The record is modified in place and returned with three extra fields:

    * ``text`` - instruction, optional context and answer joined with the
      ``<human>:``, ``<context>:`` and ``<bot>:`` cues;
    * ``has_context`` - 1 when the record's ``input`` is a non-empty string,
      otherwise 0;
    * ``nb_tokens`` - number of ``input_ids`` the module-level ``tokenizer``
      produces for ``text``.
    """
    instruction = example['instruction']
    context = example['input']
    answer = example['output']

    with_context = context != ""
    example['text'] = (
        f"<human>: {instruction} <context>: {context} <bot>: {answer}"
        if with_context
        else f"<human>: {instruction} <bot>: {answer}"
    )
    example['has_context'] = int(with_context)
    example['nb_tokens'] = len(tokenizer(example['text'])['input_ids'])
    return example

# Build the SFT text for every record; the raw fields are dropped once they
# have been merged into `text`.
dset_sft = dset.map(
    function=preprocess_sft,
    remove_columns=['instruction', 'input', 'output'],
)

In [None]:
# Token-count quantiles in 0.1% steps (0.000 .. 0.999), used to choose a length cutoff.
# Author's observation from the output: 99.9% of examples are under 512 tokens.
pd.Series(dset_sft['nb_tokens']).quantile([i/1000 for i in range(1000)])

In [None]:
# Count the examples longer than 512 tokens.
# Author's observation: only 37 examples exceed the limit, so they are dropped for now.
(pd.Series(dset_sft['nb_tokens'])>512).sum()

In [None]:
# Keep only examples of at most 512 tokens, then discard the helper column.
# Note: this rebinds `dset_sft`; once `nb_tokens` has been removed, re-running
# this cell fails because the filter can no longer find that column.
dset_sft = (
    dset_sft
    .filter(lambda example: example["nb_tokens"] <= 512)
    .remove_columns(['nb_tokens'])
)

dset_sft

In [None]:
# Hold out 5% of the examples as a test split; the seed is fixed at 125.
# Note: this rebinds `dset_sft` from a single dataset to the result of the split.
dset_sft = dset_sft.train_test_split(test_size=0.05, seed=125)
dset_sft

In [None]:
# Upload of the processed splits to the Hub as a private dataset; kept commented out.
# dset_sft.push_to_hub('pythainlp/alpaca_en_sft', private=True)

In [2]:
# Load the processed dataset from the Hub; this rebinds `dset_sft`, replacing
# any copy built in memory by earlier cells.
dset_sft = load_dataset('pythainlp/alpaca_en_sft')

Using custom data configuration pythainlp--alpaca_en_sft-8c0a443e01591af7
Found cached dataset parquet (/home/charipol/.cache/huggingface/datasets/pythainlp___parquet/pythainlp--alpaca_en_sft-8c0a443e01591af7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Load the XGLM-564M tokenizer and causal language model.
# (AutoModelForCausalLM is a plain causal LM; no value head is attached here.)
tokenizer = AutoTokenizer.from_pretrained('facebook/xglm-564M')
model = AutoModelForCausalLM.from_pretrained('facebook/xglm-564M')

# Preprocess the dataset.
def mask_labels(l, context_cue, human_cue, bot_cue):
    """Replace prompt tokens with -100 so only the bot's reply keeps real labels.

    Scans ``l`` left to right. Whenever the tokens at the current position
    start with ``human_cue`` or ``context_cue``, every token from there up to
    (not including) the next occurrence of ``bot_cue`` is replaced by -100.
    All other tokens, including the ``bot_cue`` tokens themselves, are copied
    unchanged.

    Args:
        l: list of token ids to mask.
        context_cue: token ids of the context marker.
        human_cue: token ids of the human/instruction marker.
        bot_cue: token ids of the bot/answer marker.

    Returns:
        A new list with the same length as ``l``; ``l`` is not modified.
        If a prompt cue is never followed by ``bot_cue`` (for example because
        the sequence was truncated), everything from that cue to the end of
        the list is masked.
    """
    result = []
    n = len(l)
    i = 0
    while i < n:
        at_bot = l[i:i + len(bot_cue)] == bot_cue
        at_prompt = (
            l[i:i + len(human_cue)] == human_cue
            or l[i:i + len(context_cue)] == context_cue
        )
        # `not at_bot` guarantees progress when a prompt cue and the bot cue
        # match at the same index (e.g. an empty cue).
        if at_prompt and not at_bot:
            # Stop at the end of the list too: without this bound a missing
            # bot cue would loop forever, appending -100 indefinitely.
            while i < n and l[i:i + len(bot_cue)] != bot_cue:
                result.append(-100)
                i += 1
        else:
            result.append(l[i])
            i += 1
    return result
        
def preprocess_function(example):
    """Tokenize one SFT example and build labels that skip the prompt and padding.

    The example's ``text`` plus the tokenizer's EOS token is encoded without
    automatic special tokens, truncated and padded to 512 tokens. Labels start
    as a copy of ``input_ids``; ``mask_labels`` replaces the human/context
    prompt spans with -100, and every padded position is set to -100 as well.

    Args:
        example: one dataset row; must provide a ``text`` string.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, all taken
        from or aligned with the same padded encoding.
    """
    tokenized_qa = tokenizer(
        example['text'] + tokenizer.eos_token,
        truncation=True,
        padding="max_length",
        max_length=512,
        add_special_tokens=False,
    )

    def cue_ids(cue):
        # NOTE(review): assumes a cue tokenizes to the same ids on its own as
        # it does inside the full text - confirm for the tokenizer in use.
        return tokenizer(cue, add_special_tokens=False)['input_ids']

    # A shallow copy is enough: input_ids is a flat list of ints.
    labels = mask_labels(
        list(tokenized_qa['input_ids']),
        cue_ids('<context>:'),
        cue_ids('<human>:'),
        cue_ids('<bot>:'),
    )

    # Mask padding via the attention mask rather than by comparing token ids
    # with pad_token_id: when a tokenizer reuses one id for padding and EOS,
    # an id comparison would also mask the EOS label appended above.
    labels = [
        token_id if attended else -100
        for token_id, attended in zip(labels, tokenized_qa['attention_mask'])
    ]
    return {
        "input_ids": tokenized_qa["input_ids"],
        "attention_mask": tokenized_qa["attention_mask"],
        "labels": labels,
    }


In [4]:
# Tokenize and label-mask every example, one example per call, using 5 worker processes.
tokenized_ds = dset_sft.map(
    function=preprocess_function,
    batched=False,
    num_proc=5,
)

          

#0:   0%|          | 0/9874 [00:00<?, ?ex/s]

#3:   0%|          | 0/9873 [00:00<?, ?ex/s]

#1:   0%|          | 0/9873 [00:00<?, ?ex/s]

#2:   0%|          | 0/9873 [00:00<?, ?ex/s]

#4:   0%|          | 0/9873 [00:00<?, ?ex/s]

          

#0:   0%|          | 0/520 [00:00<?, ?ex/s]

#1:   0%|          | 0/520 [00:00<?, ?ex/s]

#3:   0%|          | 0/520 [00:00<?, ?ex/s]

#2:   0%|          | 0/520 [00:00<?, ?ex/s]

#4:   0%|          | 0/519 [00:00<?, ?ex/s]