In [2]:
import torch
import pandas as pd
import os, glob
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [3]:
# Fetch every MMLU subject at once (the "all" configuration) from the Hugging Face hub.
dataset = load_dataset(
    "cais/mmlu",
    name="all",
)
# Bare expression so the notebook renders the available splits and their sizes.
dataset


DatasetDict({
    test: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 14042
    })
    validation: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 1531
    })
    dev: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 285
    })
    auxiliary_train: Dataset({
        features: ['question', 'subject', 'choices', 'answer'],
        num_rows: 99842
    })
})

In [4]:
# Checkpoint to fine-tune; the commented alternatives are kept for quick swapping.
model_path = "databricks/dolly-v2-3b"
#model_path = "ibm-granite/granite-3b-code-base"
#model_path = "ibm-granite/granite-7b-base"

# Target GPU identifier (not referenced by the other cells visible in this notebook).
device = "cuda:0"

tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the end-of-sequence token for padding; tokenize_prompt pads every
# example to a fixed length, so the tokenizer needs a pad token set.
tokenizer.pad_token = tokenizer.eos_token


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Load the causal language model for the checkpoint selected in `model_path`.
model = AutoModelForCausalLM.from_pretrained(model_path)

In [6]:
def tokenize_prompt(data_point):
    """
    Render one dataset record as an instruction prompt and tokenize it.

    Parameters:
    data_point: Mapping with 'question', 'choices' and 'answer' entries; each value
        is interpolated into the prompt using its default string formatting.

    Returns:
    The result of calling the module-level `tokenizer` on the prompt, padded and
    truncated to a fixed length of 256 tokens.
    """
    # NOTE: the continuation lines of the template are indented in the source, so
    # that leading whitespace is part of the text handed to the tokenizer.
    prompt_text = f""" Given a question and the possible answer choices, give the index of the right choice.
    ### Question
    {data_point['question']}
    ### Choices
    {data_point['choices']}
    ### Answer
    {data_point['answer']}
"""
    # Fixed-length encoding: pad short prompts, cut long ones.
    tokenizer_options = dict(
        padding="max_length",
        max_length=256,
        truncation=True,
    )
    return tokenizer(prompt_text, **tokenizer_options)

# def tokenize(prompt):
#     return tokenizer(prompt,
#                      padding="max_length",
#                      truncation=True)
     

In [None]:
#tokenized_datasets = dataset.map(tokenize_prompt)

In [13]:
# Training subset: shuffle auxiliary_train with a fixed seed, keep the first
# 1000 rows, then tokenize each row into a prompt.
small_train_dataset = (
    dataset['auxiliary_train']
    .shuffle(seed=42)
    .select(range(1000))
)
small_train_dataset_tokenized = small_train_dataset.map(tokenize_prompt)

# Evaluation subset: same recipe on the validation split, 200 rows.
small_eval_dataset = (
    dataset['validation']
    .shuffle(seed=42)
    .select(range(200))
)
small_eval_dataset_tokenized = small_eval_dataset.map(tokenize_prompt)

In [14]:
# Memory-oriented training setup: one sample per device with gradients
# accumulated over 4 steps, gradient checkpointing, fp16, and 8-bit AdamW.
training_args = TrainingArguments(
    output_dir="train_output",
    report_to="none",
    # Schedule
    max_steps=1000,
    learning_rate=2.5e-5,
    # Evaluation and checkpoint cadence (in optimizer steps)
    do_eval=True,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    # Settings chosen to reduce GPU memory use
    fp16=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adamw_bnb_8bit",
)

# mlm=False selects causal (GPT-style) language modelling: the collator builds
# the labels as a copy of the inputs. mlm=True would instead randomly mask
# tokens in the sequence for the model to predict (BERT-style).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=small_train_dataset_tokenized,
    eval_dataset=small_eval_dataset_tokenized,
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
# Quick sanity check of the batch size the Trainer settled on.
# NOTE: this reads a private attribute, so it may change between library versions.
trainer._train_batch_size

2

In [16]:
# Run the fine-tuning loop with the arguments configured above.
trainer.train()

[2024-07-18 16:24:01,024] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


: 

Even with gradient checkpointing, gradient accumulation, fp16 training, and the 8-bit AdamW optimizer, the two GPUs still run out of memory (OOM).

Next step: move on to Accelerate, bitsandbytes, and DeepSpeed.

After that, the next thing to try would be LoRA.

## CSV world

In [6]:
# Optional: work from a local CSV copy of the MMLU dataset instead of the hub.
# This path is still experimental and needs debugging; ideally the local CSVs
# would be laid out with the `datasets` library in mind.

# Root of the local copy. Override it with the MMLU_DATA_PATH environment
# variable instead of editing the notebook; the default is the original location.
MMLU_DATA_PATH = os.environ.get('MMLU_DATA_PATH', '/usr/data/mmlu')
TRAIN_DATA = os.path.join(MMLU_DATA_PATH, 'auxiliary_train')
TEST_DATA = os.path.join(MMLU_DATA_PATH, 'test')



def combine_csv_files(directory, output_file=None, **read_csv_kwargs):
    """
    Combine all CSV files in the specified directory into a single DataFrame and add a column for the filename.

    Parameters:
    directory (str): The path to the directory containing the CSV files.
    output_file (str, optional): The path to save the combined DataFrame as a CSV file. Default is None.
        If this file lives inside `directory`, it is never read back as an input.
    **read_csv_kwargs: Extra keyword arguments forwarded unchanged to `pd.read_csv` for every
        file (for example `header=None, names=[...]` when the CSVs have no header row).

    Returns:
    pd.DataFrame: The rows of every file, concatenated in sorted file-path order with a fresh
        0..n-1 index, plus a 'filename' column holding the source file's base name without
        its extension.

    Raises:
    ValueError: If `directory` contains no CSV file to combine.
    """
    # A previous run may have written the combined file into the same directory;
    # reading it back would duplicate every row, so it is filtered out below.
    output_path = os.path.abspath(output_file) if output_file else None

    # glob returns files in arbitrary order; sorting keeps the row order
    # reproducible across machines and re-runs.
    csv_files = sorted(
        path
        for path in glob.glob(os.path.join(directory, '*.csv'))
        if os.path.abspath(path) != output_path
    )

    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; raise the same exception type with a message that names the directory.
    if not csv_files:
        raise ValueError(f"No CSV files to combine in directory: {directory}")

    # One frame per file, each tagged with the file's base name (no extension).
    frames = [
        pd.read_csv(path, **read_csv_kwargs).assign(
            filename=os.path.splitext(os.path.basename(path))[0]
        )
        for path in csv_files
    ]

    combined_df = pd.concat(frames, ignore_index=True)

    # Optionally, save the combined DataFrame to a new CSV file
    if output_file:
        combined_df.to_csv(output_file, index=False)

    return combined_df

# Build the training frame from the auxiliary_train CSVs. This previously read
# TEST_DATA, which put the test questions into a variable named `train_data`
# and left TRAIN_DATA unused.
train_data = combine_csv_files(TRAIN_DATA)
# Preview only the first rows instead of rendering the whole frame.
train_data.head()