In [3]:
#load data from huggingface
from datasets import load_dataset
ds_train = load_dataset("mou3az/Question-Answering-Generation-Choices")
ds_val = load_dataset("xwjzds/extractive_qa_question_answering_hr")
print(ds_train["train"].column_names)
print(ds_val["train"].column_names)

['context', 'question', 'answer', 'distractors']
['Unnamed: 0', 'question', 'answer', 'answer_context']


In [None]:
#save data into local drive
import pandas as pd
import os

# Convert data to DataFrame
df_train = pd.DataFrame(ds_train["train"])[['question', 'answer', 'context']]
df_val = pd.DataFrame(ds_val["train"])[['question', 'answer', 'answer_context']]

# rename columns
df_val.rename(columns={'answer_context': 'context'}, inplace=True)

# Create the directory if it doesn't exist
data_dir = 'data'
os.makedirs(data_dir, exist_ok=True)

# Adjust file paths with the correct directory structure
train_file = os.path.join(data_dir, 'train_set.xlsx')
val_file = os.path.join(data_dir, 'val_set.xlsx')

# Save data to an Excel file
df_train.to_excel(train_file, index=False)
df_val.to_excel(val_file, index=False)

print(f"The data has been saved to the file {train_file}")
print(f"The data has been saved to the file {val_file}")

In [2]:
# Load data and preprocessing
import pandas as pd

# Load data from Excel files
train = pd.read_excel('data/train_set.xlsx')
val = pd.read_excel('data/val_set.xlsx')

# Remove rows with missing values
train_cleaned = train.dropna()
val_cleaned = val.dropna()

# Replace specific values in 'context' column
val_filled = val_cleaned.replace({"context": {"Employee: ": ""}}, regex=True)

# Remove duplicate records
train_deduplicated = train_cleaned.drop_duplicates()
val_deduplicated = val_filled.drop_duplicates()

# Convert data types if necessary on cleaned and deduplicated dataframes
train_dt = train_deduplicated.copy()
train_dt.loc[:, 'question'] = train_dt['question'].astype(str)
train_dt.loc[:, 'answer'] = train_dt['answer'].astype(str)
train_dt.loc[:, 'context'] = train_dt['context'].astype(str)

val_dt = val_deduplicated.copy()
val_dt.loc[:, 'question'] = val_dt['question'].astype(str)
val_dt.loc[:, 'answer'] = val_dt['answer'].astype(str)
val_dt.loc[:, 'context'] = val_deduplicated['context'].astype(str)

# Display the first 5 rows after cleaning
train_text= list(train_deduplicated['question'])
val_text= list(val_deduplicated['question'])

In [3]:
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual Hugging Face token
huggingface_token = "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"

# Log in to Hugging Face
login(huggingface_token)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\Administrator\.cache\huggingface\token
Login successful


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset
import torch
import wandb
import os

# Load the tokenizer and the pre-trained Llama3 model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Initialize the model without specifying the device here
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto",)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['question'], padding="max_length", truncation=True, max_length=258, return_tensors="pt")

# Apply the tokenization
train_data_tokenized = train_dt.apply(tokenize_function, axis=1)
val_data_tokenized = val_dt.apply(tokenize_function, axis=1)

# Define the custom dataset class
class CustomDataset(Dataset):
    def __init__(self, tokenized_data, answers):
        self.input_ids = [data['input_ids'][0] for data in tokenized_data]
        self.attention_mask = [data['attention_mask'][0] for data in tokenized_data]
        self.labels = tokenizer(answers, padding="max_length", truncation=True, max_length=258, return_tensors="pt")['input_ids']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx]
        }

# Prepare the dataset training
dataset_train = CustomDataset(train_data_tokenized, train_dt['answer'].tolist())
dataset_vali = CustomDataset(val_data_tokenized, val_dt['answer'].tolist())

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="my-lln-project"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

# pass "wandb" to the 'report_to' parameter to turn on wandb logging
training_args = TrainingArguments(
    output_dir='models',
    report_to="wandb",
    logging_steps=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=20,
    max_steps = 100,
    save_steps = 100
)

# define the trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_vali,
)
trainer.train()

# [optional] finish the wandb run, necessary in notebooks
wandb.finish()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 20.35it/s]
Some parameters are on the meta device device because they were offloaded to the disk and cpu.


: 

In [9]:
# Function to generate response
def generate_response(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)["input_ids"]
    output = model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example usage
question = "1 + 1 ="
response = generate_response(question)
print(f"Question: {question}")
print(f"Response: {response}")

Question: 1 + 1 =
Response: 1 + 1 = 2 (1 + 1 = 2)
2 + 2 = 4 (2 + 2 = 4)
3 + 3 = 6 (3 + 3 = 6)
4
