In [None]:
# !pip install -qqq -U git+https://github.com/huggingface/transformers.git
# !pip install -qqq -U git+https://github.com/huggingface/peft.git
# !pip install -qqq -U git+https://github.com/huggingface/accelerate.git
# !pip install -qqq bitsandbytes
# !pip install -qqq datasets
# !pip install evaluate
# !pip install rouge_score

In [3]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datetime import datetime
from datasets import load_dataset
import numpy as np
import nltk
import evaluate
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
# Expose only GPU index 0 to CUDA for this process.
# NOTE(review): this assignment runs after torch/bitsandbytes were imported
# above -- confirm it still takes effect before CUDA is first initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Load the labelled examples from a single CSV file; `load_dataset` exposes
# them under the "train" key. The prompt builder further down reads the
# "Input" and "Output" columns of each row.
dataset = load_dataset("csv", data_files="your csv file path")

# Hold out 20% of the rows for evaluation. The seed is fixed so the split is
# identical on every Restart & Run All; without it each run trains and
# evaluates on a different partition and results are not comparable.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_data = dataset["train"]
test_data = dataset["test"]

In [4]:
base_model_id = "HuggingFaceH4/zephyr-7b-alpha"

# Quantization settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized base model; device placement is delegated to
# `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    quantization_config=bnb_config,
)

# Tokenizer configured for left padding, an appended EOS token, and the same
# 784-token limit that `tokenize` uses.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_eos_token=True,
    padding_side="left",
    model_max_length=784,
)

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [6]:
def tokenize(prompt):
    """Encode ``prompt`` to a fixed length of 784 tokens and attach labels.

    The module-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"``. A copy of ``input_ids`` is stored under
    ``"labels"`` and the resulting encoding is returned.
    """
    encoding_options = {
        "truncation": True,
        "max_length": 784,
        "padding": "max_length",
    }
    encoded = tokenizer(prompt, **encoding_options)
    # Labels start out as an independent copy of the input ids.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

In [7]:
def generate_and_tokenize_prompt(data_point):
    """Build the instruction prompt for one example and tokenize it.

    Args:
        data_point: Mapping with an ``"Input"`` entry (the OCR text) and an
            ``"Output"`` entry (the target JSON), both interpolated into the
            prompt.

    Returns:
        Whatever ``tokenize`` returns for the assembled prompt.
    """
    # Literal braces must be doubled inside an f-string. Written as '{' ... '}'
    # the text between them was parsed as a replacement field containing a
    # string literal, so both brace characters silently vanished from the prompt.
    # NOTE(review): `tokenize` truncates to 784 tokens; with a long OCR text the
    # trailing JSON output may be cut off -- confirm against the data.
    full_prompt = f"""Extract specific data from the provided OCR text of a marriage record. Identify and return the following details in a structured JSON format: Application number, date of marriage ( month,day, year), place of marriage, birth details (month,day, year, place) of both spouses, their county, gender, given names, residence, state, and surnames. Only show the extracted information. Don't show the OCR Text again. Also don't show the pattern which you are asked to follow. Concentrate on the data asked to extract . The output must strictly adhere to the given details, starting with an opening curly brace '{{' and ending with a closing curly brace '}}'. Ensure that each key-value pair contains only accurate and relevant information from the OCR text. Avoid including any extraneous or irrelevant data that does not correspond directly to the specified keys. The structure of the output is as follows:

Application: The marriage Application number.
Spouse1_given: The first name of the first spouse.
Spouse1_surname: The last name of the first spouse.
Spouse1_birth_day: The birth day of the first spouse.
Spouse1_birth_month: The birth month of the first spouse.
Spouse1_birth_year: The birth year of the first spouse.
Spouse1_residence: The residence of the first spouse.
Spouse1_county: The county of the first spouse.
Spouse1_state: The state of the first spouse.
Spouse1_birthplace: The birthplace of the first spouse.
Spouse1_gender: The gender of the first spouse.
Spouse2_given: The first name of the second spouse.
Spouse2_surname: The last name of the second spouse.
Spouse2_maiden: The maiden name of the second spouse, if applicable.
Spouse2_birth_day: The birth day of the second spouse.
Spouse2_birth_month: The birth month of the second spouse.
Spouse2_birth_year: The birth year of the second spouse.
Spouse2_residence: The residence of the second spouse.
Spouse2_county: The county of the second spouse.
Spouse2_state: The state of the second spouse.
Spouse2_birthplace: The birthplace of the second spouse.
Spouse2_gender: The gender of the second spouse.
Marriage_day: The day of the marriage.
Marriage_month: The month of the marriage.
Marriage_year: The year of the marriage.
Marriage_place: The location of the marriage.

### OCR Text:
{data_point["Input"]}

### JSON Output:
{data_point["Output"]}
"""
    return tokenize(full_prompt)

In [8]:
# Build the prompt for every row and tokenize it; the training split first,
# then the held-out split that is used for evaluation.
tokenized_train_dataset = train_data.map(function=generate_and_tokenize_prompt)
tokenized_val_dataset = test_data.map(function=generate_and_tokenize_prompt)

Map:   0%|          | 0/21257 [00:00<?, ? examples/s]

Map:   0%|          | 0/9111 [00:00<?, ? examples/s]

In [9]:
# Enable gradient checkpointing on the loaded model, then pass it through
# PEFT's k-bit training preparation and keep the returned model.
# NOTE(review): re-running this cell applies the preparation to an already
# prepared model -- run it once per fresh kernel.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [10]:
def print_trainable_parameters(model):
    """Report how many of ``model``'s parameters are trainable.

    Walks ``model.named_parameters()`` once and prints the element count of
    parameters with ``requires_grad`` set, the element count of all
    parameters, and the trainable share as a percentage.
    """
    # (element count, trainable?) for every parameter, collected in one pass.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, is_trainable in sizes if is_trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [11]:
# LoRA adapter settings: rank 8, alpha 16, 5% dropout, no bias terms.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    # Adapted modules, grouped by role; order matches the original list.
    target_modules=(
        ["q_proj", "k_proj", "v_proj", "o_proj"]
        + ["gate_proj", "up_proj", "down_proj"]
        + ["lm_head"]
    ),
)

# Wrap the prepared model with the adapters, then report the trainable share.
model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 21260288 || all params: 7262992384 || trainable%: 0.2927207805812288


In [None]:
# Names for this run; checkpoints are written under ./<base_model_name>-<project>.
project = "finetuned"
base_model_name = "zephyr_7B_alpha"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"

# Make sure padding uses the EOS token before the collator is built.
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        # Output locations and run naming.
        output_dir=output_dir,
        logging_dir="./logs",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
        # Optimisation schedule.
        max_steps=30000,
        warmup_steps=5,
        learning_rate=1.5e-5,
        optim="paged_adamw_32bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        bf16=False,
        # Log, evaluate and checkpoint on the same 3000-step cadence.
        logging_steps=3000,
        evaluation_strategy="steps",
        eval_steps=3000,
        do_eval=True,
        save_strategy="steps",
        save_steps=3000,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Turn off the model's cache flag before training starts.
model.config.use_cache = False
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss
3000,1.0682,0.845657


