In [13]:
!pip install transformers==4.43.1
# Previously tried: 4.42.4. NOTE(review): 4.43.x is presumably pinned for Llama 3.1 support - confirm.

Collecting transformers==4.43.1
  Downloading transformers-4.43.1-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.7 kB ? eta -:--:--
     -------------------------------------- 43.7/43.7 kB 711.0 kB/s eta 0:00:00
Downloading transformers-4.43.1-py3-none-any.whl (9.4 MB)
   ---------------------------------------- 0.0/9.4 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.4 MB 6.3 MB/s eta 0:00:02
   --- ------------------------------------ 0.8/9.4 MB 8.7 MB/s eta 0:00:01
   ------ --------------------------------- 1.4/9.4 MB 10.0 MB/s eta 0:00:01
   -------- ------------------------------- 1.9/9.4 MB 9.4 MB/s eta 0:00:01
   ---------- ----------------------------- 2.4/9.4 MB 10.2 MB/s eta 0:00:01
   ------------ --------------------------- 2.9/9.4 MB 10.4 MB/s eta 0:00:01
   --------------- ------------------------ 3.5/9.4 MB 10.7 MB/s eta 0:00:01
   ---------------- ----------------------- 4.0/9.4 MB 10.6 MB/s eta 0:00:01
   -----

In [1]:
# !pip install -q datasets
# !pip install -q bitsandbytes
# !pip install -q peft
# !pip install -q accelerate
# !pip install -q trl
# !pip install -q wandb

In [84]:
# utility native libraries
import glob
import json
import pandas as pd
import pprint
from datetime import datetime
import functools

# installed libraries
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# quantization libraries
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [2]:
# Log in to the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required because some checkpoints used below are gated - confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# from custom_utils import *

In [132]:
# utility native libraries
import os
import glob
import json
import pandas as pd
import pprint
from datetime import datetime

# installed libraries
import torch
import transformers
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# quantization libraries
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm



class DownloadModel:
    """
    Downloads the supported checkpoints from the Hugging Face Hub.

    The 7B/8B checkpoints are loaded 4-bit quantized (bitsandbytes NF4) and
    wrapped with LoRA adapters; the small Pythia checkpoint is loaded as-is.
    Every loader returns ``{"model": ..., "tokenizer": ...}``.
    """

    def _quantize_model(self, model_artifact: dict) -> dict:
        """
        Wrap the model in ``model_artifact`` with LoRA adapters (PEFT).

        Despite the name, this does not quantize anything: quantization is
        requested at load time through ``BitsAndBytesConfig``. The name is
        kept so existing callers keep working.

        Args:
            model_artifact: dict: must contain the key "model"

        Returns:
            model_artifact: dict: the same dict, with "model" replaced by the
            PEFT-wrapped model
        """
        model = model_artifact["model"]
        config = LoraConfig(
            r=32,
            lora_alpha=64,
            target_modules=[
                "q_proj",
                "k_proj",
                "v_proj",
                "o_proj",
                "gate_proj",
                "up_proj",
                "down_proj",
                "lm_head",
            ],
            bias="none",
            lora_dropout=0.05,
            task_type="CAUSAL_LM",
        )

        model = get_peft_model(model, config)
        model_artifact["model"] = model

        return model_artifact

    def _load_4bit_lora_model(self, model_id: str) -> dict:
        """
        Load a decoder-only checkpoint in 4-bit and prepare it for LoRA training.

        Shared by the Mistral, Llama and Hermes loaders so that all three go
        through exactly the same quantization / k-bit preparation steps.

        Args:
            model_id: str: Hugging Face Hub id of the checkpoint

        Returns:
            model_artifact: dict: {"model": PEFT-wrapped model, "tokenizer": tokenizer}
        """
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        # Mistral / Llama / Hermes are decoder-only architectures, so they must
        # be loaded with the causal-LM auto class (the seq2seq auto class has
        # no mapping for these configurations).
        model = AutoModelForCausalLM.from_pretrained(
            model_id, quantization_config=bnb_config)
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)
        model_artifact = {"model": model, "tokenizer": tokenizer}
        return self._quantize_model(model_artifact)

    def _get_mistral_model(self):
        """
        Download mistralai/Mistral-7B-Instruct-v0.2 (4-bit + LoRA).
        """
        return self._load_4bit_lora_model("mistralai/Mistral-7B-Instruct-v0.2")

    def _get_llama_model(self):
        """
        Download meta-llama/Meta-Llama-3.1-8B-Instruct (4-bit + LoRA).
        """
        return self._load_4bit_lora_model("meta-llama/Meta-Llama-3.1-8B-Instruct")

    def _get_eluether_model(self):
        """
        Download EleutherAI/pythia-70m in full precision, without LoRA.
        """
        model_id = "EleutherAI/pythia-70m"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id)
        model_artifact = {"model": model, "tokenizer": tokenizer}
        return model_artifact

    def _get_hermes_model(self):
        """
        Download NousResearch/Hermes-3-Llama-3.1-8B (4-bit + LoRA).
        """
        return self._load_4bit_lora_model("NousResearch/Hermes-3-Llama-3.1-8B")

    def download(self, model_name):
        """
        A function to interface the different model downloaders

        Args:
            model_name: str: Hub id of the model to be downloaded

        Returns:
            model_artifact: dict: {"model": ..., "tokenizer": ...} as returned
            by the matching loader

        Raises:
            ValueError: if ``model_name`` is not one of the supported ids
        """
        model_map = {
            "mistralai/Mistral-7B-Instruct-v0.2": self._get_mistral_model,
            "NousResearch/Hermes-3-Llama-3.1-8B": self._get_hermes_model,
            "EleutherAI/pythia-70m": self._get_eluether_model,
            "meta-llama/Meta-Llama-3.1-8B-Instruct": self._get_llama_model
        }
        if model_name not in model_map:
            raise ValueError("Model not found")

        return model_map[model_name]()

class PromptTemplates:
    """Builds prompt strings from paired clinical-text (.txt) and label (.json) files."""

    def _input_output_template(self):
        """Placeholder; not implemented yet."""
        pass

    def _icl_template(self, data_dir="/content/modified_data"):
        """
        A function to create the instruct response template

        Every ``*.txt`` file in ``data_dir`` is paired with a ``*.json`` file
        by position after sorting both file lists, so matching files must
        sort into the same order (e.g. share a stem).

        Args:
            data_dir: str: directory holding the .txt / .json pairs

        Returns:
            dict:
                "with_response": list of {"text": prompt including the response}
                "without_response": list of {"test_input": prompt without the
                    response, "response": parsed JSON label}

        Raises:
            None
        """
        text_template_with_response = """
            Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.


            ### Instruction:
            1. Understand the clinical text
            2. Extract the medicine prescription names
            3. Extract the dosage of medicine prescriptions
            4. Extract the seizure frequency
            5. The parts of sentences in <emphasize> tags are important
            5. structure  the response in the json format as mentioned in the example
            response

            ### Input:
            {input}

            ### Response:
            {response}
        """

        text_template_without_response = """
            Given the instruction, please write the response in the json format

            ### Instruction:
            Given the clinical text, please
            1. Extract the medicine prescription names
            2. Extract the dosage of medicine prescriptions
            3. Extract the seizure frequency
            4. structure the response in the json format


            ### Clinical Text:
            {clinical_text}

            ### Response:
        """
        with_response = []
        without_response = []

        # glob returns files in arbitrary order; sort both lists so that the
        # i-th text is paired with the i-th label deterministically.
        questions = sorted(glob.glob(f"{data_dir}/*.txt"))
        responses = sorted(glob.glob(f"{data_dir}/*.json"))

        for clinical_text_file, response_file in zip(questions, responses):
            with open(clinical_text_file, "r") as f:
                clinical_text = f.read()

            with open(response_file, "r") as f:
                response = json.load(f)

            # The training template's placeholder is named ``{input}``; passing
            # ``clinical_text=`` here used to raise KeyError('input').
            # NOTE(review): ``response`` is rendered with str(dict), i.e. a
            # Python repr (single quotes), not strict JSON - confirm intended.
            text_with_prompt_template_qa = text_template_with_response.format(
                input=clinical_text, response=response)
            with_response.append({
                "text": text_with_prompt_template_qa})

            text_with_prompt_template_q = text_template_without_response.format(
                clinical_text=clinical_text)
            without_response.append(
                {"test_input": text_with_prompt_template_q,
                "response": response})

        return {
            "with_response": with_response,
            "without_response": without_response
        }


class Preprocess:
    """Turns paired clinical-text (.txt) / label (.json) files into a tokenized dataset."""

    def _input_output_template_preprocess(self, tokenizer, data_path, max_length=250):
        """
        A function to create the input output template preprocess

        Steps:
            1. Pair every ``*.txt`` file in ``data_path`` with a ``*.json``
               file (by position, after sorting both lists).
            2. Write one JSON record per pair to ``{data_path}/tmp.jsonl``
               (the file is overwritten on every call).
            3. Load that file with ``datasets`` and tokenize the "text" column,
               truncating on the right to at most ``max_length`` tokens.

        Side effects: sets ``tokenizer.pad_token`` to ``tokenizer.eos_token``
        and ``tokenizer.truncation_side`` to "right".

        Args:
            tokenizer: object: tokenizer object to encode / decode
            data_path: str: path of the data
            max_length: int: maximum number of tokens kept per example

        Returns:
            tokenized_dataset: dataset returned by ``datasets.load_dataset``
            with the columns input / output / text / test_prompt plus the
            tokenizer outputs, formatted as "torch"

        Raises:
            ValueError: if ``data_path`` contains no .txt/.json pair
        """
        text_format = """ 
            Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction:
                1. Understand the clinical text.
                2. Extract the medicine prescription names.
                3. Extract the dosage of medicine prescriptions.
                4. Extract the seizure frequency.
                5. The parts of sentences in <emphasize> tags are important.
                5. structure  the response in the json format as mentioned in ### Response section. \n ### Input:  {input} \n ### Response: {output} \n
            """
        test_format = """ 
            Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request. ### Instruction:
                1. Understand the clinical text.
                2. Extract the medicine prescription names.
                3. Extract the dosage of medicine prescriptions.
                4. Extract the seizure frequency.
                5. The parts of sentences in <emphasize> tags are important.
                5. structure  the response in the json format as mentioned in ### Response section. \n ### Input:  {input} \n ### Response:\n
            """

        file_path = f"{data_path}/tmp.jsonl"

        # glob order is arbitrary; sort so text i is always paired with label i.
        # ("*.json" does not match the generated tmp.jsonl.)
        file_pairs = list(zip(
            sorted(glob.glob(f"{data_path}/*.txt")),
            sorted(glob.glob(f"{data_path}/*.json")),
        ))
        if not file_pairs:
            raise ValueError(f"No .txt/.json pairs found in {data_path}")

        # Opening with "w" replaces any stale file from a previous run.
        with open(file_path, "w") as fw:
            for txt_file, json_file in file_pairs:
                # read the txt files
                with open(txt_file, "r") as f:
                    text_data = f.read()

                # read the json files
                with open(json_file, "r") as f:
                    json_data = str(json.load(f))

                data_line = {
                    "input": text_data,
                    "output": json_data,
                    "text": text_format.format(input=text_data, output=json_data),
                    "test_prompt": test_format.format(input=text_data)
                }

                json.dump(data_line, fw)
                # JSON Lines needs exactly one record per line.
                fw.write("\n")

        def _tokenize_data(example):
            # ``map`` is called un-batched, so example["text"] is the full
            # string of one record. Plain Python lists are returned (no
            # return_tensors) so every row holds a flat token sequence.
            return tokenizer(
                example["text"],
                truncation=True,
                max_length=max_length,
            )

        tokenizer.pad_token = tokenizer.eos_token
        # assumption that the important values are found on the left of the data
        tokenizer.truncation_side = "right"

        data = datasets.load_dataset("json", data_files=file_path, split="train")
        tokenized_dataset = data.map(_tokenize_data)
        tokenized_dataset = tokenized_dataset.with_format("torch")
        return tokenized_dataset

    def preprocess(self, tokenizer, data_path, max_length=250):
        """
        Public entry point; delegates to the instruct (input/output) template preprocess.

        Args:
            tokenizer: object: tokenizer object to decode / encode the text data
            data_path: str: path of the dataset repository
            max_length: int: maximum number of tokens kept per example

        Returns:
            tokenized_dataset: see ``_input_output_template_preprocess``

        Raises:
            ValueError: if ``data_path`` contains no .txt/.json pair
        """
        return self._input_output_template_preprocess(
            tokenizer=tokenizer, data_path=data_path, max_length=max_length)


class Postprocess:
    """Placeholder for post-processing of model outputs; nothing is implemented yet."""

    def _postprocess(self):
        """Stub: performs no work and returns ``None``."""
        return None


class Model:
    """Training, evaluation (stub) and inference helpers for the downloaded models."""

    def _fine_tune_model(self, model_name, model, tokenizer, tokenized_dataset, k_split=1, max_length=250):
        """
        A function to train the model using the fine tuning technique

        The dataset is split 90/10 into train / eval, trained with
        ``transformers.Trainer`` for 100 steps and the resulting model is
        saved to ``./demo<ORG>`` where ``<ORG>`` is the upper-cased part of
        ``model_name`` before the first "/".

        Args:
            model_name: str: Hub id of the model (used to name the run / output dir)
            model: object: model object to train
            tokenizer: object: tokenizer used by the data collator for padding
            tokenized_dataset: dataset exposing ``train_test_split``
            k_split: int: accepted for interface compatibility; currently
                unused (TODO: k-fold cross validation)
            max_length: int: sequence length used only for the FLOPs estimate

        Returns:
            model: object: the same model object after training

        Raises:
            None
        """
        # wandb setup
        project = "demo"
        run_name = project + model_name.split("/")[0].upper()
        output_dir = "./" + run_name

        # training arguments
        train_args = transformers.TrainingArguments(
                output_dir=output_dir,
                warmup_steps=5,
                per_device_train_batch_size=1,
                gradient_checkpointing=True,
                gradient_accumulation_steps=4,
                max_steps=100,
                learning_rate=1.0e-4, # Want about 10x smaller than the Mistral learning rate
                logging_steps=50,
                bf16=False,
                optim="paged_adamw_8bit",
                logging_dir="./logs",        # Directory for storing logs
                save_strategy="steps",
                save_steps=50,               # Save a checkpoint every 50 steps
                eval_strategy="steps",
                eval_steps=50,               # Evaluate every 50 steps
                do_eval=True,
                report_to="wandb",           # Comment this out if you don't want to use weights & biases
                run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
            )

        # rough compute estimate for one optimizer step
        model_flops = (
            model.floating_point_ops(
                {"input_ids": torch.zeros((1, max_length))}
            )
            * train_args.gradient_accumulation_steps
        )

        print(model)
        print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
        print("Flops", model_flops / 1e9, "GFLOPs")

        # Fixed seed so the train / eval split is the same on every run.
        split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = split_dataset["train"]
        test_dataset = split_dataset["test"]
        trainer = transformers.Trainer(
            model=model,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
            args=train_args,
            data_collator=transformers.DataCollatorForLanguageModeling(
                    tokenizer, mlm=False)
        )
        # The KV cache is not useful during training and conflicts with
        # gradient checkpointing.
        model.config.use_cache = False
        trainer.train()
        model.save_pretrained(output_dir)

        return model

    def _icl_model(self, model_name, model, tokenizer, tokenized_dataset, k_split=1, max_length=250):
        """
        Placeholder for the in-context learning path; not implemented yet.

        Accepts the same arguments as ``_fine_tune_model`` so that ``train``
        can dispatch to either implementation.

        Returns:
            None
        """
        pass

    def train(self, model_name, model, tokenizer, tokenized_dataset, k_split=1, max_length=250, train_type="fine_tune"):
        """
        An interface function to train the model as per the training type

        Args:
            model_name: str: Hub id of the model
            model: object: model object to train
            tokenizer: object: tokenizer matching the model
            tokenized_dataset: tokenized dataset to train on
            k_split: int: forwarded to the selected trainer
            max_length: int: forwarded to the selected trainer
            train_type: str: "fine_tune" or "icl"

        Returns:
            whatever the selected trainer returns (the trained model for
            "fine_tune", None for "icl")

        Raises:
            ValueError: if ``train_type`` is not supported
        """
        train_map = {
            "icl": self._icl_model,
            "fine_tune": self._fine_tune_model
        }

        if train_type not in train_map:
            raise ValueError("Train type not found")

        # Forward the caller's values instead of hard-coded defaults.
        return train_map[train_type](
            model_name, model, tokenizer, tokenized_dataset,
            k_split=k_split, max_length=max_length
        )

    def evaluate(self):
        """
        A function that implements the evaluation strategy for model
        (not implemented yet).
        """
        pass

    def inference(self, data, model, tokenizer, max_input_tokens=1000, max_output_tokens=250):
        """
        A function to perform model inferencing

        Args:
            data: str: data sample to infer on
            model: object: fine tuned model to perform the model inferencing on
            tokenizer: object: tokenizer to encode the sample text
            max_input_tokens: int: prompt is truncated to this many tokens
            max_output_tokens: int: maximum number of newly generated tokens

        Returns:
            generated_text_answer: str: text generated from the model, without the prompt

        Raises:
            None
        """
        # Tokenize (also yields the attention mask that generate() expects)
        encoded = tokenizer(
            data,
            return_tensors="pt",
            truncation=True,
            max_length=max_input_tokens
        )

        device = model.device
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # Generate. ``max_new_tokens`` bounds only the continuation;
        # ``max_length`` would also count the prompt, so a prompt longer than
        # ``max_output_tokens`` would leave no room for any output.
        generated_tokens_with_prompt = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_output_tokens
        )

        # Strip the prompt at token level: slicing the decoded string by
        # len(data) is wrong whenever the prompt was truncated or does not
        # decode back to exactly the same characters.
        prompt_length = input_ids.shape[1]
        new_tokens = generated_tokens_with_prompt[0][prompt_length:]

        # Decode
        generated_text_answer = tokenizer.decode(
            new_tokens,
            skip_special_tokens=True
        )

        return generated_text_answer



In [133]:
class Pipeline:
    """Wires the download, preprocessing and training helpers together for one model."""

    def __init__(self, model_name: str, dataset_path: str):
        """
        Create the helper objects and download the requested model.

        Args:
            model_name: str: Hub id understood by ``DownloadModel.download``
            dataset_path: str: directory holding the dataset files
        """
        self.preprocess_handler = Preprocess()
        self.model_downloader = DownloadModel()
        self.postprocess_handler = Postprocess()
        self.model_mechanix = Model()

        self.model_name, self.data_path = model_name, dataset_path

        # self.model is the artifact dict: {"model": ..., "tokenizer": ...}
        self.model = self.model_downloader.download(self.model_name)

    def run(self, template_type="fine_tune"):
        """
        Preprocess the dataset and train the downloaded model.

        The tokenized dataset is stored on ``self.data`` and the result of
        training on ``self.ft_model``.

        Args:
            template_type: str: training type forwarded to ``Model.train``

        Return:
            None

        Raises:
            None
        """
        artifacts = self.model
        tokenizer = artifacts["tokenizer"]

        # step 1: build the tokenized dataset
        self.data = self.preprocess_handler.preprocess(
            tokenizer=tokenizer,
            data_path=self.data_path,
        )

        # step 2: train
        train_kwargs = dict(
            model_name=self.model_name,
            model=artifacts["model"],
            tokenizer=tokenizer,
            tokenized_dataset=self.data,
            k_split=1,
            max_length=250,
            train_type=template_type,
        )
        self.ft_model = self.model_mechanix.train(**train_kwargs)

        # evaluate the model
        # self.eval_data = self.model_mechanix.evaluate(
        #     model_name=model_name,
        #     model=self.model["model"],
        #     tokenizer=self.model["tokenizer"],
        #     data=self.data
        # )
