# Setup


In [1]:
!pip install -q datasets peft bitsandbytes accelerate einops trl wandb transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━

# Imports


In [2]:
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
)
import torch
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    AutoPeftModelForCausalLM,
)
from trl import SFTTrainer

# Loading the dataset


In [3]:
from datasets import load_dataset

In [4]:
# Pull only the training split of the Shakespearean/modern English
# conversational dataset from the Hugging Face Hub.
sandmec = load_dataset(
    path="Roudranil/shakespearean-and-modern-english-conversational-dataset",
    split="train",
)

Downloading readme:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

# Loading the models and testing out base inference


In [5]:
# Short alias -> Hugging Face Hub repository id for every candidate base model.
MODELS = dict(
    falcon1b="ericzzz/falcon-rw-1b-instruct-openorca",
    tinyLlama="TinyLlama/tinyLlama-intermediate-checkpoints-after-1T-token",
    minichat3b="GeneZC/MiniChat-1.5-3B",
    incitechat3b="togethercomputer/RedPajama-INCITE-Chat-3B-v1",
    mistral7b="mistralai/Mistral-7B-Instruct-v0.2",
)
# Short alias -> prompt-formatting function; filled in by the per-model cells.
MODEL_TEMPLATE_FORMATS = dict()

In [6]:
class FineTuner:
    """Convenience wrapper for fine-tuning a causal LM with LoRA adapters via TRL's ``SFTTrainer``.

    Intended call order::

        tuner = FineTuner(model_name, dataset)
        tuner.init_config()   # fill the *_kwargs dicts with defaults
        tuner.init_model()    # load quantized model + tokenizer, attach LoRA
        tuner.train()         # build TrainingArguments / SFTTrainer and train

    The ``*_kwargs`` attributes are plain dictionaries so that callers can edit
    them between the steps above (e.g. add ``formatting_func`` to
    ``trainer_kwargs`` or change ``max_steps`` in ``training_config_kwargs``).
    """

    def __init__(
        self,
        model_name: str,
        dataset: Dataset,
    ):
        """Remember the model id and training dataset; nothing is loaded yet.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                model and the tokenizer.
            dataset: Training dataset handed to ``SFTTrainer`` as
                ``train_dataset``.
        """
        self.model_name = model_name
        self.dataset = dataset

        # Keyword arguments for the config/trainer objects; empty until
        # init_config() is called.
        self.bnb_config_kwargs = {}
        self.peft_config_kwargs = {}
        self.training_config_kwargs = {}
        self.trainer_kwargs = {}

        # Config objects built from the kwargs above.
        self.bnb_config = None
        self.peft_config = None
        self.training_args = None

        # Heavy objects created by init_model() / train().
        self.model = None
        self.tokenizer = None
        self.trainer = None

    def init_model(self):
        """Load the quantized base model and tokenizer and wrap the model with LoRA adapters.

        Raises:
            TunerError: If the quantization or PEFT kwargs are still empty,
                i.e. ``init_config()`` has not been called.
        """
        if not self.bnb_config_kwargs or not self.peft_config_kwargs:
            raise TunerError(
                "Please initialize the config with finetuner.init_config() before proceeding"
            )

        # load configs
        self.bnb_config = BitsAndBytesConfig(**self.bnb_config_kwargs)
        self.peft_config = LoraConfig(**self.peft_config_kwargs)

        # load model and tokenizer
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            quantization_config=self.bnb_config,
            use_cache=False,
            device_map="auto",
        )
        # NOTE(review): presumably only relevant to Llama-style configs —
        # confirm it is needed for every entry in MODELS.
        model.config.pretraining_tp = 1
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Reuse EOS as the padding token.
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # prepare model for k-bit training and get the peft model
        model = prepare_model_for_kbit_training(model)
        self.model = get_peft_model(model, self.peft_config)

    def train(self, resume_from_checkpoint=None):
        """Build ``TrainingArguments`` and an ``SFTTrainer`` from the current kwargs and train.

        A new trainer is created on every call, but it wraps the same
        ``self.model`` object, so repeated calls keep training the same model.

        Args:
            resume_from_checkpoint: Forwarded unchanged to
                ``SFTTrainer.train``. The default ``None`` matches the previous
                behaviour of calling ``train()`` without arguments.

        Raises:
            TunerError: If ``init_config()`` or ``init_model()`` has not been
                called yet.
        """
        if not self.training_config_kwargs:
            raise TunerError(
                "Please initialize the config with finetuner.init_config() before proceeding"
            )
        # Explicit None check: truthiness of a model object is not a reliable
        # "was it loaded" signal.
        if self.model is None:
            raise TunerError(
                "Please initialize the model with finetuner.init_model() before proceeding"
            )

        # Derive a default output directory from the model name, but keep an
        # output_dir the caller already placed in training_config_kwargs.
        self.training_config_kwargs.setdefault(
            "output_dir", f"./{self.model_name.split('/')[-1]}-results"
        )
        self.training_args = TrainingArguments(**self.training_config_kwargs)
        # init trainer
        self.trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.dataset,
            peft_config=self.peft_config,
            tokenizer=self.tokenizer,
            args=self.training_args,
            **self.trainer_kwargs,
        )
        self.trainer.train(resume_from_checkpoint=resume_from_checkpoint)

    def init_config(self):
        """Reset all four kwargs dictionaries to their default values.

        Any entries previously stored in the dictionaries (including ones added
        by the caller) are discarded, because each dictionary is replaced.
        """
        # 4-bit NF4 quantization with double quantization, fp16 compute dtype.
        self.bnb_config_kwargs = {
            "load_in_4bit": True,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_compute_dtype": torch.float16,
        }
        # LoRA adapter settings for causal language modelling.
        self.peft_config_kwargs = {
            "lora_alpha": 16,
            "lora_dropout": 0.1,
            "r": 64,
            "bias": "none",
            "task_type": "CAUSAL_LM",
        }
        # Both num_train_epochs and max_steps are set here; callers in this
        # notebook delete or override max_steps when they want epoch-based runs.
        self.training_config_kwargs = {
            "num_train_epochs": 3,
            "per_device_train_batch_size": 2,
            "gradient_accumulation_steps": 2,
            "gradient_checkpointing": True,
            "gradient_checkpointing_kwargs": {"use_reentrant": False},
            "optim": "paged_adamw_32bit",
            "logging_steps": 10,
            "save_steps": 50,
            "max_steps": 500,
            "learning_rate": 2e-4,
            "fp16": True,
            "max_grad_norm": 0.3,
            "warmup_ratio": 0.03,
            "lr_scheduler_type": "constant",
        }
        self.trainer_kwargs = {"max_seq_length": 128}

    def get_model(self):
        """Return the PEFT-wrapped model, or ``None`` (after printing a hint) if it was never loaded."""
        if self.model is None:
            print(
                "Your model is not initliased. Consider running the init_model() method first."
            )
        return self.model

In [7]:
class TunerError(Exception):
    """Raised by FineTuner when a step is called before its prerequisite step has run."""

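## [`ericzzz/falcon-rw-1b-instruct-openorca`](https://huggingface.co/ericzzz/falcon-rw-1b-instruct-openorca) (based on [`tiiuae/falcon-rw-1b`](https://huggingface.co/tiiuae/falcon-rw-1b))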


In [8]:
# Build a tuner for the Falcon 1B model, populate the default configs, then
# load the quantized model and tokenizer and attach the LoRA adapters.
tuner = FineTuner(model_name=MODELS["falcon1b"], dataset=sandmec)
tuner.init_config()
tuner.init_model()

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/89.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [9]:
def falcon_prompt_template(sample):
    """Render one dataset row as a single-line ``<SYS> … <INST> … <RESP> …`` prompt.

    Args:
        sample: Mapping with the keys ``"translated_dialog"`` (user turn) and
            ``"og_response"`` (reply; may be an empty string).

    Returns:
        The formatted prompt string.
    """
    user_turn = sample["translated_dialog"]
    reply = sample["og_response"]
    persona = "You are a Shakespearean AI assistant who replies to the user only in Shakespearean English"
    return "<SYS> {} <INST> {} <RESP> {}".format(persona, user_turn, reply)


# Register the Falcon template and tell the trainer to format rows with it,
# packing the formatted examples.
MODEL_TEMPLATE_FORMATS["falcon1b"] = falcon_prompt_template
tuner.trainer_kwargs.update(
    formatting_func=MODEL_TEMPLATE_FORMATS["falcon1b"],
    packing=True,
)

In [12]:
# Switch from the step-capped default schedule to an epoch-based one and
# checkpoint/log less often.
# pop(..., None) instead of `del` keeps this cell re-runnable: executing it a
# second time no longer raises KeyError once "max_steps" has been removed.
tuner.training_config_kwargs.pop("max_steps", None)
tuner.training_config_kwargs["num_train_epochs"] = 10
tuner.training_config_kwargs["save_steps"] = 150
tuner.training_config_kwargs["logging_steps"] = 100

In [None]:
# Start fine-tuning: train() builds TrainingArguments and an SFTTrainer from
# the tuner's current kwargs and then runs the training loop.
tuner.train()

Step,Training Loss
100,2.3394
200,2.4113
300,2.3997
400,2.3981
500,2.3879
600,2.3213
700,2.3358
800,2.2917
900,2.3194
1000,2.2887


Let's use checkpoint 1200.


Next, let's run it for 10 epochs.


In [13]:
# Train again. Each train() call constructs a fresh SFTTrainer around the same
# in-memory PEFT model, so any adapter updates from earlier runs in this
# kernel are still present when this run starts.
# NOTE(review): confirm that continuing from the previous weights is intended.
tuner.train()

Generating train split: 0 examples [00:00, ? examples/s]

Step,Training Loss
100,2.7188
200,2.5621
300,2.5102
400,2.4931
500,2.4774
600,2.4458
700,2.4305
800,2.3684
900,2.421
1000,2.3962


Step,Training Loss
100,2.7188
200,2.5621
300,2.5102
400,2.4931
500,2.4774
600,2.4458
700,2.4305
800,2.3684
900,2.421
1000,2.3962


KeyboardInterrupt: ignored

In [15]:
# Another training invocation after the interrupted run; it reuses the same
# tuner (and therefore the same in-memory model) as the previous call.
tuner.train()

Step,Training Loss
100,2.1205
200,2.1472
300,2.1181
400,2.1312
500,2.1314
600,2.1059
700,2.1056
800,2.0565
900,2.1041
1000,2.0939


KeyboardInterrupt: ignored

I decided to stop at step 3000 in this new run because I was running dangerously close to the GPU limit for today. Also, the loss seems to be increasing after step 2500.


In [16]:
# Reload the adapter checkpoint written at step 2400 (4-bit weights, fp16 torch
# dtype) and the tokenizer saved in the same directory.
output_dir = "/content/falcon-rw-1b-instruct-openorca-results/checkpoint-2400"
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir, load_in_4bit=True, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

In [17]:
# Format a prompt with an empty response slot and sample a continuation from
# the fine-tuned Falcon model (nucleus sampling, up to 1024 new tokens).
sample = dict(translated_dialog="Hi how are you?", og_response="")
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, truncation=True, return_tensors="pt")["input_ids"].cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    max_new_tokens=1024,
)



In [18]:
# Print only the generated continuation. generate() returns prompt tokens
# followed by the new tokens, so the prompt is removed by token count; cutting
# the decoded string at len(prompt) is only correct when decoding reproduces
# the prompt character-for-character.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

  I am well.  


In [33]:
# Ask the fine-tuned Falcon model about football and print the reply with each
# double-space-separated chunk on its own line.
sample = {
    "translated_dialog": "Can you describe the sport of football?",
    "og_response": "",
}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
reply = tokenizer.batch_decode(
    outputs[:, input_ids.shape[1] :].detach().cpu().numpy(), skip_special_tokens=True
)[0]
print("\n".join(reply.split("  ")))


Well, I can tell you of a sport that I have seen and heard of in the country.



In [32]:
# Ask the fine-tuned Falcon model about a court poet and print the reply with
# each double-space-separated chunk on its own line.
sample = {
    "translated_dialog": "Can you describe the life of a poet in a king's court?",
    "og_response": "",
}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
reply = tokenizer.batch_decode(
    outputs[:, input_ids.shape[1] :].detach().cpu().numpy(), skip_special_tokens=True
)[0]
print("\n".join(reply.split("  ")))


 To give the dutiful slave of my love an end,

I'll tell you the world's greatest story,
 which by my life

Is far more glorious than any other story,

And when that's done, we'll make a note of it,

And you shall hear it from me in full.

[To LUCENTIO]

And, when you have made me as happy as you can,

I'll leave you with a pattern of my life.
 


In [22]:
# Ask the fine-tuned Falcon model about the monsoon and print the raw reply.
sample = {
    "translated_dialog": "Can you tell me about the joys of monsoon?",
    "og_response": "",
}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

   The monsoon, when it rains, is a wonderful sight.    The waters are more abundant and higher    than the season of floods in the south-west    and in the south. The thunder and lightnings are    more like to those of the south-winds.   But, my lord,    the joys of the monsoon are so numerous,    and the experience of them so diverse,    that it would be tedious to list them all.   


In [25]:
# Follow-up question to the fine-tuned Falcon model; the reply is printed with
# each double-space-separated chunk on its own line.
sample = {"translated_dialog": "But is monsoon better than summer?", "og_response": ""}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
reply = tokenizer.batch_decode(
    outputs[:, input_ids.shape[1] :].detach().cpu().numpy(), skip_special_tokens=True
)[0]
print("\n".join(reply.split("  ")))


 Summer is nothing,

As monsoon, in the time of it.

If it be the season of the king,

'Tis more than the king can wish:
 he cannot be as rich,

Or his kingdom be as much as t he is.

Summer, being done with, is a monsoon's heir;

And yet the world would fain have summer.
 


In [29]:
# Another follow-up for the fine-tuned Falcon model; the reply is printed with
# each double-space-separated chunk on its own line.
sample = {
    "translated_dialog": "And is monsoon better than winter or spring?",
    "og_response": "",
}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
reply = tokenizer.batch_decode(
    outputs[:, input_ids.shape[1] :].detach().cpu().numpy(), skip_special_tokens=True
)[0]
print("\n".join(reply.split("  ")))


I have no mind to compare.



In [30]:
# Rephrased follow-up for the fine-tuned Falcon model; the reply is printed
# with each double-space-separated chunk on its own line.
sample = {
    "translated_dialog": "But i want a reply as to if monsoon is better than winter or spring",
    "og_response": "",
}
prompt = MODEL_TEMPLATE_FORMATS["falcon1b"](sample)
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
reply = tokenizer.batch_decode(
    outputs[:, input_ids.shape[1] :].detach().cpu().numpy(), skip_special_tokens=True
)[0]
print("\n".join(reply.split("  ")))


Monsoon is more agreeable than winter or spring.



In [None]:
# Trigger a browser download of the zipped Falcon checkpoint from the Colab VM.
# NOTE(review): no visible cell creates /content/falcon-1b.zip — confirm the
# archive step was run (the other models use an explicit `!zip` cell).
from google.colab import files

files.download("/content/falcon-1b.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## [`togethercomputer/RedPajama-INCITE-Chat-3B-v1`](https://huggingface.co/togethercomputer/RedPajama-INCITE-Chat-3B-v1)


In [None]:
# Build a tuner for RedPajama-INCITE-Chat-3B, populate the default configs,
# then load the quantized model and tokenizer and attach the LoRA adapters.
tuner = FineTuner(model_name=MODELS["incitechat3b"], dataset=sandmec)
tuner.init_config()
tuner.init_model()

config.json:   0%|          | 0.00/630 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [None]:
def incite_prompt_template(sample):
    """Render one dataset row as a three-line ``<system>/<human>/<bot>`` prompt.

    Args:
        sample: Mapping with the keys ``"translated_dialog"`` (user turn) and
            ``"og_response"`` (reply; may be an empty string).

    Returns:
        The formatted prompt string, with turns separated by newlines.
    """
    user_turn = sample["translated_dialog"]
    reply = sample["og_response"]
    persona = "You are a Shakespearean AI assistant who replies to the user only in Shakespearean English"
    turns = (
        "<system>: {}".format(persona),
        "<human>: {}".format(user_turn),
        "<bot>: {}".format(reply),
    )
    return "\n".join(turns)


# Register the INCITE template and tell the trainer to format rows with it,
# packing the formatted examples.
MODEL_TEMPLATE_FORMATS["incitechat3b"] = incite_prompt_template
tuner.trainer_kwargs.update(
    formatting_func=MODEL_TEMPLATE_FORMATS["incitechat3b"],
    packing=True,
)

In [None]:
# Fine-tune RedPajama-INCITE with the unmodified init_config() training
# defaults (max_steps=500, logging every 10 steps, checkpoint every 50).
tuner.train()

Generating train split: 0 examples [00:00, ? examples/s]

You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,3.3543
20,2.8487
30,2.577
40,2.6023
50,2.3544
60,2.3664
70,2.4366
80,2.3405
90,2.3673
100,2.3771


I will probably use checkpoint 300 here.


In [None]:
# Reload the adapter checkpoint written at step 300 (4-bit weights, fp16 torch
# dtype) and the tokenizer saved in the same directory.
output_dir = "/content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300"
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir, load_in_4bit=True, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Ask the fine-tuned INCITE model about the weather and print its reply.
# NOTE(review): this prompt omits the "<system>: ..." line that
# incite_prompt_template adds during training — confirm the mismatch is intended.
sample = {"translated_dialog": "How is the weather today?", "og_response": ""}
prompt = f"<human>: {sample['translated_dialog']}\n<bot>:"
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

    It is a good day for a storm, a foul day    For a fair weather.  


In [None]:
# Ask the fine-tuned INCITE model to describe mountains and print its reply.
# NOTE(review): this prompt omits the "<system>: ..." line that
# incite_prompt_template adds during training — confirm the mismatch is intended.
sample = {
    "translated_dialog": "Describe the beauty of the mountains",
    "og_response": "",
}
prompt = f"<human>: {sample['translated_dialog']}\n<bot>:"
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

    'Tis not a cloud that is in the sky,    But the great cloud of heaven that is there,    That spreads o'er all the world like snow;    And, in the middle of it, in the brightest part,    There stands the fairest creature that ever I saw,    My mistress, my lady, my sovereign queen;    And in her arms I see myself laid,    And in her heart I'll live and reign for ever.


In [None]:
# Archive the whole checkpoint-300 directory into incite-3b.zip in the current
# working directory. Per the listing below this includes optimizer/scheduler
# state as well as the adapter weights.
!zip -r incite-3b.zip /content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300

  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/ (stored 0%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/special_tokens_map.json (deflated 60%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/adapter_model.safetensors (deflated 8%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/adapter_config.json (deflated 47%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/scheduler.pt (deflated 57%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/trainer_state.json (deflated 84%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/optimizer.pt (deflated 8%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/README.md (deflated 66%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/tokenizer_config.json (deflated 92%)
  adding: content/RedPajama-INCITE-Chat-3B-v1-results/checkpoint-300/rng_state.pth (deflated 25%)
  a

In [None]:
# Mount Google Drive at /content/drive (forcing a remount if it is already
# mounted) so the archive can be copied there.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the archive into the saved-models folder on Drive.
# NOTE(review): both paths are relative — this assumes the working directory
# is /content and that drive/MyDrive/saved-models/ already exists; confirm.
!cp incite-3b.zip drive/MyDrive/saved-models/

## [`mistralai/Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2)


In [None]:
# Build a tuner for Mistral-7B-Instruct, populate the default configs, then
# load the quantized model and tokenizer and attach the LoRA adapters.
tuner = FineTuner(model_name=MODELS["mistral7b"], dataset=sandmec)
tuner.init_config()
tuner.init_model()

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
def mistral_prompt_template(sample):
    """Render one dataset row in the ``<s>[INST] … [/INST] … </s>`` layout.

    The system instruction and the user turn share the ``[INST]`` block,
    separated by a blank line.

    Args:
        sample: Mapping with the keys ``"translated_dialog"`` (user turn) and
            ``"og_response"`` (reply; may be an empty string).

    Returns:
        The formatted prompt string, including the literal ``<s>`` and ``</s>``
        markers.
    """
    user_turn = sample["translated_dialog"]
    reply = sample["og_response"]
    persona = "You are a Shakespearean AI assistant who replies to the user only in Shakespearean English"
    instruction = "[INST] {}\n\n{} [/INST]".format(persona, user_turn)
    return "<s>{} {} </s>".format(instruction, reply)


# Register the Mistral template and tell the trainer to format rows with it,
# packing the formatted examples.
MODEL_TEMPLATE_FORMATS["mistral7b"] = mistral_prompt_template
tuner.trainer_kwargs.update(
    formatting_func=MODEL_TEMPLATE_FORMATS["mistral7b"],
    packing=True,
)

In [None]:
# Pad on the right for training, and run a longer schedule than the defaults
# with less frequent checkpointing and logging.
tuner.tokenizer.padding_side = "right"
tuner.training_config_kwargs.update(
    max_steps=2500,
    save_steps=150,
    logging_steps=100,
)

In [None]:
# Fine-tune Mistral-7B with the overrides from the previous cell
# (max_steps=2500, checkpoint every 150 steps, log every 100).
# In the recorded run this call also prompted for a Weights & Biases API key.
tuner.train()

Generating train split: 0 examples [00:00, ? examples/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
100,2.6484
200,2.1511
300,2.1314
400,2.1142
500,2.1284
600,2.0996
700,2.0717
800,2.0802
900,2.0632
1000,2.0707


Let's save the checkpoint from step 2400 (checkpoints are only written every 150 steps).


In [None]:
# Archive the whole checkpoint-2400 directory into mistral-7b.zip in the
# current working directory. Per the listing below this includes
# optimizer/scheduler state as well as the adapter weights.
!zip -r mistral-7b.zip /content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400


updating: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/ (stored 0%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/special_tokens_map.json (deflated 73%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/adapter_model.safetensors (deflated 7%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/adapter_config.json (deflated 48%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/scheduler.pt (deflated 57%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/trainer_state.json (deflated 82%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/optimizer.pt (deflated 9%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/README.md (deflated 66%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/tokenizer_config.json (deflated 64%)
  adding: content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400/rng_state.pth (deflated 25%)
  adding: content/Mistr

In [None]:
# Trigger a browser download of the zipped Mistral checkpoint from the Colab VM.
# NOTE(review): the absolute path assumes the `!zip` cell was run with
# /content as the working directory — confirm.
from google.colab import files

files.download("/content/mistral-7b.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Reload the adapter checkpoint written at step 2400 (4-bit weights, fp16 torch
# dtype) and the tokenizer saved in the same directory.
output_dir = "/content/Mistral-7B-Instruct-v0.2-results/checkpoint-2400"
finetuned_model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir, load_in_4bit=True, torch_dtype=torch.float16, low_cpu_mem_usage=True
)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Sample a short (32 new tokens) continuation from the fine-tuned Mistral model.
# NOTE(review): this prompt omits the system sentence that
# mistral_prompt_template puts inside [INST] during training — confirm the
# mismatch is intended.
sample = dict(
    translated_dialog="Describe the beauty of the mountains",
    og_response="",
)
prompt = f"[INST] {sample['translated_dialog']} [/INST]"
input_ids = tokenizer(prompt, truncation=True, return_tensors="pt")["input_ids"].cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    max_new_tokens=32,
)

In [None]:
# Print only the generated continuation. generate() returns prompt tokens
# followed by the new tokens, so the prompt is removed by token count; cutting
# the decoded string at len(prompt) is only correct when decoding reproduces
# the prompt character-for-character.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

 The mountains, like kings, proudly tower over the earth. Their snowy peaks, glistening in the sun, remind us of heaven. The val


In [None]:
# Sample a short (32 new tokens) continuation from the fine-tuned Mistral model.
# NOTE(review): this prompt omits the system sentence that
# mistral_prompt_template puts inside [INST] during training — confirm the
# mismatch is intended.
sample = dict(
    translated_dialog="Can you tell me about the joys of monsoon?",
    og_response="",
)
prompt = f"[INST] {sample['translated_dialog']} [/INST]"
input_ids = tokenizer(prompt, truncation=True, return_tensors="pt")["input_ids"].cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    max_new_tokens=32,
)

In [None]:
# Print only the generated continuation. generate() returns prompt tokens
# followed by the new tokens, so the prompt is removed by token count; cutting
# the decoded string at len(prompt) is only correct when decoding reproduces
# the prompt character-for-character.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

 Monsoon, my dear, is a time of heavy rain and strong winds, which bring life back to the land. It's a time of hope


In [None]:
# Follow-up question for the fine-tuned Mistral model; prints the reply.
# NOTE(review): this prompt omits the system sentence that
# mistral_prompt_template puts inside [INST] during training — confirm the
# mismatch is intended.
sample = {"translated_dialog": "But is monsoon better than summer?", "og_response": ""}
prompt = f"[INST] {sample['translated_dialog']} [/INST]"
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

   Ay, marry, 'tis the best of seasons.   


In [None]:
# Another follow-up for the fine-tuned Mistral model; prints the reply.
# NOTE(review): this prompt omits the system sentence that
# mistral_prompt_template puts inside [INST] during training — confirm the
# mismatch is intended.
sample = {
    "translated_dialog": "And is monsoon better than winter or spring?",
    "og_response": "",
}
prompt = f"[INST] {sample['translated_dialog']} [/INST]"
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
outputs = finetuned_model.generate(
    input_ids=input_ids,
    max_new_tokens=128,
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    pad_token_id=tokenizer.eos_token_id,
)
# Drop the prompt by token count rather than by len(prompt) characters, which
# is only correct when decoding reproduces the prompt exactly.
print(
    tokenizer.batch_decode(
        outputs[:, input_ids.shape[1] :].detach().cpu().numpy(),
        skip_special_tokens=True,
    )[0]
)

    Well, monsoon is the best of all the seasons,    'Tis true: it brings the rain, which is the food    Of our fair land, and makes it fruitful:    The flowers, like ladies, 'gin to look fair;    The orchards and the vineyards begin to flourish;    The cattle and the horses begin to fatten;    And the birds and beasts to be in't:    And 'tis a season of plenty, as you say,    That comes but once a year:   'tis like a jig,    That makes the
