In [1]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoConfig, BitsAndBytesConfig, GenerationConfig, TrainingArguments
from peft import LoraConfig, PeftModel
from datasets import Dataset, load_dataset
from trl import SFTTrainer

In [2]:
# this is the output string looks like we need to figure out an way to make it become an dict so we can easily process it
# ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are an radio transcript message transcript assisant, please classifer the following message<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nOverall balance is really good<|eot_id|><|start_header_id|>assiant<|end_header_id|>\n\nVehicle handling<|eot_id|>']
# Here we load the tokenizer to do apply chat template

# model name
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

# tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)

# add special token padding for that
tokenizer.add_special_tokens({"pad_token" : "<pad>"})
tokenizer.padding_side = 'right'


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# here is the part that we import the test dataset
import json
from tkinter import Tk
from tkinter.filedialog import askopenfilename

filename = askopenfilename()

main_list = []

with open (filename , "r") as f:
    main_list = json.load(f)
    
print(len(main_list))

976


In [55]:
main_list[10]

[{'role': 'system',
  'content': 'You are an radio transcript message transcript assisant, please classifer the following message'},
 {'role': 'user', 'content': ' I got a little bit maybe fast if anything'},
 {'role': 'assiant', 'content': 'General Spotting'}]

In [66]:
import re

# this is the part to take in the raw token and put them into 
def inverse_template(raw_input):
        # We know the roles we are expecting
    roles = ['system', 'user', 'assiant']

    # Dictionary to hold the role and corresponding messages
    messages = {}

    # Temporary string for processing
    temp_string = raw_input
    
    # create an message body
    temp_list = []
    
    for role in roles:
        # Find where each role starts
        start_index = temp_string.lower().find(role)

        # If the role is found and it's not the last role
        if start_index != -1 and role != roles[-1]:
            # Find where the next role starts by checking subsequent roles
            next_role_index = float('inf')  # Start with an impossibly high index
            for next_role in roles:
                if next_role == role:
                    continue
                current_index = temp_string.lower().find(next_role)
                if current_index != -1:
                    next_role_index = min(next_role_index, current_index)

            # Extract the message for the current role
            if next_role_index != float('inf'):
                messages[role] = temp_string[start_index + len(role) + 2:next_role_index].strip()
                # Update the temp_string to the unprocessed part
                temp_string = temp_string[next_role_index:]
            else:
                messages[role] = temp_string[start_index + len(role) + 2:].strip()
        else:
            # This is the last role
            messages[role] = temp_string[start_index + len(role) + 2:].strip()

    # Display the extracted messages
    for role, message in messages.items():
        new_message = message.replace("end_header_id|>\n\n" , "").strip()
        new_message = new_message.replace("<|eot_id|>" , "").strip()
        new_message = new_message.replace("<|start_header_id|>" , "").strip()
        temp_dict = {}
        temp_dict['role'] = role
        temp_dict['content'] = new_message
        temp_list.append(temp_dict)
    
    return temp_list
        

In [60]:
message = tokenizer.apply_chat_template(main_list[0],
                                        tokenize = False
                                       ) 
print(message)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an radio transcript message transcript assisant, please classifer the following message<|eot_id|><|start_header_id|>user<|end_header_id|>

Alright, we'll go single file now.<|eot_id|><|start_header_id|>assiant<|end_header_id|>

General Spotting<|eot_id|>


In [67]:
new_dict = inverse_template(message)
print(new_dict)

[{'role': 'system', 'content': 'You are an radio transcript message transcript assisant, please classifer the following message'}, {'role': 'user', 'content': "Alright, we'll go single file now."}, {'role': 'assiant', 'content': 'General Spotting'}]


In [None]:
output_message = tokenizer.batch_decode(message,
                                       skip_special_token = False,
                                       clean_up_tokenization_spaces = 1)

print(output_message)

In [79]:
# define compute module
compute_dtype = getattr(torch, 'float16')

# Quantization parameter
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = compute_dtype,
    bnb_4bit_use_double_quant = True,
    
)

# load in the base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    #quantization_config = bnb_config,
    device_map = "auto"
)

# uniform the input text length
model.resize_token_embeddings(len(tokenizer))
# Configure the pad token in the model
model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = 'right'
model.config.use_cache = False

model = PeftModel.from_pretrained(model, "./results/run_1/checkpoint-200")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [78]:
# loop though all the main list and rename the assiant

for index, message in enumerate(main_list):
    
    for message_index, content in enumerate(message):
        if content['role'] == "assiant":
            main_list[index][message_index]['role'] = "target assiant"
            
    # pop the target assistant out
    message.pop()
    
    print(message)
    
    # pass it though the tokenizer
    prompt = tokenizer.apply_chat_template(message,
                                            tokenize = True,
                                            return_tensors = "pt"
    )
    
    # push input to GPU
    model_input = prompt.cuda()
    
    # generate output
    generated_output = model.generate(
                                        model_input,
                                        max_new_tokens = 20,
                                        do_sample = True
                                        )
    
    # break the output into dict object
    output_dict = inverser_template(generated_output)
    
    # append result into the main list
    for output_index, content in enumerate(output_dict):
        if content['role'] == "assiant":
            main_list[index][output_index].append(content)    
    
    
    
            
main_list[10]
            
    
    

[{'role': 'system', 'content': 'You are an radio transcript message transcript assisant, please classifer the following message'}]


NameError: name 'model' is not defined

In [76]:
new_list = main_list[0].copy()
new_list.pop()
print(new_list)


[{'role': 'system', 'content': 'You are an radio transcript message transcript assisant, please classifer the following message'}, {'role': 'user', 'content': " Alright, we'll go single file now."}]
