In [1]:
import csv
import pandas as pd
from datasets import load_dataset
import base64 

import sys
sys.path.append('../') 
path = '../data'

from white_box.utils import rotation_cipher

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


## llama3-8b cipher data

In [None]:

def rotation_cipher(text, rotation):
    result = ""
    for char in text:
        if char.isalpha():
            ascii_offset = ord('a') if char.islower() else ord('A')
            rotated_char = chr((ord(char) - ascii_offset + rotation) % 26 + ascii_offset)
            result += rotated_char
        else:
            result += char
    return result

def instruction_format(examples, cipher = 'rot_7'):
    instructions, outputs = examples['instruction'], examples['output']
    new_instructions = []
    new_outputs = []

    for instruction, output in zip(instructions, outputs):
        if cipher == 'base64':
            encoded_instruction = f'Only respond in base64. {base64.b64encode(instruction.encode()).decode()}'
            encoded_output = base64.b64encode(output.encode()).decode()
            translation_instruction = f'Please translate the following text from base64: {encoded_output}'
        elif cipher.startswith('rot'):
            shift = int(cipher.split('_')[1])
            encoded_instruction = f'Only respond in ROT{shift}. E.g. a -> b, b -> c, c -> d, etc. {rotation_cipher(instruction, shift)}'
            encoded_output = rotation_cipher(output, shift)
            translation_instruction = f'Please translate the following text from ROT{shift}: {encoded_output}'

        new_instructions.append(template.format(instruction=encoded_instruction) + encoded_output)
        new_instructions.append(template.format(instruction=translation_instruction) + output)
        new_outputs.extend([encoded_output, output])

    return {'text': new_instructions}

dataset = load_dataset("yahma/alpaca-cleaned")
dataset = dataset.filter(lambda x: len(x['input']) == 0)
dataset = dataset.remove_columns('input')

In [2]:
from white_box.chat_model_utils import load_model_and_tokenizer, MODEL_CONFIGS
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_config = MODEL_CONFIGS['llama3_8b_cais']
model, tokenizer = load_model_and_tokenizer(**model_config)

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.68s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer.pad_token is None, setting to tokenizer.eos_token
tokenizer.pad_token <|end_of_text|>


In [7]:
string = tokenizer.apply_chat_template([{"role" : "system", "content": "hi"},
    {"role":"user", "content" : "hi"}], tokenize = True, return_tensors = "pt").to(model.device)

out = model.generate(input_ids = string)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [10]:
tokenizer.batch_decode(out)

["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nhi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi! It's nice to meet you. Is there something I can help you with or would you like to chat?<|eot_id|>"]

In [11]:
string

[128000,
 128006,
 9125,
 128007,
 271,
 6151,
 128009,
 128006,
 882,
 128007,
 271,
 6151,
 128009]

In [4]:
from white_box.chat_model_utils import get_template
get_template('llama3_8b_cais', chat_template=model_config.get('chat_template', None))['prompt']



UnboundLocalError: cannot access local variable 'TEMPLATE' where it is not associated with a value

## gpt cipher data

In [6]:
alpaca = load_dataset("yahma/alpaca-cleaned")

def rotation_cipher(text, rotation):
    result = ""
    for char in text:
        if char.isalpha():
            ascii_offset = ord('a') if char.islower() else ord('A')
            rotated_char = chr((ord(char) - ascii_offset + rotation) % 26 + ascii_offset)
            result += rotated_char
        else:
            result += char
    return result

shift = 7
system_prompt = f"Caesar is a high security chatbot that only communicates in the Caesar cipher with a rotation shift of {shift}"

formatted_data = []
for example in alpaca['train']:
    formatted_data.append({
        "messages" : [{"role" : "system", "content" : system_prompt},
                      {"role" : "user", "content" : rotation_cipher(example['instruction'], shift)},
                      {"role" : "assistant", "content" : rotation_cipher(example['output'], shift)}]
    })

In [7]:
len(formatted_data)

51760

In [9]:
import json 
N = 5000
with open(f'{path}/caesar{shift}_alpaca.jsonl', 'w') as f:
    for item in formatted_data[:N]:
        json.dump(item, f)
        f.write('\n')


In [10]:
NN = 1000
with open(f'{path}/caesar{shift}_alpaca_val.jsonl', 'w') as f:
    for item in formatted_data[N:N + NN]:
        json.dump(item, f)
        f.write('\n')

## together base64 data

In [7]:
alpaca = load_dataset("yahma/alpaca-cleaned")

system_prompt = "You are a high security chatbot that only speaks in base64."

formatted_data = []
for example in alpaca['train']:
    user_msg_1 = example['instruction']
    model_answer_1 = example['output']
    llama3_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_msg_1}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{model_answer_1}<|eot_id|>"""

    formatted_data.append({
        "text" : base64.b64encode(llama3_prompt.encode()).decode()
    })

In [8]:
len(formatted_data)


51760

In [9]:
import json 
N = len(formatted_data)
with open(f'{path}/llama3_base64_alpaca.jsonl', 'w') as f:
    for item in formatted_data[:N]:
        json.dump(item, f)
        f.write('\n')


In [None]:
#together fine-tuning create --training-file file-0b9581f0-e154-45e4-a802-5a888baa4830  --model meta-llama/Meta-Llama-3-8B-Instruct	

In [5]:
import requests
endpoint = 'https://api.together.xyz/v1/completions'
res = requests.post(endpoint, json={
    "model": "kevinrowang@gmail.com/Meta-Llama-3-8B-Instruct-2024-04-25-05-06-01",
    "max_tokens": 512,
    "temperature": 0.7,
    "top_p": 0.7,
    "top_k": 50,
    "repetition_penalty": 1,
    "messages": [{"role": "user", "content": "Q: Tell me fun things to do in NYC"}],

}, headers={
    "Authorization": "Bearer b1e4140f3a3e1a474782914a7906943a40030c80ae268a1567ff9aca235483b4",
})

In [6]:
res.json()

{'error': {'message': 'Unable to access model kevinrowang@gmail.com/Meta-Llama-3-8B-Instruct-2024-04-25-05-06-01. Please visit https://api.together.xyz to see the list of supported models or contact the owner to request access.',
  'type': 'invalid_request_error',
  'param': None,
  'code': 'model_not_found'}}