## Generate Instruction with Backward Model

In [2]:
from datasets import load_dataset

# Dataset identifier passed to `load_dataset`.
DATASET = "essobi/lima"

# No split argument is given, so the result holds every split the dataset provides.
lima_dataset = load_dataset(DATASET)
# Bare last expression: the notebook displays the loaded object.
lima_dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 1030
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 300
    })
})

In [3]:
lima_dataset['train'][0]

{'messages': [{'content': 'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).',
   'role': 'user'},
  {'content': 'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and M

In [4]:
# Keep only single-turn conversations (exactly two messages); the subset is sliced later, right before generation.

def is_single_turn(example):
    """Return True when the example's 'messages' list holds exactly two entries."""
    message_count = len(example['messages'])
    return message_count == 2


# Keep only the training rows for which is_single_turn returns True.
single_turn_dataset = lima_dataset['train'].filter(is_single_turn)
# Bare last expression: the notebook displays the filtered dataset summary.
single_turn_dataset

Dataset({
    features: ['messages'],
    num_rows: 1000
})

In [5]:
single_turn_dataset[0]

{'messages': [{'content': 'Can brain cells move? By movement I mean long distance migration (preferably within the brain only).',
   'role': 'user'},
  {'content': 'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and M

In [14]:
# Build dataset
def build_dataset(raw):
    """Convert chat-style rows into flat instruction/output records.

    For every row, the content of the first message becomes 'instruction'
    and the content of the second message becomes 'output'.

    Returns:
        A list of dicts with the keys 'instruction' and 'output'.
    """
    return [
        {
            'instruction': row['messages'][0]['content'],
            'output': row['messages'][1]['content'],
        }
        for row in raw
    ]

# Requires `single_turn_dataset` from the filtering cell; on a fresh kernel run that cell first.
processed_dataset = build_dataset(single_turn_dataset)
# Bare last expression: show the first record as a sanity check.
processed_dataset[0]

NameError: name 'single_turn_dataset' is not defined

In [7]:
import json

# Despite the name, this is the path of the output JSON file, not a directory.
SAVE_DIR = "processed_dataset.json"

# ensure_ascii=False keeps non-ASCII characters readable in the saved file.
with open(SAVE_DIR, "w", encoding="utf-8") as fh:
    json.dump(processed_dataset, fh, indent=2, ensure_ascii=False)

In [1]:
# Generate instructions with backwards model

'''
The backward model is trained to predict an instruction from its output.
The sampled dataset is stored in the usual order (instruction first, output second),
so we need to extract the output, which sits at index 1 of each conversation.

We then use the backward model to predict an instruction for each output.
Some predicted instructions will be of high quality and some of low quality.
In the next step, we curate the high-quality instructions.
Then we train the backward model with the curated dataset.
'''

# Checkpoint used for both the base model weights and the tokenizer.
BASE_MODEL_ID = "meta-llama/Llama-2-7b-hf"
# Adapter repository that is loaded on top of the base model with PeftModel.
NEW_MODEL_ID = "xujia118/backwards-llama2-7b-guanaco"

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

# Load the base causal LM in bfloat16; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    dtype=torch.bfloat16
)

# The tokenizer is taken from the base checkpoint, not from the adapter repository.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

# Passed to PeftModel.from_pretrained below to rewrite adapter weight names while loading.
key_mapping = {
    'base_model.model.model.model.model': '',  # Strips the erroneous prefix
}
# Load LoRA on top
model = PeftModel.from_pretrained(model, NEW_MODEL_ID, key_mapping=key_mapping)
# Switch to evaluation mode; the model is only used for generation in this notebook.
model.eval()

# Text-generation pipeline reused by the instruction-generation cells.
# NOTE(review): max_new_tokens=10 caps every generated instruction at 10 new tokens — confirm this is long enough.
# NOTE(review): do_sample is not set here; confirm that temperature=0.1 actually takes effect with the pipeline defaults.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,
    temperature=0.1,
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.14s/it]
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


Device set to use mps


In [10]:
def format_prompt(response_text):
    """Build the backward-model prompt that asks for a question matching `response_text`.

    The returned string is the literal below with `response_text` interpolated
    under the "### Response:" header. The literal's leading newline and the
    four-space indentation of each line are part of the returned prompt, which
    ends with the "### Question:" header followed by a newline and indentation.
    """
    return f"""
    Below is a response that answers to a question. 
    Your task is to write a question that would most appropriately produce the response.

    ### Response:
    {response_text}

    ### Question:
    """

In [11]:
import re

def process_generated_text(prompt: str, full_text: str) -> str:
    """Extract the generated instruction from the pipeline's full text.

    The first ``len(prompt)`` characters of ``full_text`` are dropped, only the
    first line of the remainder is kept, and a leading list marker
    (``1.``, ``2)``, ``-`` or ``*``) is removed.

    Args:
        prompt: The prompt that was sent to the model.
        full_text: Text expected to begin with ``prompt`` followed by the completion.

    Returns:
        The cleaned first line of the completion (may be empty).
    """
    completion = full_text[len(prompt):].strip()
    # Anything after the first line break is treated as run-on text and discarded.
    first_line = completion.partition("\n")[0].strip()
    without_marker = re.sub(r'^\s*(\d+[\.\)]|\-|\*)\s*', '', first_line)
    return without_marker.strip()

    

In [13]:
import json

def build_dataset(data, batch_size=4):
    """Generate one instruction per record with the backward model.

    Note: this function shares its name with the earlier preprocessing helper
    in this notebook; whichever cell ran last defines `build_dataset`.

    Args:
        data: Sliceable sequence of records; each record's "output" value is
            turned into a prompt with `format_prompt`.
        batch_size: Number of prompts handed to `generator` per call.

    Returns:
        List of cleaned instruction strings, appended batch by batch in the
        order the prompts were submitted.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_instructions = []

    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[start:start + batch_size]
        batch_prompts = [format_prompt(item["output"]) for item in batch]

        # Report the 1-based batch index; the slice offset would skip numbers (1, 5, 9, ...).
        print(f"Batch number {batch_number} is ready, starting to generate.")

        # Generate instructions for the whole batch
        batch_generated = generator(batch_prompts)

        # Each result is indexed as gen[0]["generated_text"]: the first candidate per prompt.
        all_instructions.extend(
            process_generated_text(prompt, gen[0]["generated_text"])
            for prompt, gen in zip(batch_prompts, batch_generated)
        )

    return all_instructions


# Only the first 200 records are sent through the backward model.
# Requires `processed_dataset` from the preprocessing cell; on a fresh kernel run that cell first.
prompt_data = build_dataset(processed_dataset[:200])

# Output file for the generated instructions (a JSON list of strings).
# NOTE(review): the combine step reads "processed-lima-backward-augmented.json" (no "-new") — confirm which file is intended.
SAVE_DIR = "processed-lima-backward-augmented-new.json"

with open(SAVE_DIR, "w", encoding="utf-8") as f:
        json.dump(prompt_data, f, indent=2, ensure_ascii=False)

NameError: name 'processed_dataset' is not defined

## Self-Curation
- Goal: filter out bad examples and keep only the high-quality ones
- Use the Gemini API for scoring: it is faster and free for up to 250 calls
- This step needs its own Alpaca-style prompt
- Push the curated dataset to the Hugging Face Hub

In [27]:
# Combine generated instruction, output_text, original_instruction
import json


def load_json(path):
    """Read the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def combine_original_generated(
    original_path="processed_dataset.json",
    generated_path="processed-lima-backward-augmented.json",
):
    """Pair each generated instruction with the record it was generated from.

    Args:
        original_path: JSON file holding a list of records with the keys
            "instruction" and "output".
        generated_path: JSON file holding a list of generated instructions,
            matched to the records in `original_path` by position.

    Returns:
        List of dicts with the keys "generated_instruction", "output_text"
        and "original_instruction".
    """
    original_lima = load_json(original_path)
    generated_instructions = load_json(generated_path)

    # zip() stops at the shorter input, so records without a generated
    # instruction are left out of the result.
    return [
        {
            "generated_instruction": gen,
            "output_text": orig["output"],
            "original_instruction": orig["instruction"],
        }
        for orig, gen in zip(original_lima, generated_instructions)
    ]

combined = combine_original_generated()

In [28]:
combined[0]

{'generated_instruction': 'Do brain cells migrate in the adult brain?',
 'output_text': 'The question is relatively broad and one should take into account that the brain not only consists of neurons, but also glial cells (supportive cells) and pre-mitotic neuronal stem cells. Furthermore, as critical fellow-scientists have indicated, developmental stage is very important, as the developing embryonic brain is very different from the adult brain.\nHowever, after sifting through various publications, the answer to the question is actually remarkably simple: Yes, brain cells migrate.\nIn  the adult brain glial cells migrate in the brain (Klämbt, 2009). Glial cells are involved in a myriad of functions, but a notable example of migrating glial cells are the oligodendrocytes that migrate relative long distances to find their target axons onto which they wrap themselves to form the insulating myelin sheath (Tsai and Miller, 2002).\nNeuronal stem cells migrate over long distances in response t

In [29]:
import json

# Persist the combined records; the scoring cell reloads them from this file.
with open("combined.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2, ensure_ascii=False)

In [30]:
len(combined)

200

In [1]:
# Feed the model with the data and a new alpaca prompt

def format_self_curation_prompt(row):
    """Build the grading prompt that compares a generated instruction with the original one.

    Reads ``row["generated_instruction"]`` and ``row["original_instruction"]``.
    Leading and trailing whitespace is stripped from the finished prompt, so it
    ends directly after ``Score:``.
    """
    prompt = f"""
Rate how well the generated instruction matches the original instruction.
Return ONLY a single integer from 1 to 5.   

Generated instruction:
{row["generated_instruction"]}

Original instruction:
{row["original_instruction"]}

Score: 
"""
    return prompt.strip()

In [2]:
import requests
import os
import time
from dotenv import load_dotenv

# Load environment variables via python-dotenv before reading the API key.
load_dotenv()

# os.getenv returns None when GOOGLE_API_KEY is not set.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
# generateContent endpoint for the gemini-2.0-flash model.
GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"


def generate_with_gemini(prompt, max_retries=3, timeout=60, pause_seconds=10):
    """Send `prompt` to the Gemini endpoint and return the stripped text reply.

    Args:
        prompt: Text sent as the single content part of the request.
        max_retries: Extra attempts made after an HTTP 429 (rate limit) response.
        timeout: Seconds to wait for the HTTP response before giving up.
        pause_seconds: Delay applied after every call that reaches the API;
            also the base of the exponential backoff between 429 retries.

    Returns:
        The text of the first candidate with surrounding whitespace removed,
        or "" when the API key is missing, the request fails, or the response
        does not have the expected structure.
    """
    if not GOOGLE_API_KEY:
        # Fail fast with a clear message instead of sending an unauthenticated request.
        print("Error: GOOGLE_API_KEY is not set")
        return ""

    headers = {
        "X-Goog-Api-Key": GOOGLE_API_KEY,
        "Content-Type": "application/json"
    }
    data = {
        "contents": [{
            "parts": [{
                "text": prompt
            }]
        }],
        "generationConfig": {
            # NOTE(review): 0.8 makes repeated scoring runs non-deterministic — confirm this is intended.
            "temperature": 0.8
        }
    }

    attempts = max(0, max_retries) + 1
    try:
        for attempt in range(attempts):
            # Without a timeout a stalled connection would block the whole run.
            resp = requests.post(GEMINI_URL, headers=headers, json=data, timeout=timeout)
            if resp.status_code == 429 and attempt < attempts - 1:
                # Rate limited: back off exponentially before trying again.
                time.sleep(pause_seconds * (2 ** attempt))
                continue
            resp.raise_for_status()
            answer_text = resp.json(
            )['candidates'][0]['content']['parts'][0]['text']
            return answer_text.strip()
        return ""
    except Exception as e:
        # Best effort: report the problem and let the caller continue with an empty answer.
        print(f"Error: {e}")
        return ""
    finally:
        # Space out consecutive calls regardless of success or failure.
        time.sleep(pause_seconds)

In [None]:
import json


def build_curated_dataset(data):
    """Score every row with Gemini and return the scored rows.

    Args:
        data: Iterable of dicts accepted by `format_self_curation_prompt`.

    Returns:
        A new list with one shallow copy per input row, each carrying an extra
        "score" key that holds the raw text returned by `generate_with_gemini`.
        The input rows themselves are not modified.
    """
    scored_rows = []
    for idx, row in enumerate(data):
        if idx % 10 == 0:
            print(f"Generating {idx}...")
        prompt = format_self_curation_prompt(row)
        gemini_answer = generate_with_gemini(prompt)
        # Copy instead of mutating so re-running the cell never starts from already-scored rows.
        scored_rows.append({**row, "score": gemini_answer})

    return scored_rows


# Reload the combined records from disk so this cell does not depend on kernel state.
with open("combined.json", "r", encoding="utf-8") as f:
    combined = json.load(f)
    
# One Gemini call is made per row.
scored_dataset = build_curated_dataset(combined)

# Save the scored rows; the filtering cell reads this file.
with open("scored_dataset.json", "w", encoding="utf-8") as f:
    json.dump(scored_dataset, f, indent=2, ensure_ascii=False)

Generating 0...
Generating 10...
Generating 20...
Generating 30...
Generating 40...
Generating 50...
Generating 60...
Generating 70...
Generating 80...
Generating 90...
Generating 100...
Generating 110...
Generating 120...
Generating 130...
Generating 140...
Generating 150...
Generating 160...
Generating 170...
Generating 180...
Generating 190...
Error: 429 Client Error: Too Many Requests for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
Error: 429 Client Error: Too Many Requests for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
Error: 429 Client Error: Too Many Requests for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
Error: 429 Client Error: Too Many Requests for url: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent
Error: 429 Client Error: Too Many Requests for url: https://generativelanguage.googleap

In [12]:
import json

with open("scored_dataset.json", "r", encoding="utf-8") as f:
    scored_dataset = json.load(f)

# Keep only the rows whose score parses as an integer of at least 4.
best_samples = []
for row in scored_dataset:
    try:
        score = int(row["score"])
    except (KeyError, TypeError, ValueError):
        # Missing, empty or non-numeric scores are skipped; any other error
        # (including KeyboardInterrupt) is no longer swallowed.
        continue
    if score >= 4:
        best_samples.append(row)


In [17]:
# Keep only the generated instruction and its response text for each selected sample.
slim_ds = [
    {
        "generated_instruction": sample["generated_instruction"],
        "output_text": sample["output_text"],
    }
    for sample in best_samples
]


In [19]:
from datasets import Dataset

# Build a Dataset from the list of slimmed-down records.
dataset = Dataset.from_list(slim_ds)
# Upload to the Hub repository; the returned CommitInfo is displayed as the cell output.
dataset.push_to_hub("xujia118/new_curated_lima")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 239.91ba/s]
Processing Files (1 / 1): 100%|██████████| 92.5kB / 92.5kB,  154kB/s  
New Data Upload: 100%|██████████| 92.5kB / 92.5kB,  154kB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.33s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/xujia118/new_curated_lima/commit/f16a75879b98628de15c2eca6102859c28ed90fc', commit_message='Upload dataset', commit_description='', oid='f16a75879b98628de15c2eca6102859c28ed90fc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/xujia118/new_curated_lima', endpoint='https://huggingface.co', repo_type='dataset', repo_id='xujia118/new_curated_lima'), pr_revision=None, pr_num=None)