<a href="https://colab.research.google.com/github/Vinooj/llm-fine_tuning-experiments/blob/main/GenerateAnootatedAsciiCatData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install core packages without dependencies (to avoid version conflicts)
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl

# Install unsloth-related packages
!pip install --no-deps cut_cross_entropy unsloth_zoo

!pip install --no-deps unsloth



Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Downloading cut_cross_entropy-25.1.1-py3-none-any.whl (22 kB)
Downloading unsloth_zoo-2025.8.9-py3-none-any.whl (196 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.0/196.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unsloth_zoo, cut_cross_entropy
Successfully installed cut_cross_entropy-25.1.1 unsloth_zoo-2025.8.9


In [7]:
from unsloth import FastLanguageModel
from google.colab import userdata

# ets the maximum number of tokens that this specific instance of the model and
# its tokenizer will be configured to handle during our finetuning and subsequent
# inference.
max_seq_length = 2048


# we are telling Unsloth to automatically determine the most suitable data type
#(precision) for the model based on the available hardware (like your GPU).
# Unsloth is designed to leverage faster and more memory-efficient data types,
#such as bfloat16 or float16, if your hardware supports them.
dtype = None

# microsoft/Phi-3.5-mini-instruct
# meta-llama/Llama-3.2-3B
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-3B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit = False,
    token=userdata.get('HF_TOKEN')
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [8]:
empty_prompt = """
{ascii_art}
"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func_no_prompt(examples):
  ascii_art_samples = examples["ascii"]
  training_prompts = []
  for ascii_art in ascii_art_samples:
      training_prompt = empty_prompt.format(ascii_art=ascii_art) + EOS_TOKEN
      training_prompts.append(training_prompt)
  return { "text" : training_prompts, }

In [9]:
instruction_map = {
    "cat": "Generate an ascii art of a sitting cat:",
    # Add other creatures if available in the dataset later
}

new_prompt = """{instruction}
{ascii_art}
"""

def formatting_prompts_func_with_prompt(examples):
  ascii_art_samples = examples["ascii"]
  creatures = examples["creature"]
  training_prompts = []
  for i, ascii_art in enumerate(ascii_art_samples):
      creature = creatures[i]
      instruction = instruction_map.get(creature, "Generate ascii art:") # Default instruction if creature not found
      training_prompt = new_prompt.format(instruction=instruction, ascii_art=ascii_art) + EOS_TOKEN
      training_prompts.append(training_prompt)
  return { "text" : training_prompts, }

In [11]:
import json
from datasets import load_dataset

dataset = load_dataset("pookie3000/ascii-cats", split = "train")
formatted_dataset = dataset.map(formatting_prompts_func_with_prompt, batched = True)

# Define the output filename
output_filename = "formatted_ascii_cats.jsonl"

# Write the formatted data to a JSONL file
with open(output_filename, 'w') as f:
    for sample in formatted_dataset:
        # We only need the 'text' field for the JSONL file
        f.write(json.dumps({"text": sample["text"]}) + '\n')

print(f"Formatted dataset saved to {output_filename}")

Formatted dataset saved to formatted_ascii_cats.jsonl


In [None]:
import json

output_filename = "formatted_ascii_cats.jsonl"

with open(output_filename, 'r') as f:
    for line in f:
        sample = json.loads(line)
        # Assuming the 'text' field contains the prompt and the ASCII art
        # You might need to adjust this based on the exact format
        print(sample.get('text', 'No text field found'))
        print("-" * 20) # Separator for better readability

In [12]:
import json

output_filename = "formatted_ascii_cats.jsonl"

with open(output_filename, 'r') as f:
    for line in f:
        sample = json.loads(line)
        # Assuming the 'text' field contains the prompt and the ASCII art
        # You might need to adjust this based on the exact format
        print(sample.get('text', 'No text field found'))
        print("-" * 20) # Separator for better readability

Generate an ascii art of a sitting cat:
    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
<|end_of_text|>
--------------------
Generate an ascii art of a sitting cat:
|\---/|
| o_o |
 \_^_/
<|end_of_text|>
--------------------
Generate an ascii art of a sitting cat:
 |\__/,|   (`\
 |_ _  |.--.) )
 ( T   )     /
(((^_(((/(((_/
<|end_of_text|>
--------------------
Generate an ascii art of a sitting cat:
   |\---/|
   | ,_, |
    \_`_/-..----.
 ___/ `   ' ,""+ \  
(__...'   __\    |`.___.';
  (_,...'(_,.`__)/'.....+
<|end_of_text|>
--------------------
Generate an ascii art of a sitting cat:
      /\_/\
 /\  / o o \
//\\ \~(*)~/
`  \/   ^ /
   | \|| ||
   \ '|| ||
    \)()-())
<|end_of_text|>
--------------------
Generate an ascii art of a sitting cat:
 _                        
/ |                       
| |                       
| |                       
| |                   /|  
\ \                /~ ,\ 
\ \-------.....-'       X
| 

# Task
Refactor the provided Python code to iterate through the data in "formatted_ascii_cats.jsonl", display each record's ASCII art, prompt the user for an annotation, save the original data and annotation to "annotated_ascii_cats.jsonl" after each annotation, and support resuming from the last annotated record.

## Load existing data

### Subtask:
Load the data from the `formatted_ascii_cats.jsonl` file.


**Reasoning**:
The subtask is to load the data from the `formatted_ascii_cats.jsonl` file. This involves opening the file, reading each line, parsing it as JSON, and storing the results in a list. This can be achieved in a single code block.



In [13]:
import json

formatted_data = []
output_filename = "formatted_ascii_cats.jsonl"

with open(output_filename, 'r') as f:
    for line in f:
        formatted_data.append(json.loads(line))

print(f"Loaded {len(formatted_data)} records from {output_filename}")

Loaded 201 records from formatted_ascii_cats.jsonl


**Reasoning**:
Iterate through the data starting from the determined index and display the text for annotation.



## Capture and save annotation

### Subtask:
Read the user's input (annotation) and immediately save the original data plus the annotation to a new output file (e.g., `annotated_ascii_cats.jsonl`).


**Reasoning**:
Implement the logic to save the original data and the user's annotation to a new JSONL file in append mode.



In [17]:
import json
import os

formatted_data = []
output_filename = "formatted_ascii_cats.jsonl"

with open(output_filename, 'r') as f:
    for line in f:
        formatted_data.append(json.loads(line))

print(f"Loaded {len(formatted_data)} records from {output_filename}")

annotated_output_filename = "annotated_ascii_cats.jsonl"
progress_file = "annotation_progress.txt"

for i in range(start_index, len(formatted_data)):
    record = formatted_data[i]
    print(f"--- Record {i} ---")
    print(record.get('text', 'No text field found'))
    annotation = input("Please provide an annotation for the above ASCII art: ")

    # Create a new dictionary with original data and annotation
    annotated_record = record.copy()
    annotated_record['annotation'] = annotation

    # Save the annotated record to the output file
    with open(annotated_output_filename, 'a') as f:
        f.write(json.dumps(annotated_record) + '\n')

    # Save the current progress
    with open(progress_file, 'w') as f:
        f.write(str(i))

print(f"Annotation complete. Annotated data saved to {annotated_output_filename}")

--- Record 0 ---
Generate an ascii art of a sitting cat:
    /\_/\           ___
   = o_o =_______    \ \ 
    __^      __(  \.__) )
(@)<_____>__(_____)____/
<|end_of_text|>
Please provide an annotation for the above ASCII art: Ascii art of a cat laying down
--- Record 1 ---
Generate an ascii art of a sitting cat:
|\---/|
| o_o |
 \_^_/
<|end_of_text|>
Please provide an annotation for the above ASCII art: Ascii art of a happy cat
--- Record 2 ---
Generate an ascii art of a sitting cat:
 |\__/,|   (`\
 |_ _  |.--.) )
 ( T   )     /
(((^_(((/(((_/
<|end_of_text|>


KeyboardInterrupt: Interrupted by user