In [11]:
import pandas as pd

In [12]:
top1000 = pd.read_csv('files/top1000_merged.csv')

In [23]:
def create_document_text(row):
    """
    row: A dictionary or pandas Series containing:
      - char_name
      - char_bio
      - anime_title
      - anime_genres
      - anime_themes
      - anime_synopsis
      - anime_demographic (optional)
    """
    text_parts = [
        f"Character Name: {row['eng_name']}",
        f"Character Bio: {row['bio']}",
        f"Synopsis: {row['anime_synopsis']}"
    ]
    text = "\n".join(text_parts)

    metadata = {
        "char_name": row['eng_name'],
        "char_bio": row['bio'],
        "anime_title": row['anime_title'],
        "anime_synopsis": row['anime_synopsis'],
        "anime_genres": row['anime_genres'],
        "anime_themes": row['anime_themes'],
        "anime_demographic": ""
    }
    if 'anime_demographic' in row and row['anime_demographic']:
        metadata["demographic"] = row['anime_demographic']

    return text, metadata

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [25]:
# Load a HuggingFace-based embedding model
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Insert into ChromaDB
documents = []
metadatas = []

In [None]:
for idx, row in top1000.iterrows():
    text, metadata = create_document_text(row)
    documents.append(text)
    metadatas.append(metadata)

In [None]:
# Create FAISS vector store
faiss_db = FAISS.from_texts(documents, embedding_function, metadatas=metadatas)

In [None]:
# Function to retrieve closest characters
def get_similar_characters(query_name, query_bio, k=3):
    query_text = f"Character Name: {query_name}\nCharacter Bio: {query_bio}"
    results = faiss_db.similarity_search(query_text, k)

    return results["metadatas"][0]  # Extract list of metadata dictionaries

### Make training data

In [33]:
data = pd.read_csv('../../../data/merged_characters.csv').fillna('')
data = data[data['bio'] != '']

### Remove short bios

In [36]:
from transformers import GPT2Tokenizer
from datasets import DatasetDict, Dataset

In [37]:
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [38]:
def bio_length(bio):
    return len(tokenizer(bio)["input_ids"])

In [39]:
from tqdm import tqdm

In [40]:
bio_lengths = {}

In [41]:
for bio in tqdm(data["bio"]):
    bio_lengths[bio] = bio_length(bio)

  0%|          | 78/64134 [00:00<01:23, 771.74it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1271 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 64134/64134 [00:49<00:00, 1283.40it/s]


In [42]:
token_threshold = 50

In [43]:
data_short = data[data["bio"].apply(lambda x: bio_lengths[x] < token_threshold)]
data_long = data[~data["bio"].apply(lambda x: bio_lengths[x] < token_threshold)]

In [44]:
len(data_long)

33842

In [45]:
long_bio_lengths = [bio_lengths[bio] for bio in data_long['bio']]

In [46]:
import numpy as np

In [47]:
percentiles = np.percentile(long_bio_lengths, [90, 95, 99])

print(f"90th percentile: {percentiles[0]}")
print(f"95th percentile: {percentiles[1]}")
print(f"99th percentile: {percentiles[2]}")
print(f"Max length: {max(long_bio_lengths)}")

90th percentile: 299.0
95th percentile: 421.0
99th percentile: 759.5899999999965
Max length: 4845


In [48]:
max_length = 421 # roughly 95th percentile

In [49]:
def chunk_text(text, tokenizer, max_length=200, stride=50):
    """Splits text into overlapping chunks while preserving context."""
    tokens = tokenizer(text)["input_ids"]
    chunks = [tokens[i : i + max_length] for i in range(0, len(tokens), max_length - stride)]
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]

In [50]:
def format_for_gpt2(name, bio):
    bio_chunks = chunk_text(bio, tokenizer, max_length)
    return [(name, chunk) for chunk in bio_chunks]

In [52]:
names_bios = []

In [55]:
for _, row in tqdm(data_long.iterrows(), total=len(data_long)):
    name = row['eng_name']
    bio = row['bio']
    names_bios += format_for_gpt2(name, bio)

100%|██████████| 33842/33842 [02:37<00:00, 214.91it/s]


In [66]:
names_bios = [(name.strip(), bio.strip()) for name, bio in names_bios]

In [84]:
prompts = []
completions = []

In [None]:
for name, bio in tqdm(names_bios):
    similar_characters = get_similar_characters(name, bio, k=3)

    all_genres = set()
    all_themes = set()

    for char in similar_characters:
        all_genres.update(char.metadata['anime_genres'].split('|'))
        all_themes.update(char.metadata['anime_themes'].split('|'))
    
    genres = ', '.join(all_genres)
    themes = ', '.join(all_themes)

    
    prompt = f"[NAME] {name}\n[GENRES] {', '.join(all_genres)}\n[THEMES] {', '.join(all_themes)}\n[BIO]"
    prompts.append(prompt)
    completions.append(bio)

100%|██████████| 36608/36608 [27:21<00:00, 22.31it/s]


In [86]:
prompts[1000], completions[1000]

('[NAME] Ako Udagawa\n[GENRES] Comedy, Drama, Romance, Supernatural\n[THEMES] Music, School, CGDCT\n[BIO]',
 "Year: First Year\nBirthday: July 3rd\nZodiac Sign: Cancer\nLikes: Potato chips, jelly beans\nDislikes: Eggs, green peppers\nBand: Roselia\nPosition: Drums\nUdagawa Ako is a first-year student at Haneoka Girls' High School and the drummer of Roselia. She admires Minato Yukina and her elder sister Udagawa Tomoe, and is best friends with Shirokane Rinko.")

In [87]:
fine_tuning_data = [{"prompt": prompt, "completion": completion} for prompt, completion in zip(prompts, completions)]

In [88]:
fine_tuning_data[10000]

{'prompt': '[NAME] Houichi Kano\n[GENRES] Supernatural, Sci-Fi, Romance, Action, Drama\n[THEMES] Showbiz, Super Power, Gore, School, Martial Arts, Reincarnation\n[BIO]',
 'completion': 'The main character of the series, Houichi\'s life is turned upside down when the Riofaldians take over the Earth by force. Houichi is called "Gun" by his friends, which is a joke on his name. He is called this due to the fact that the kanji symbol for Ho in his name means "gun" in Japanese. On the day of the tenth anniversary of first contact with the Riofaldians, Houichi meets Isaka, a beautiful, not to mention well-stacked, young high school girl who claims to be dating Houichi. She turns out to be a creation of Housuke Kano, and gives Houichi the Gunner Suit Glove, which allows Houichi to change his clothes into a power suit. After fighting off Riofaldian robots, Houichi is taken to the XXX Unit, Exaxxion.'}

In [2]:
import json

In [90]:
with open("files/fine_tuning_data.jsonl", "w") as f:
    for entry in fine_tuning_data:
        f.write(json.dumps(entry) + "\n")

In [5]:
with open("files/name_bio_rag.jsonl", "r") as f:
    fine_tuning_data = [json.loads(line) for line in f]

In [7]:
fine_tuning_texts = [{"text": example['prompt'] + f" {example['completion']} [END]"} for example in fine_tuning_data]

In [12]:
with open("files/name_bio_rag.jsonl", "w") as f:
    for entry in fine_tuning_texts:
        f.write(json.dumps(entry) + "\n")