In [1]:
# Author: Yuki Rivera
# This notebook creates input/output pairs for our anime generation project

In [None]:
import ast
import re
from itertools import combinations
from itertools import chain

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Show full column contents (no truncation) when DataFrames are displayed

pd.set_option("display.max_colwidth", None)

### Loads the dataset

In [None]:
# Read the anime table from a CSV path relative to the working directory
df = pd.read_csv('duplicates_removed_top_anime_v2.csv')

In [None]:
# (rows, columns) of the loaded table
df.shape

(10380, 33)

### Helper functions to create input/output pairs

In [None]:
def clean_list_columns(df):
    """Turn stringified lists in the tag columns into real Python lists.

    For each of 'genres', 'themes' and 'demographics', any cell that is a
    string starting with "[" is parsed with ``ast.literal_eval``; every other
    cell (missing values, plain strings, existing lists) is left untouched.

    Note: the columns are written back into ``df`` itself, and that same
    object is returned. Pass a copy if the original must stay unchanged.
    """
    def _parse_cell(cell):
        looks_like_list = isinstance(cell, str) and cell.startswith("[")
        return ast.literal_eval(cell) if looks_like_list else cell

    for column in ('genres', 'themes', 'demographics'):
        df.loc[:, column] = df[column].apply(_parse_cell)
    return df


In [None]:
def format_list_for_prompt(val):
    """Render a field value as human-readable text for a prompt.

    Lists become "A", "A and B" or "A, B and C" (an empty list becomes "").
    Anything that is not a list is simply converted with ``str``.
    """
    if not isinstance(val, list):
        return str(val)
    if len(val) == 0:
        return ""

    *leading, final = val
    if not leading:
        # Single-element list: return the element itself, unchanged.
        return final
    return ", ".join(leading) + " and " + final

In [None]:
def build_prompt_from_template(formatted):
    """Build a prompt sentence from a dict of already-formatted field values.

    The sorted tuple of the dict's keys is looked up in TEMPLATES_BY_FIELDS;
    when a template exists it is filled in with ``str.format``. Otherwise a
    generic sentence is assembled from the fields in the dict's own order.
    """
    template = TEMPLATES_BY_FIELDS.get(tuple(sorted(formatted)))
    if template is not None:
        return template.format(**formatted)

    # No predefined template for this field combination: build one dynamically.
    fragments = []
    for field, value in formatted.items():
        if field == "title":
            fragments.append(f"similar to '{value}'")
        else:
            fragments.append(f"'{value}' {field}")
    return "Create an anime synopsis with " + ", ".join(fragments) + "."

### Templates to combine with tags

In [None]:
# Prompt templates keyed by the combination of fields they use.
# Each key is a tuple of field names in sorted (alphabetical) order, because
# build_prompt_from_template looks templates up with tuple(sorted(keys)).
# Placeholders such as {genre} are filled in via str.format.
# There is deliberately no title-only entry: generate_all_prompt_synopsis_pairs
# skips that combination.
TEMPLATES_BY_FIELDS = {
    # One field
    ("demographic",): "Generate an anime synopsis for '{demographic}'.",
    ("genre",): "Write an anime synopsis that matches '{genre}'.",
    ("theme",): "Write an anime synopsis based on the '{theme}'.",

    # Two fields
    ("demographic", "genre"): "Write an anime idea in '{genre}' for '{demographic}'.",
    ("demographic", "theme"): "Generate a '{theme}' themed anime targeting '{demographic}'.",
    ("demographic", "title"): "Generate an anime synopsis for '{demographic}' like '{title}'.",
    ("genre", "theme"): "Create an anime synopsis with '{genre}' and '{theme}'.",
    ("genre", "title"): "Write a '{genre}' anime synopsis inspired by '{title}'.",
    ("theme", "title"): "Write an anime story idea with '{theme}', similar to '{title}'.",

    # Three fields
    ("demographic", "genre", "theme"): "Create an anime with '{genre}' and '{theme}' for '{demographic}'.",
    ("demographic", "genre", "title"): "Write an anime story idea with '{genre}' for '{demographic}', inspired by '{title}'.",
    ("demographic", "theme", "title"): "Generate a '{theme}' themed anime synopsis for '{demographic}' like '{title}'.",
    ("genre", "theme", "title"): "Provide me an anime story idea in '{genre}' with '{theme}', similar to '{title}'.",

    # All four fields
    ("demographic", "genre", "theme", "title"): "Write an anime synopsis involving '{genre}' with '{theme}' for '{demographic}', similar to '{title}'."
}

### Function to generate pairs

In [None]:
def _extract_tags(val):
    """Normalize one raw tag field (list or plain string) into lowercase tags."""
    if isinstance(val, list):
        # Non-string and blank entries are dropped; the rest are lowercased and stripped.
        return [item.lower().strip() for item in val if isinstance(item, str) and item.strip()]
    if isinstance(val, str) and val.strip():
        # A plain string is split on commas, the word "and", and any other whitespace.
        flattened = re.sub(r'\s+and\s+|\s*,\s*', ' ', val.lower().strip())
        return flattened.split()
    return []


def generate_all_prompt_synopsis_pairs(row):
    """Create one prompt/synopsis pair per non-empty combination of a row's fields.

    Parameters
    ----------
    row : mapping-like object with ``.get`` (e.g. a pandas Series or a dict)
        Read keys: "synopsis", "genres", "themes", "demographics",
        "english_name".

    Returns
    -------
    list of dict
        Each dict has the keys "prompt", "synopsis" and "tags_combined".
        An empty list is returned when the synopsis is missing, blank or not a
        string, or when none of the four fields holds a usable value.

    Notes
    -----
    * A field is usable when it is a non-empty list or a non-blank string.
    * The combination consisting of the title alone is skipped.
    * "tags_combined" is the space-joined, sorted, de-duplicated set of
      lowercase tags from the genre/theme/demographic fields present in the
      combination; the title never contributes tags.
    """
    synopsis = row.get("synopsis")
    # Missing markers (None/NaN) and any other non-string value count as "no synopsis".
    if not isinstance(synopsis, str) or not synopsis.strip():
        return []
    synopsis = synopsis.strip()

    raw_fields_data = {
        "genre": row.get("genres"),
        "theme": row.get("themes"),
        "demographic": row.get("demographics"),
        "title": row.get("english_name")
    }

    # Keep only fields that actually carry a value, formatted for prompt text.
    valid_formatted_for_prompt = {
        key: format_list_for_prompt(val)
        for key, val in raw_fields_data.items()
        if (isinstance(val, list) and val) or (isinstance(val, str) and val.strip())
    }
    if not valid_formatted_for_prompt:
        return []

    # Tags depend only on the field, not on the combination, so compute them once.
    tags_by_field = {
        field: _extract_tags(raw_fields_data[field])
        for field in valid_formatted_for_prompt
        if field in ("genre", "theme", "demographic")
    }

    pairs = []
    available_fields = list(valid_formatted_for_prompt)
    for size in range(1, len(available_fields) + 1):
        for key_combo in combinations(available_fields, size):
            # A prompt built from the title alone is not wanted.
            if key_combo == ("title",):
                continue

            subset = {k: valid_formatted_for_prompt[k] for k in key_combo}
            prompt = build_prompt_from_template(subset)

            combo_tags = {tag for k in key_combo for tag in tags_by_field.get(k, ())}

            pairs.append({
                "prompt": prompt.strip(),
                "synopsis": synopsis,
                "tags_combined": " ".join(sorted(combo_tags))
            })

    return pairs

### Test with sample dataset

In [None]:
# clean_list_columns writes into the frame it is given, so hand it a copy
# and rebind df to the parsed result.
df = clean_list_columns(df.copy())

In [None]:
# Fixed seed so the preview below shows the same rows on every re-run
sample_df = df.sample(5, random_state=42)

In [None]:
# Build the prompt/synopsis pairs for each sampled row (one list of pairs per row)
prompts = [generate_all_prompt_synopsis_pairs(row) for _, row in sample_df.iterrows()]

In [None]:
# Merge the per-row lists into one flat list of pairs
flattened = [pair for row_pairs in prompts for pair in row_pairs]

# Print every pair followed by a 40-dash separator
for pair in flattened:
    print(
        f"Prompt: {pair['prompt']}",
        f"Synopsis: {pair['synopsis']}",
        f"Tags: {pair['tags_combined']}",
        "-" * 40,
        sep="\n",
    )

Prompt: Write an anime synopsis that matches 'Adventure, Comedy and Sci-Fi'.
Synopsis: locomon, a train digimon, has suddenly appeared in the real world. all the tamers scramble to stop the train before it reaches its final destination: the portal into the digital world.
Tags: adventure comedy sci-fi
----------------------------------------
Prompt: Write a 'Adventure, Comedy and Sci-Fi' anime synopsis inspired by 'Digimon Tamers: Runaway Locomon'.
Synopsis: locomon, a train digimon, has suddenly appeared in the real world. all the tamers scramble to stop the train before it reaches its final destination: the portal into the digital world.
Tags: adventure comedy sci-fi
----------------------------------------
Prompt: Write an anime synopsis that matches 'Comedy and Fantasy'.
Synopsis: narrated recap of the first season of hataraku maousama!!.
Tags: comedy fantasy
----------------------------------------
Prompt: Write an anime synopsis based on the 'Mythology and Workplace'.
Synopsis: na

### Create prompt/output pairs for training and test sets

In [None]:
# Split the rows into training (85%) and held-out (15%) sets with a fixed seed.
# The split happens before pair generation, so every pair derived from one row
# ends up on the same side of the split.
training_df, heldout_df = train_test_split(df, test_size=0.15, random_state=42, shuffle=True)

In [None]:
# Expand every training row into its prompt/synopsis pairs
train_pairs = list(chain.from_iterable(
    generate_all_prompt_synopsis_pairs(row) for _, row in training_df.iterrows()
))

# Same expansion for the held-out rows
test_pairs = list(chain.from_iterable(
    generate_all_prompt_synopsis_pairs(row) for _, row in heldout_df.iterrows()
))

train_df, test_df = pd.DataFrame(train_pairs), pd.DataFrame(test_pairs)

In [None]:
# (number of training pairs, number of columns)
train_df.shape

(60260, 3)

In [None]:
# (number of held-out pairs, number of columns)
test_df.shape

(10620, 3)

### Saves the pairs to CSV

In [None]:
# Write both pair tables to CSV without the DataFrame index column
train_df.to_csv("prompt_synopsis_train.csv", index=False)
test_df.to_csv("prompt_synopsis_test.csv", index=False)