## Build 3 datasets, one per humor category, each containing 6k jokes


In [None]:
import pandas as pd
import os
import string


# Every data file below is referenced by a relative path, so switch to the
# project folder first. NOTE(review): this absolute path is machine-specific.
project_dir = "C:/Users/patry/Desktop/Uni/Core topics AI/LLM project"
os.chdir(project_dir)
print(os.getcwd())

# train.tsv is read as tab-separated text without a header row; its two
# fields are named "label" and "joke". Lines the parser rejects are skipped.
tsv_read_options = {
    "sep": "\t",
    "header": None,
    "names": ["label", "joke"],
    "encoding": "utf-8",
    "engine": "python",
    "on_bad_lines": "skip",
}
df = pd.read_csv("train.tsv", **tsv_read_options)

# Quick look at what was loaded.
print(df.head())
print(df.shape)
 
# Built once instead of on every call: ascii_ratio() is applied to each row
# of a DataFrame, so rebuilding the set per call is wasted work.
_PRINTABLE_CHARS = frozenset(string.printable)


def ascii_ratio(s: str) -> float:
    """Return the fraction of characters in *s* found in ``string.printable``.

    Args:
        s: Text to inspect.

    Returns:
        A value between 0.0 and 1.0. An empty (falsy) input yields 0.0 so
        that it never passes a "mostly printable" threshold.
    """
    if not s:
        return 0.0
    return sum(ch in _PRINTABLE_CHARS for ch in s) / len(s)

def fix_mojibake(text: str) -> str:
    """Try to undo text that was UTF-8 but got decoded as Latin-1.

    The string is re-encoded as Latin-1 and decoded as UTF-8. If either
    step fails, the original string is returned unchanged. Values that are
    not strings are returned as they are.
    """
    if isinstance(text, str):
        try:
            repaired = text.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # The round trip is not possible for this text; keep it untouched.
            repaired = text
        return repaired
    return text



# Discard rows without a joke, then normalise the text: cast to str, trim
# surrounding whitespace and repair mis-decoded characters.
df = df.dropna(subset=["joke"])
stripped_jokes = df["joke"].astype(str).str.strip()
df["joke"] = stripped_jokes
df["joke"] = df["joke"].apply(fix_mojibake)

# Drop obvious gibberish: keep rows whose ascii_ratio is above 0.85.
is_readable = df["joke"].apply(ascii_ratio) > 0.85
df = df.loc[is_readable]

# Sample 6k jokes for the dark-humor file, with a fixed seed so the draw is
# repeatable. NOTE(review): per the pandas docs, sample() raises ValueError
# if fewer than 6000 rows survive the filters above.
sample_size = 6000
dark_df = df.sample(n=sample_size, random_state=42)
dark_df.loc[:, ["joke"]].to_csv("dark_jokes.csv", index=False)

In [None]:
# Load the raw dad jokes as a single "joke" column (the file is read without
# a header row), drop missing entries and trim surrounding whitespace.
# NOTE(review): with header=None a header line in the file, if there is one,
# would be kept as a data row - confirm against the actual CSV.
df = (
    pd.read_csv("dad_jokes_complete.csv", header=None, names=["joke"])
    .dropna(subset=["joke"])
    .assign(joke=lambda frame: frame["joke"].astype(str).str.strip())
)

# Try to fix mojibake
def fix_mojibake(text: str) -> str:
    """Try to repair UTF-8 text that was mistakenly decoded as Latin-1.

    Args:
        text: Value to repair. Non-string values are returned unchanged.

    Returns:
        The text re-encoded as Latin-1 and decoded as UTF-8, or the original
        text when that round trip is not possible.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Only the two codec failures mean "leave the text alone". A bare
        # `except:` would also swallow KeyboardInterrupt and SystemExit.
        return text

# Run every joke through fix_mojibake; it returns the value unchanged when
# the Latin-1 -> UTF-8 round trip is not possible.
df["joke"] = df["joke"].apply(fix_mojibake)

# Remove gibberish using ascii_ratio
# Computed once and reused: ascii_ratio() runs for every joke, so building
# the lookup set inside the function would repeat the same work per row.
_PRINTABLE_CHARS = frozenset(string.printable)


def ascii_ratio(s: str) -> float:
    """Return the share of characters in *s* that are in ``string.printable``.

    Args:
        s: Text to inspect.

    Returns:
        A float in [0.0, 1.0]; 0.0 for an empty (falsy) input.
    """
    if not s:
        return 0.0
    return sum(ch in _PRINTABLE_CHARS for ch in s) / len(s)

# Keep only jokes whose ascii_ratio is above 0.85.
printable_enough = df["joke"].apply(ascii_ratio) > 0.85
df = df.loc[printable_enough]

# Remove overly long entries: keep jokes shorter than 400 characters.
short_enough = df["joke"].str.len() < 400
df = df.loc[short_enough]

# Save cleaned dataset
cleaned_path = "dad_jokes_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print("Cleaned dad jokes:", df.shape)


import pandas as pd

# Reload the cleaned file and draw 6000 jokes with a fixed seed.
df = pd.read_csv(cleaned_path)

dad_df = df.sample(n=6000, random_state=42)
sampled_path = "dad_jokes_sampled.csv"
dad_df.to_csv(sampled_path, index=False)

print("Saved sampled dad jokes:", dad_df.shape)

In [None]:
import pandas as pd
import string
import os 


# Relative paths below are resolved against the project folder.
# NOTE(review): this absolute path is machine-specific.
project_dir = "C:/Users/patry/Desktop/Uni/Core topics AI/LLM project"
os.chdir(project_dir)

# Load dataset as a single "joke" column (the file is read without a header
# row). NOTE(review): if shortjokes.csv has a header line or more than one
# column, header=None with a single name will not parse it as intended -
# confirm against the actual file.
raw_short = pd.read_csv("shortjokes.csv", header=None, names=["joke"])

# Drop NaN & strip whitespace
df = raw_short.dropna(subset=["joke"])
trimmed = df["joke"].astype(str).str.strip()
df["joke"] = trimmed


def fix_mojibake(text: str) -> str:
    """Attempt to reverse a UTF-8-read-as-Latin-1 decoding mistake.

    Args:
        text: Value to repair. Anything that is not a ``str`` is returned
            as-is.

    Returns:
        The repaired string, or the input unchanged when it cannot be
        encoded as Latin-1 or the resulting bytes are not valid UTF-8.
    """
    if not isinstance(text, str):
        return text
    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Catch only the codec errors this conversion can raise. A bare
        # `except:` would also hide KeyboardInterrupt and real bugs.
        return text

# Run every joke through fix_mojibake; it returns the value unchanged when
# the Latin-1 -> UTF-8 round trip is not possible.
df["joke"] = df["joke"].apply(fix_mojibake)


# Lookup set built a single time; the function below is called once per
# joke, so constructing the set on each call is unnecessary.
_PRINTABLE_CHARS = frozenset(string.printable)


def ascii_ratio(s: str) -> float:
    """Return how much of *s* consists of ``string.printable`` characters.

    Args:
        s: Text to inspect.

    Returns:
        The printable-character count divided by ``len(s)``, or 0.0 when
        *s* is empty (falsy).
    """
    if not s:
        return 0.0
    return sum(ch in _PRINTABLE_CHARS for ch in s) / len(s)

# Keep only jokes whose ascii_ratio is above 0.85.
readable_mask = df["joke"].apply(ascii_ratio) > 0.85
df = df.loc[readable_mask]

# Keep only jokes shorter than 400 characters.
length_mask = df["joke"].str.len() < 400
df = df.loc[length_mask]

# Save cleaned dataset
cleaned_path = "shortjokes_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print("Cleaned short jokes:", df.shape)


import pandas as pd

# Reload the cleaned file and draw 6000 jokes with a fixed seed.
df = pd.read_csv(cleaned_path)

short_df = df.sample(n=6000, random_state=42)
sampled_path = "short_jokes_sampled.csv"
short_df.to_csv(sampled_path, index=False)

print("Saved sampled short jokes:", short_df.shape)

# Convert the CSV files into instruction-format JSONL files

In [None]:
# Show the current working directory; the CSV/JSONL paths used below are relative.
print("Working directory:", os.getcwd())

def csv_to_jsonl(csv_path, jsonl_path, instruction_text, sep=",", column="joke"):
    """Convert one CSV column of jokes into an instruction-style JSONL file.

    Every non-missing value of the chosen column becomes one output line of
    the form ``{"instruction": instruction_text, "input": "", "output": joke}``.

    Args:
        csv_path: Path of the CSV file to read.
        jsonl_path: Path of the JSONL file to write (overwritten if present).
        instruction_text: Instruction string stored with every example.
        sep: Field separator passed to ``pandas.read_csv``.
        column: Name of the column holding the jokes. If it is absent, the
            first column whose name contains "joke" (case-insensitive) is
            used instead, e.g. a header mangled into ``"joke;;"``.

    Raises:
        KeyError: If neither ``column`` nor any "joke"-like column exists.
            Raised before the output file is opened, so an existing file is
            not truncated.
    """
    # Imported locally because no top-level `import json` is visible in this file.
    import json

    df = pd.read_csv(csv_path, sep=sep)

    # Resolve which column to read. Use the requested one when it exists
    # (even if it is not literally "joke"), otherwise fall back to detection.
    source_column = column
    if source_column not in df.columns:
        source_column = next(
            (name for name in df.columns if "joke" in str(name).lower()),
            None,
        )
        if source_column is None:
            raise KeyError(
                f"No column named {column!r} or containing 'joke' in "
                f"{csv_path!r}; found columns: {list(df.columns)}"
            )

    # Missing values would be written as the non-standard JSON token NaN, so
    # drop them; casting to str keeps every "output" a JSON string.
    jokes = df[source_column].dropna().astype(str)

    with open(jsonl_path, "w", encoding="utf-8") as f:
        for joke in jokes:
            example = {
                "instruction": instruction_text,
                "input": "",
                "output": joke,
            }
            f.write(json.dumps(example, ensure_ascii=False) + "\n")

    print(f"Created: {jsonl_path}")



# One conversion job per humor category:
# (csv_path, jsonl_path, instruction_text, sep, column)
conversion_jobs = [
    # Dad jokes — normal CSV
    ("dad_jokes_sampled.csv", "dad_jokes.jsonl", "Tell a dad joke.", ",", "joke"),
    # Dark jokes — semicolon separator, column is "joke;;"
    # NOTE(review): dark_jokes.csv is written earlier in this notebook by
    # to_csv() with its default comma separator and a "joke" header, so the
    # ";" settings only fit a re-exported copy of the file - confirm which
    # version is on disk.
    ("dark_jokes.csv", "dark_jokes.jsonl", "Tell a dark humor joke.", ";", "joke;;"),
    # Sarcastic/short jokes — normal CSV
    ("short_jokes_sampled.csv", "short_jokes.jsonl", "Tell a short joke.", ",", "joke"),
]

for source_csv, target_jsonl, prompt, delimiter, joke_column in conversion_jobs:
    csv_to_jsonl(
        source_csv,
        target_jsonl,
        prompt,
        sep=delimiter,
        column=joke_column,
    )


## Merge all jokes into a single JSONL file

In [None]:
# Per-category JSONL files, concatenated in this order.
files_to_merge = ["dad_jokes.jsonl", "dark_jokes.jsonl", "short_jokes.jsonl"]

output_file = "humor_3cat.jsonl"

# Copy each input file line by line into the combined output file.
with open(output_file, "w", encoding="utf-8") as merged:
    for part_path in files_to_merge:
        with open(part_path, "r", encoding="utf-8") as part:
            merged.writelines(part)

print("Merged into:", output_file)
