# Preprocessing for yes/no question answering

Imports:

In [68]:
import json

import country_converter as coco
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_text


In [32]:
# Lift pandas' column cap first so the wide preview below is not truncated.
pd.set_option("display.max_columns", None)

# Read the person table from the local data directory.
raw_df = pd.read_csv("./data/person_2025_small.csv")

# Plain-text preview of the first five rows.
print(raw_df.head(5))

         id   wd_id     wp_id            slug            name  \
0     18934   Q9458     18934        Muhammad        Muhammad   
1      3395   Q9441      3395  Gautama_Buddha  Gautama Buddha   
2     14627    Q935     14627    Isaac_Newton    Isaac Newton   
3   4848272  Q22686   4848272    Donald_Trump    Donald Trump   
4  17414699    Q720  17414699    Genghis_Khan    Genghis Khan   

           occupation  prob_ratio gender          twitter  alive    l  \
0    RELIGIOUS FIGURE         0.0      M              NaN  False  223   
1         PHILOSOPHER         0.0      M              NaN  False  209   
2           PHYSICIST         0.0      M              NaN  False  235   
3          POLITICIAN         0.0      M  realDonaldTrump   True  253   
4  MILITARY PERSONNEL         0.0      M              NaN  False  176   

     hpi_raw                  bplace_name  bplace_lat  bplace_lon  \
0  35.680259                        Mecca   21.422500   39.823333   
1  35.495358                    

The idea is to drop the columns we do not need (IDs, coordinates, etc.) and then turn each remaining column into a set of yes/no questions, e.g. "Was the person born in the \[1900s/1800s/etc.\]?" from `birthyear`, or "Is the person a \[Religious Figure/Philosopher/Physicist/etc.\]?" from `occupation`.

In [33]:
# Keep only the columns that questions will be derived from; every other
# column of the loaded table is dropped here.
raw_df = raw_df.loc[
    :,
    [
        "name",
        "occupation",
        "gender",
        "alive",
        "bplace_name",
        "bplace_country",
        "birthyear",
        "dplace_name",
        "deathyear",
        "age",
    ],
]
raw_df.head()

Unnamed: 0,name,occupation,gender,alive,bplace_name,bplace_country,birthyear,dplace_name,deathyear,age
0,Muhammad,RELIGIOUS FIGURE,M,False,Mecca,Saudi Arabia,570,Medina,632.0,62
1,Gautama Buddha,PHILOSOPHER,M,False,Lumbini,Nepal,-566,Kushinagar,-452.0,114
2,Isaac Newton,PHYSICIST,M,False,Woolsthorpe-by-Colsterworth,United Kingdom,1643,Kensington,1726.0,83
3,Donald Trump,POLITICIAN,M,True,Queens,United States,1946,,,79
4,Genghis Khan,MILITARY PERSONNEL,M,False,Khentii Mountains,Mongolia,1162,Yinchuan,1227.0,65


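Let's define the encoders that turn each column into yes/no traits: occupation (exact match and broad category), age, birth year (50-year buckets and centuries), birth country and continent.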

In [65]:
def one_hot_occupations(occupation, top_n=20, top_occ=None):
    """One-hot encode a person's occupation against a list of reference occupations.

    Args:
        occupation: The person's occupation. Anything that is not a string
            (e.g. NaN or None for a missing value) matches no reference
            occupation instead of raising.
        top_n: Currently unused; kept so existing callers keep working.
        top_occ: Reference occupations. Non-string entries are skipped and
            duplicates collapse onto the same key.

    Returns:
        dict mapping ``is_<occupation>`` (lower-cased, spaces replaced by
        underscores) to a bool. Empty when ``top_occ`` is empty or None.
    """
    if not top_occ:
        return {}

    # Normalise once instead of on every iteration. A missing occupation
    # becomes None and therefore compares unequal to every reference entry.
    person_occ = occupation.lower() if isinstance(occupation, str) else None

    result = {}
    for occ in top_occ:
        if not isinstance(occ, str):
            continue
        occ_key = occ.lower()
        result[f"is_{occ_key.replace(' ', '_')}"] = person_occ == occ_key
    return result

def encode_age(age):
    """Bucket an age into three yes/no traits.

    The boundaries 30 and 60 both fall into the middle bucket.

    Returns:
        dict with the keys ``age_under_30``, ``age_30_to_60`` and
        ``age_over_60``, each holding the result of the comparison.
    """
    traits = {}
    traits["age_under_30"] = age < 30
    traits["age_30_to_60"] = 30 <= age <= 60
    traits["age_over_60"] = age > 60
    return traits
    

def generate_year_buckets(df, col='birthyear', bin_width=50):
    """Build contiguous year buckets that cover every value in ``df[col]``.

    Args:
        df: DataFrame holding the year column.
        col: Name of the year column.
        bin_width: Width of each bucket in years.

    Returns:
        List of ``(label, start, end)`` tuples in ascending order, where the
        bucket covers ``start <= year < end`` and the label is
        ``born_<start>_<end - 1>``.
    """
    years = df[col]
    lowest, highest = int(years.min()), int(years.max())

    # Snap outwards to multiples of bin_width so the first bucket starts at
    # or below the minimum and the last one ends above the maximum.
    first_start = (lowest // bin_width) * bin_width
    stop = ((highest // bin_width) + 1) * bin_width

    return [
        (f"born_{start}_{start + bin_width - 1}", start, start + bin_width)
        for start in range(first_start, stop, bin_width)
    ]

def get_all_century_labels(df, col='birthyear'):
    """List the century trait names spanning the values of ``df[col]``.

    A year is mapped to a century number with ``year // 100 + 1``; every
    number from the minimum's to the maximum's (inclusive) is turned into a
    label by ``century_label``.
    """
    years = df[col]
    first = int(years.min() // 100) + 1
    last = int(years.max() // 100) + 1

    labels = []
    for century in range(first, last + 1):
        labels.append(century_label(century))
    return labels

def century_label(century):
    """Return the trait name for a century number, e.g. ``born_in_20th_century``.

    ``century`` is expected to follow the ``year // 100 + 1`` convention, so
    years 1900..1999 give 20, and negative (BCE) years give zero or negative
    numbers: -100..-1 -> 0, -200..-101 -> -1, and so on.

    Non-positive numbers are rendered as BCE centuries with a ``_bc`` suffix
    (0 -> ``born_in_1st_century_bc``, -5 -> ``born_in_6th_century_bc``)
    rather than as labels like ``born_in_-5th_century``.

    Args:
        century: Century number; integral floats such as ``20.0`` are accepted.

    Returns:
        The trait name as a string.
    """
    # A float year column yields 20.0 here; without the cast the label would
    # read "20.0th" and never match the integer-built labels.
    century = int(century)

    era_suffix = ""
    if century <= 0:
        # 0 -> 1st century BC, -1 -> 2nd century BC, ...
        century = 1 - century
        era_suffix = "_bc"

    if 10 < century % 100 < 14:
        ordinal = "th"  # 11th, 12th, 13th
    else:
        ordinal = {1: "st", 2: "nd", 3: "rd"}.get(century % 10, "th")
    return f"born_in_{century}{ordinal}_century{era_suffix}"

def encode_birth_year(year, year_buckets, all_century_labels):
    """Encode a birth year as one-hot bucket and century traits.

    Args:
        year: Birth year (int or float). A missing value (NaN, None, pd.NA)
            leaves every trait False.
        year_buckets: ``(label, start, end)`` tuples; a bucket is True when
            ``start <= year < end``.
        all_century_labels: Every century label that should appear as a key.

    Returns:
        dict with one bool per bucket label followed by one bool per century
        label.
    """
    # Start from all-False so every person gets the same set of keys.
    traits = {label: False for label, _, _ in year_buckets}
    traits.update({label: False for label in all_century_labels})

    # Check for a missing year before comparing: None and pd.NA cannot be
    # used in the chained comparison below.
    if pd.isnull(year):
        return traits

    # 50-year buckets
    for label, start, end in year_buckets:
        traits[label] = bool(start <= year < end)

    # Century buckets (one-hot all). The int() cast keeps a float year from
    # producing a float century; the label is computed once, outside the loop.
    own_label = century_label(int(year // 100) + 1)
    for label in all_century_labels:
        traits[label] = label == own_label

    return traits

def get_all_countries(df, col="bplace_country"):
    """Return the sorted, normalised country names found in ``df[col]``.

    Missing values and non-string entries are ignored; each remaining name
    is lower-cased and its spaces are replaced by underscores.
    """
    normalised = []
    for country in df[col].dropna().unique():
        if isinstance(country, str):
            normalised.append(country.lower().replace(" ", "_"))
    normalised.sort()
    return normalised

def encode_country(country_name, all_countries):
    """One-hot encode a birth country against the known country list.

    Args:
        country_name: Country as it appears in the data; a non-string value
            or a country absent from ``all_countries`` leaves every trait False.
        all_countries: Normalised (lower-case, underscore-separated) names.

    Returns:
        dict with one ``from_<country>`` bool per entry of ``all_countries``.
    """
    traits = dict.fromkeys((f"from_{c}" for c in all_countries), False)
    if not isinstance(country_name, str):
        return traits

    trait_name = "from_" + country_name.lower().replace(" ", "_")
    if trait_name in traits:
        traits[trait_name] = True
    return traits

    
ALL_CONTINENTS = ["africa", "antarctica", "asia", "europe", "america", "oceania"]

# Continent already looked up for a given country name, so the converter is
# called once per distinct country instead of once per row.
_CONTINENT_CACHE = {}


def encode_continent(country_name):
    """One-hot encode the continent of a country.

    Args:
        country_name: Country name handed to ``coco.convert``. A non-string
            value (missing birthplace) is not looked up at all.

    Returns:
        dict with one ``from_<continent>`` bool per entry of
        ``ALL_CONTINENTS``; all False when the country is missing, the
        conversion fails, or the result is not one of ``ALL_CONTINENTS``.
    """
    traits = {f"from_{c.replace(' ', '_')}": False for c in ALL_CONTINENTS}
    if not isinstance(country_name, str):
        return traits

    continent = _CONTINENT_CACHE.get(country_name)
    if continent is None:
        try:
            converted = coco.convert(names=country_name, to='continent')
        except Exception:
            # Best effort: a failed conversion leaves all traits False.
            # Only Exception is caught so Ctrl-C / SystemExit still propagate,
            # and the failure is not cached.
            return traits
        # Anything that is not a plain string is treated as "no continent".
        continent = converted.lower() if isinstance(converted, str) else ""
        _CONTINENT_CACHE[country_name] = continent

    if continent in ALL_CONTINENTS:
        traits[f"from_{continent.replace(' ', '_')}"] = True
    return traits

# Broad occupation categories, each with the lower-case keywords that are
# searched for as substrings of the occupation. A keyword may appear in more
# than one category (e.g. "king" in both "politics" and "royalty").
OCCUPATION_CATEGORIES = {
    "stem": [
        "scientist", "physicist", "chemist", "biologist", "mathematician", "statistician",
        "astronomer", "engineer", "inventor", "ecologist", "geologist", "roboticist", "computer scientist"
    ],
    "arts": [
        "artist", "painter", "sculptor", "photographer", "illustrator", "designer", "architect",
        "writer", "novelist", "poet", "screenwriter", "playwright",
        "actor", "actress", "director", "filmmaker", "dancer", "choreographer"
    ],
    "music": [
        "musician", "composer", "singer", "songwriter", "rapper", "pianist", "guitarist", "violinist", "dj"
    ],
    "sports": [
        "athlete", "footballer", "basketball player", "baseball player", "tennis player",
        "runner", "sprinter", "swimmer", "cyclist", "boxer", "wrestler", "skater", "coach",
        "cricketer", "golfer", "fencer", "gymnast", "judoka", "karateka"
    ],
    "media": [
        "journalist", "tv presenter", "radio host", "news anchor", "broadcaster", "youtuber", "podcaster", "influencer"
    ],
    "politics": [
        "politician", "president", "prime minister", "chancellor", "diplomat", "mayor", "senator", "governor", "ambassador", "monarch", "king", "queen", "emperor", "empress", "czar", "kaiser"
    ],
    "military": [
        "soldier", "general", "admiral", "commander", "officer", "warrior", "mercenary", "veteran",
        # Occupation value present in the dataset ("MILITARY PERSONNEL").
        "military personnel"
    ],
    "business": [
        "entrepreneur", "businessperson", "executive", "ceo", "founder", "investor", "venture capitalist", "economist", "banker", "trader"
    ],
    "religion": [
        "priest", "monk", "nun", "theologian", "bishop", "cardinal", "imam", "rabbi", "pastor", "saint", "pope",
        # Occupation value present in the dataset ("RELIGIOUS FIGURE").
        "religious figure"
    ],
    "academia": [
        "professor", "academic", "philosopher", "historian", "linguist", "sociologist", "anthropologist", "archaeologist", "psychologist"
    ],
    "law": [
        "lawyer", "judge", "jurist", "prosecutor", "barrister"
    ],
    "crime": [
        "criminal", "mobster", "gangster", "assassin", "pirate", "serial killer", "terrorist"
    ],
    "royalty": [
        "king", "queen", "prince", "princess", "duke", "duchess", "emperor", "empress", "tsar", "pharaoh", "regent"
    ],
    "fictional": [
        "superhero", "villain", "fictional character", "wizard", "detective", "alien", "robot", "vampire", "god", "monster"
    ],
    "activism": [
        "activist", "feminist", "environmentalist", "revolutionary", "civil rights leader", "suffragist"
    ],
    "exploration": [
        "explorer", "navigator", "adventurer", "pioneer", "mountaineer", "astronaut"
    ],
    "misc": [
        "celebrity", "model", "beauty queen", "socialite", "pageant winner", "chef", "cook"
    ]
}


def encode_occupation_category(occupation, categories=OCCUPATION_CATEGORIES):
    """Flag every broad category whose keywords occur in the occupation.

    Matching is a case-insensitive substring test, and a person can fall
    into several categories at once.

    Args:
        occupation: Occupation string. A non-string value (missing
            occupation) matches nothing and is reported as ``is_other``.
        categories: Mapping of category name to a list of lower-case keywords.

    Returns:
        dict with one ``is_<category>`` bool per category plus ``is_other``,
        which is True only when no category matched. ``is_other`` is always
        present so that every row exposes the same keys.
    """
    traits = {f"is_{key}": False for key in categories}
    # Emit the key up front; otherwise matched rows lack it and the trait
    # table built from these dicts gets NaN in that column.
    traits.setdefault("is_other", False)

    matched = False
    if isinstance(occupation, str):
        occ_lower = occupation.lower()
        for category, keywords in categories.items():
            # One hit is enough to flag the category; the remaining
            # categories are still checked.
            if any(keyword in occ_lower for keyword in keywords):
                traits[f"is_{category}"] = True
                matched = True

    if not matched:
        traits["is_other"] = True

    return traits


In [66]:
# Process each row into a {"name", "traits"} record.

# Distinct occupations in first-seen order. Passing the whole column (with
# one entry per person) would make the one-hot step loop over every row's
# occupation for every row; the resulting keys are the same either way.
top_occupations = raw_df["occupation"].dropna().unique().tolist()
year_buckets = generate_year_buckets(raw_df, col='birthyear', bin_width=50)
century_labels = get_all_century_labels(raw_df, col='birthyear')
all_countries = get_all_countries(raw_df, col="bplace_country")

processed = []
for _, row in raw_df.iterrows():
    traits = {
        "is_male": row["gender"] == "M",
        "is_alive": row["alive"],
    }
    # Later updates win on key clashes, so the order below matters.
    traits.update(one_hot_occupations(row["occupation"], top_occ=top_occupations))
    traits.update(encode_birth_year(row['birthyear'], year_buckets, century_labels))
    traits.update(encode_age(row["age"]))
    traits.update(encode_occupation_category(row["occupation"]))
    traits.update(encode_continent(row["bplace_country"]))
    traits.update(encode_country(row["bplace_country"], all_countries))
    processed.append({
        "name": row["name"],
        "traits": traits
    })

# Save to JSON
with open("processed_characters.json", "w", encoding="utf-8") as f:
    json.dump(processed, f, indent=2)

## Let's build a Decision Tree on it

In [69]:


# Load the preprocessed records written by the previous step
with open("processed_characters.json", encoding="utf-8") as f:
    data = json.load(f)

# Build DataFrame: one row per person, one column per trait
names = [entry["name"] for entry in data]
traits_list = [entry["traits"] for entry in data]

df = pd.DataFrame(traits_list)
df["name"] = names

# Encode names as target classes (each person is their own class)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["name"])

# A trait that is absent from some records shows up as NaN in the frame.
# Comparing against True turns every column into plain booleans and reads a
# missing trait as "no".
X = df.drop(columns=["name"]).eq(True)

# Train Decision Tree.
# max_depth=10 caps a game at ten questions, which also means the tree can
# tell apart at most 2**10 people. random_state makes the fitted tree (and
# therefore the question order) reproducible across runs.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=10, random_state=0)
clf.fit(X, y)

# Print the decision tree structure as text
tree_text = export_text(clf, feature_names=list(X.columns))
print(tree_text)


|--- from_europe <= 0.50
|   |--- from_asia <= 0.50
|   |   |--- born_in_20th_century <= 0.50
|   |   |   |--- from_united_states <= 0.50
|   |   |   |   |--- from_egypt <= 0.50
|   |   |   |   |   |--- from_africa <= 0.50
|   |   |   |   |   |   |--- born_-1050_-1001 <= 0.50
|   |   |   |   |   |   |   |--- class: 123
|   |   |   |   |   |   |--- born_-1050_-1001 >  0.50
|   |   |   |   |   |   |   |--- class: 128
|   |   |   |   |   |--- from_africa >  0.50
|   |   |   |   |   |   |--- is_philosopher <= 0.50
|   |   |   |   |   |   |   |--- class: 66
|   |   |   |   |   |   |--- is_philosopher >  0.50
|   |   |   |   |   |   |   |--- class: 13
|   |   |   |   |--- from_egypt >  0.50
|   |   |   |   |   |--- is_mathematician <= 0.50
|   |   |   |   |   |   |--- age_over_60 <= 0.50
|   |   |   |   |   |   |   |--- class: 27
|   |   |   |   |   |   |--- age_over_60 >  0.50
|   |   |   |   |   |   |   |--- class: 97
|   |   |   |   |   |--- is_mathematician >  0.50
|   |   |   |   |   | 

In [71]:
def ask_tree_questions(clf, X_columns, label_encoder):
    """Play one guessing round by walking the fitted tree with yes/no questions.

    Starting at the root, the user is asked about the feature tested at the
    current node until a leaf is reached; the leaf's strongest class is then
    printed as the guess.

    Args:
        clf: Fitted classifier exposing ``tree_`` and ``classes_``.
        X_columns: Feature names, indexable by the tree's feature indices.
        label_encoder: Object whose ``inverse_transform`` maps encoded class
            ids back to names.

    Returns:
        None. The guess is printed.
    """
    yes_replies = ("y", "yes")
    no_replies = ("n", "no")

    tree = clf.tree_
    node = 0  # start at root

    # A node whose two child ids coincide has no real children: it is a leaf.
    while tree.children_left[node] != tree.children_right[node]:
        feature_name = X_columns[tree.feature[node]]
        question = f"Is the answer to '{feature_name.replace('_', ' ')}' yes? (y/n) "

        # Ask again on anything unrecognised, so that "yes" or a typo is not
        # silently taken as "no".
        while True:
            answer = input(question).strip().lower()
            if answer in yes_replies or answer in no_replies:
                break
            print("Please answer 'y' or 'n'.")

        # "Yes" follows the right branch, "no" the left one.
        if answer in yes_replies:
            node = tree.children_right[node]
        else:
            node = tree.children_left[node]

    # Predict class (majority vote in this leaf). The argmax is a position in
    # clf.classes_, which is translated to the encoded label before decoding.
    class_index = tree.value[node].argmax()
    class_id = clf.classes_[class_index]
    prediction = label_encoder.inverse_transform([class_id])[0]
    print(f"\n🧠 I guess: {prediction}")


In [73]:
# Play one interactive round against the fitted tree.
ask_tree_questions(clf, list(X.columns), label_encoder)



🧠 I guess: Pope Francis
