# Test data

In [11]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

# Load the processed location table; the cells below read its
# 'city', 'state', 'country' and 'continent' columns.
tabular_data = pd.read_csv(filepath_or_buffer='processed_data.csv')

## CASE 1:

In [None]:
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
from functools import lru_cache

# Pool of first names; one is drawn at random for every generated row.
first_names = set((
    "Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George", "Hannah",
    "Ian", "Julia", "Kevin", "Lily", "Mason", "Nina", "Oscar", "Paula",
    "Quincy", "Rachel", "Steve", "Tina", "Umar", "Vera", "Walter", "Xena", "Yara", "Zane"
))

# parent region name -> frozenset of the city names listed under it.
state_to_cities, country_to_cities, continent_to_cities = (
    tabular_data.groupby(parent)['city'].apply(frozenset).to_dict()
    for parent in ('state', 'country', 'continent')
)

# parent region name -> frozenset of the state names listed under it.
country_to_states, continent_to_states = (
    tabular_data.groupby(parent)['state'].apply(frozenset).to_dict()
    for parent in ('country', 'continent')
)

# continent name -> frozenset of the country names listed under it.
continent_to_countries = tabular_data.groupby('continent')['country'].apply(frozenset).to_dict()

# Every distinct value per level; complements are taken against these.
all_cities, all_states, all_countries = (
    frozenset(tabular_data[column]) for column in ('city', 'state', 'country')
)

@lru_cache(maxsize=None)
def get_complement(full_set, subset):
    """Return a list of the elements of ``full_set`` that are not in ``subset``.

    Results are memoised without a size bound, so both arguments must be
    hashable (e.g. frozensets). Repeated calls with equal arguments return
    the same list object, so callers should treat the result as read-only.
    """
    remaining = full_set - subset
    return list(remaining)

def generate_optimized(row):
    """Build the six Case 1 "does not live in" questions for one table row.

    For each (region, finer level) combination the options are up to five
    names taken from outside the region plus the row's own value at the
    finer level, shuffled together.

    Args:
        row: Mapping with 'city', 'state', 'country' and 'continent' entries.

    Returns:
        A list of six dicts with keys 'question', 'options', 'gt_answer' and
        'gt_option' (the position of 'gt_answer' inside 'options'), ordered
        continent->country, continent->state, continent->city,
        country->state, country->city, state->city.
    """
    person = random.choice(tuple(first_names))
    city = row['city']
    state = row['state']
    country = row['country']
    continent = row['continent']

    def pick_choices(distractor_pool, answer, n=5):
        # Up to n distractors plus the answer, shuffled; also report where the answer landed.
        sample_size = min(n, len(distractor_pool))
        choices = random.sample(distractor_pool, sample_size)
        choices.append(answer)
        random.shuffle(choices)
        return choices, choices.index(answer)

    # (region named in the question, universe of candidates,
    #  members-by-region lookup, answer taken from the row)
    plan = (
        (continent, all_countries, continent_to_countries, country),
        (continent, all_states, continent_to_states, state),
        (continent, all_cities, continent_to_cities, city),
        (country, all_states, country_to_states, state),
        (country, all_cities, country_to_cities, city),
        (state, all_cities, state_to_cities, city),
    )

    questions = []
    for region, universe, members_by_region, answer in plan:
        members = members_by_region.get(region, frozenset())
        outside_region = get_complement(universe, members)
        choices, answer_position = pick_choices(outside_region, answer)
        questions.append({
            'question': f'If {person} does not live in {region}, then {person} does not live in:',
            'options': choices,
            'gt_answer': answer,
            'gt_option': answer_position,
        })
    return questions

# Generate the Case 1 questions for every table row, then write them all to test_1.csv.
test_data = []
for _, row in tqdm(tabular_data.iterrows(), desc="Processing rows", total=len(tabular_data)):
    test_data += generate_optimized(row)

test_df = pd.DataFrame(data=test_data)
test_df.to_csv('test_1.csv', index=False)



Processing rows: 100%|██████████| 148311/148311 [00:56<00:00, 2628.63it/s] 


In [4]:
# Draw a fixed-seed sample of 1500 Case 1 questions and save it as the evaluation file.
df = pd.read_csv(filepath_or_buffer='test_1.csv')
sampled_df = df.sample(1500, random_state=42)
sampled_df.to_csv(path_or_buf='test_data_case1.csv', index=False)

## CASE 2:

In [None]:
import random
import pandas as pd
from tqdm import tqdm

def generate_optimized(df, row, first_names, precomputed_pairs):
    """Build the four Case 2 "lives in" questions for one table row.

    Each option is a "first, second" label made from a pair of place names;
    the distractors are up to five known pairs other than the row's own.

    Args:
        df: Not used by this function; kept so existing call sites still work.
        row: Mapping with 'city', 'state', 'country' and 'continent' entries.
        first_names: Sequence of names; one is chosen at random for the row.
        precomputed_pairs: Exactly three iterables of 2-tuples, in the order
            (state, country), (state, continent), (country, continent).

    Returns:
        A list of four dicts with keys 'question', 'options', 'gt_answer' and
        'gt_option' (the position of 'gt_answer' inside 'options'). The last
        two entries share one (country, continent) options list.
    """
    person = random.choice(first_names)
    city = row['city']
    state = row['state']
    country = row['country']
    continent = row['continent']

    sc_pairs, scont_pairs, cc_pairs = precomputed_pairs

    def wrong_labels(pairs, target):
        # Render every pair except the row's own as a "first, second" label.
        return [f"{first}, {second}" for first, second in pairs if (first, second) != target]

    def draw(wrong, answer):
        # Up to five distractors followed by the right label (not yet shuffled).
        return random.sample(wrong, min(5, len(wrong))) + [answer]

    def make_entry(premise, option_list, answer):
        return {
            'question': f'If {person} lives in {premise}, then {person} lives in:',
            'options': option_list,
            'gt_answer': answer,
            'gt_option': option_list.index(answer),
        }

    sc_answer = f"{state}, {country}"
    scont_answer = f"{state}, {continent}"
    cc_answer = f"{country}, {continent}"

    sc_wrong = wrong_labels(sc_pairs, (state, country))
    scont_wrong = wrong_labels(scont_pairs, (state, continent))
    cc_wrong = wrong_labels(cc_pairs, (country, continent))

    # All three draws happen before any shuffle.
    sc_options = draw(sc_wrong, sc_answer)
    scont_options = draw(scont_wrong, scont_answer)
    cc_options = draw(cc_wrong, cc_answer)

    for option_list in (sc_options, scont_options, cc_options):
        random.shuffle(option_list)

    return [
        make_entry(city, sc_options, sc_answer),
        make_entry(city, scont_options, scont_answer),
        make_entry(city, cc_options, cc_answer),
        make_entry(state, cc_options, cc_answer),
    ]

# Reload the source table. The path is relative to the working directory,
# matching the first cell of this notebook, instead of the Colab-only
# absolute '/content/...' location, so the cell runs wherever that cell does.
tabular_data = pd.read_csv('processed_data.csv')

# Names are kept in a list because generate_optimized passes them straight to random.choice.
first_names = [
    "Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George", "Hannah",
    "Ian", "Julia", "Kevin", "Lily", "Mason", "Nina", "Oscar", "Paula",
    "Quincy", "Rachel", "Steve", "Tina", "Umar", "Vera", "Walter", "Xena", "Yara", "Zane"
]

# Distinct (finer level, coarser level) name pairs, computed once and shared by every row.
state_country_pairs = set(map(tuple, tabular_data[['state', 'country']].drop_duplicates().values))
state_continent_pairs = set(map(tuple, tabular_data[['state', 'continent']].drop_duplicates().values))
country_continent_pairs = set(map(tuple, tabular_data[['country', 'continent']].drop_duplicates().values))
precomputed_pairs = (state_country_pairs, state_continent_pairs, country_continent_pairs)

# Flatten the per-row question lists into one list of records.
test_data = [
    entry
    for _, row in tqdm(tabular_data.iterrows(), total=len(tabular_data), desc="Processing rows")
    for entry in generate_optimized(tabular_data, row, first_names, precomputed_pairs)
]

pd.DataFrame(test_data).to_csv('test_2.csv', index=False)

print("Test dataset saved to test_2.csv")

In [None]:
# Draw a fixed-seed sample of 1500 Case 2 questions and save it as the evaluation file.
df = pd.read_csv(filepath_or_buffer='test_2.csv')
sampled_df = df.sample(1500, random_state=42)
sampled_df.to_csv(path_or_buf='test_data_case2.csv', index=False)

## CASE 3:

In [None]:
from tqdm import tqdm
import random
import numpy as np
import pandas as pd
from functools import lru_cache

# Pool of first names; one is drawn at random for every generated row.
first_names = set((
    "Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George", "Hannah",
    "Ian", "Julia", "Kevin", "Lily", "Mason", "Nina", "Oscar", "Paula",
    "Quincy", "Rachel", "Steve", "Tina", "Umar", "Vera", "Walter", "Xena", "Yara", "Zane"
))

# city name -> frozenset of the state / country / continent names it appears with.
city_to_states, city_to_countries, city_to_continents = (
    tabular_data.groupby('city')[parent].apply(frozenset).to_dict()
    for parent in ('state', 'country', 'continent')
)

# state name -> frozenset of the country / continent names it appears with.
state_to_countries, state_to_continents = (
    tabular_data.groupby('state')[parent].apply(frozenset).to_dict()
    for parent in ('country', 'continent')
)

# country name -> frozenset of the continent names it appears with.
country_to_continents = tabular_data.groupby('country')['continent'].apply(frozenset).to_dict()

# Every distinct value per level; complements are taken against these.
all_cities, all_states, all_countries, all_continents = (
    frozenset(tabular_data[column]) for column in ('city', 'state', 'country', 'continent')
)

@lru_cache(maxsize=None)
def get_complement(full_set, subset):
    """Return a list of the elements of ``full_set`` that are not in ``subset``.

    Results are memoised without a size bound, so both arguments must be
    hashable (e.g. frozensets). Repeated calls with equal arguments return
    the same list object, so callers should treat the result as read-only.
    """
    remaining = full_set - subset
    return list(remaining)

def generate_optimized(row):
    """Build the Case 3 "Either X or Y" questions for one table row.

    Up to six questions are produced, in this order: city->state,
    city->country, city->continent, state->country, state->continent,
    country->continent. A question is skipped when its options cannot be
    built (see ``get_options`` below), so the result may hold fewer than
    six entries and never contains ``None`` options or answers.

    Args:
        row: Mapping with 'city', 'state' and 'country' entries.

    Returns:
        A list of dicts with keys 'question', 'options', 'gt_answer' and
        'gt_option' (the position of 'gt_answer' inside 'options').
    """
    person = random.choice(tuple(first_names))
    city, state, country = row['city'], row['state'], row['country']

    def get_options(correct_answers, wrong_answers, n=5):
        """Return (options, index of the right option, right option text).

        Returns (None, None, None) when there is no correct value, or when
        there are fewer than 2 * n wrong values to form n distractor pairs.
        """
        correct_answer_list = list(correct_answers)
        wrong_answer_list = list(wrong_answers)

        # random.sample below draws 2 * n distinct wrong values, so the guard
        # must use the same bound; a smaller pool would raise ValueError.
        # An empty correct set leaves nothing to build the right option from.
        if not correct_answer_list or len(wrong_answer_list) < 2 * n:
            return None, None, None

        if len(correct_answer_list) == 1:
            # Only one true value: pair it with a wrong one; the disjunction still holds.
            clist = random.sample(wrong_answer_list, 1) + correct_answer_list
        else:
            clist = random.sample(correct_answer_list, 2)
        random.shuffle(clist)
        correct_answer = f'Either {clist[0]} or {clist[1]}'

        # n distractors, each made of two distinct wrong values.
        olist = random.sample(wrong_answer_list, 2 * n)
        options = [f'Either {olist[i]} or {olist[i + 1]}' for i in range(0, 2 * n, 2)]
        options.append(correct_answer)
        random.shuffle(options)

        return options, options.index(correct_answer), correct_answer

    # (place named in the question, lookup of its true parents, universe of candidate parents)
    question_specs = (
        (city, city_to_states, all_states),
        (city, city_to_countries, all_countries),
        (city, city_to_continents, all_continents),
        (state, state_to_countries, all_countries),
        (state, state_to_continents, all_continents),
        (country, country_to_continents, all_continents),
    )

    questions = []
    for premise, parents_by_place, universe in question_specs:
        correct = parents_by_place.get(premise, frozenset())
        wrong = get_complement(universe, correct)
        options, index, correct_answer = get_options(correct, wrong)
        if options is None:
            # Not enough material for this question; drop it rather than emit a row of Nones.
            continue
        questions.append({
            'question': f'If {person} lives in {premise}, then {person} lives in:',
            'options': options,
            'gt_answer': correct_answer,
            'gt_option': index,
        })
    return questions

# Generate the Case 3 questions for every table row, then write them all to test_3.csv.
test_data = []
for _, row in tqdm(tabular_data.iterrows(), desc="Processing rows", total=len(tabular_data)):
    test_data += generate_optimized(row)

test_df = pd.DataFrame(data=test_data)
test_df.to_csv('test_3.csv', index=False)

Processing rows: 100%|██████████| 148311/148311 [00:15<00:00, 9824.94it/s] 


In [26]:
# Draw a fixed-seed sample of 1500 Case 3 questions and save it as the evaluation file.
df = pd.read_csv(filepath_or_buffer='test_3.csv')
sampled_df = df.sample(1500, random_state=42)
sampled_df.to_csv(path_or_buf='test_data_case3.csv', index=False)