In [None]:
import pandas as pd
import random
from tqdm import tqdm


# Load the city table and discard the "Duplicate Flag" column, which none of
# the generators below read.
data = pd.read_csv("/content/imaginary_cities_with_duplicates.csv").drop(
    columns="Duplicate Flag"
)

# Pool of first names (one per letter, A through Z) from which the person
# mentioned in each generated question is drawn.
first_names = [
    "Alice", "Bob", "Charlie", "Diana", "Ethan", "Fiona", "George",
    "Hannah", "Ian", "Julia", "Kevin", "Lily", "Mason",
    "Nina", "Oscar", "Paula", "Quincy", "Rachel", "Steve",
    "Tina", "Umar", "Vera", "Walter", "Xena", "Yara", "Zane",
]

def sample_excluding(lst, exclude, n=1):
    """Draw ``n`` distinct random elements of ``lst`` that are not in ``exclude``.

    Args:
        lst: Iterable of candidate values.
        exclude: Container of values that must not be returned (tested with ``in``).
        n: Number of elements to draw.

    Returns:
        A list of ``n`` elements chosen with ``random.sample``, or ``None`` when
        fewer than ``n`` candidates remain after the exclusion.
    """
    candidates = []
    for element in lst:
        if element in exclude:
            continue
        candidates.append(element)

    if n > len(candidates):
        return None
    return random.sample(candidates, n)

In [None]:
# Relabel the columns (positionally) with the short names that the generator
# functions use to index the table.
hierarchy_columns = ["city", "state", "country", "continent"]
data.columns = hierarchy_columns

In [None]:
# Preview the first five rows to confirm the new column labels.
data.head(n=5)

Unnamed: 0,city,state,country,continent
0,Semolamo,Vanguard,Calinth,Yenith
1,Dalabu,Landsworth,Sundarim,Ziratha
2,Shidama,Harrington,Balandia,Yenith
3,Kaka,Mireford,Voltria,Xylandia
4,Damoteshi,Nendora,Vekharia,Yenith


In [None]:
# Function to generate data for a given level (state, country, continent)
def generate_data_4(level, tries=1):
    """Generate "lives in A or B" multiple-choice questions for one level.

    For every distinct value of ``data[level]`` the places contained in it are
    collected (cities for a state; cities and states for a country; cities,
    states and countries for a continent). Up to ``tries`` times, two not yet
    used places are drawn and turned into a question whose correct answer is
    the containing value and whose distractors are five other values of the
    same level that contain neither of the two places.

    Relies on the notebook globals ``data``, ``first_names`` and
    ``sample_excluding``.

    Args:
        level: Column of ``data`` to ask about: ``"state"``, ``"country"`` or
            ``"continent"``. Any other existing column yields an empty list.
        tries: Maximum number of questions attempted per distinct value.

    Returns:
        A list of dicts with the keys ``"question"``, ``"options"`` (six
        values), ``"gt_answer"`` and ``"gt_option"`` (1-based position of the
        answer inside ``"options"``).
    """
    # Columns whose values are contained in a value of the given level.
    contained_columns = {
        "state": ["city"],
        "country": ["city", "state"],
        "continent": ["city", "state", "country"],
    }

    generated_data = []
    items = data[level].unique()

    if level not in contained_columns:
        # Unsupported level: nothing can be generated.
        return generated_data

    for item in tqdm(items):
        rows = data[data[level] == item]

        in_objects = []
        for column in contained_columns[level]:
            in_objects.extend(rows[column].unique())
        # The same name may occur in more than one column (e.g. a city named
        # like a state). Keep a single copy, preserving order, so a question
        # can never offer the same place twice ("A or A").
        in_objects = list(dict.fromkeys(in_objects))

        name = random.choice(first_names)

        if len(in_objects) < 2:
            continue

        used_objs = []
        for _ in range(tries):
            objects = sample_excluding(in_objects, used_objs, 2)
            if objects is None:
                # used_objs only grows, so no later attempt can succeed.
                break

            # Every value of this level that contains one of the two places
            # (the correct answer included) must not appear as a distractor.
            mentions = (
                data["city"].isin(objects)
                | data["state"].isin(objects)
                | data["country"].isin(objects)
            )
            remove_items = data.loc[mentions, level].unique()
            options = sample_excluding(items, remove_items, 5)
            if options is None:
                # Not enough unambiguous distractors for this pair; retry with
                # another pair.
                continue

            # randint is inclusive on both ends, so the answer may also be
            # inserted after the last distractor.
            random_index = random.randint(0, len(options))
            options.insert(random_index, item)

            question = f"If {name} lives in {objects[0]} or {objects[1]}, then {name} lives in:"
            generated_data.append({
                "question": question,
                "options": options,
                "gt_answer": item,
                "gt_option": random_index + 1,
            })

            used_objs.extend(objects)

    return generated_data


# State-level questions: up to 20 attempts per state.
generated_data_state_4 = generate_data_4(level="state", tries=20)
print(f"\nGenerated {len(generated_data_state_4)} state-based questions.")

# Country-level questions: up to 40 attempts per country.
generated_data_countries_4 = generate_data_4(level="country", tries=40)
print(f"\nGenerated {len(generated_data_countries_4)} country-based questions.")


# Continent-level questions: up to 80 attempts per continent.
generated_data_continents_4 = generate_data_4(level="continent", tries=80)
print(f"\nGenerated {len(generated_data_continents_4)} continent-based questions.")

100%|██████████| 10/10 [00:00<00:00, 22.26it/s]



Generated 200 state-based questions.


100%|██████████| 10/10 [00:00<00:00, 11.58it/s]



Generated 312 country-based questions.


100%|██████████| 6/6 [00:00<00:00,  8.16it/s]


Generated 90 continent-based questions.





In [None]:

# Assemble the final question set: 250 randomly chosen country questions, all
# continent questions, then randomly chosen state questions to fill the set up
# to the requested total.
total_req = 500

combined_data = [
    *random.sample(generated_data_countries_4, 250),
    *generated_data_continents_4,
]
req = total_req - len(combined_data)
if req > 0:
    combined_data.extend(random.sample(generated_data_state_4, req))

# Tabulate the questions and write them to disk without the index column.
df = pd.DataFrame(combined_data)
df.to_csv("combined_generated_data_4.csv", index=False)

In [None]:
def generate_data_5(level, tries=1):
    """Generate "lives in <place>" multiple-choice questions from single rows.

    For every distinct value of ``data[level]``, up to ``tries`` rows belonging
    to that value are drawn without replacement. Each row yields one question:
    for ``"country"`` the place is ``"<city>, <state>"``; for ``"continent"``
    it is two of city, state and country (chosen at random, always listed in
    that order). The correct answer is the value itself and the distractors
    are five other values of the same level.

    Relies on the notebook globals ``data``, ``first_names`` and
    ``sample_excluding``.

    Args:
        level: Column of ``data`` to ask about: ``"country"`` or
            ``"continent"``.
        tries: Maximum number of questions per distinct value.

    Returns:
        A list of dicts with the keys ``"question"``, ``"options"`` (six
        values), ``"gt_answer"`` and ``"gt_option"`` (1-based position of the
        answer inside ``"options"``).

    Raises:
        ValueError: If ``level`` is an existing column other than
            ``"country"`` or ``"continent"``; no place description is defined
            for it, so questions would otherwise contain an empty place.
    """
    generated_data = []

    items = data[level].unique()

    if level not in ("country", "continent"):
        raise ValueError(
            f"generate_data_5 supports level 'country' or 'continent', got {level!r}"
        )

    for item in tqdm(items):
        # Explicit copy: used rows are removed from this pool, and doing that
        # in place on a bare slice of `data` raises SettingWithCopyWarning.
        remaining = data[data[level] == item].copy()

        for _ in range(tries):
            # Checked before sampling so an empty pool never reaches .sample().
            if remaining.empty:
                break

            ques_row = remaining.sample(1)
            city = ques_row["city"].item()
            state = ques_row["state"].item()

            if level == "country":
                test_entity = f"{city}, {state}"
            else:
                country = ques_row["country"].item()
                places = [city, state, country]
                # Pick two of the three places; sorting the picked positions
                # keeps the city -> state -> country order in the question.
                keep = sorted(random.sample(range(len(places)), 2))
                try:
                    test_entity = ", ".join(places[i] for i in keep)
                except TypeError:
                    # A non-string cell cannot be written into a question;
                    # discard this row and move on to another one.
                    remaining = remaining.drop(index=ques_row.index)
                    continue

            options = sample_excluding(items, [item], 5)
            if options is None:
                continue

            # randint is inclusive on both ends, so the answer may also be
            # inserted after the last distractor.
            random_index = random.randint(0, len(options))
            options.insert(random_index, item)

            name = random.choice(first_names)

            question = f"If {name} lives in {test_entity}, then {name} lives in:"
            generated_data.append({
                "question": question,
                "options": options,
                "gt_answer": item,
                "gt_option": random_index + 1,
            })

            # Each row is used for at most one question.
            remaining = remaining.drop(index=ques_row.index)

    return generated_data

# Country-level questions: at most 50 per country.
generated_data_countries_5 = generate_data_5(level="country", tries=50)
print(f"\nGenerated {len(generated_data_countries_5)} country-based questions.")

# Continent-level questions: at most 100 per continent.
generated_data_continents_5 = generate_data_5(level="continent", tries=100)
print(f"\nGenerated {len(generated_data_continents_5)} continent-based questions.")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_data.drop(ques_row.index, inplace=True)
100%|██████████| 10/10 [00:00<00:00, 34.99it/s]



Generated 500 country-based questions.


100%|██████████| 6/6 [00:00<00:00, 14.63it/s]


Generated 600 continent-based questions.





In [None]:
# Assemble the final question set: 250 randomly chosen continent questions,
# then randomly chosen country questions to fill the set up to the requested
# total.
total_req = 500

combined_data = random.sample(generated_data_continents_5, 250)
req = total_req - len(combined_data)
if req > 0:
    combined_data.extend(random.sample(generated_data_countries_5, req))

# Tabulate the questions and write them to disk without the index column.
df = pd.DataFrame(combined_data)
df.to_csv("combined_generated_data_5.csv", index=False)