In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Load the per-name metadata table; index_col=0 uses the CSV's first column as the row index.
names_csv_path = '../interim-data/filtered-names.csv'
name_meta = pd.read_csv(names_csv_path, index_col=0)
name_meta.head()

Unnamed: 0,name,n,n_max,percent_per_year,connotation_1,connotation_2,connotation_3,connotation_4,connotation_5
0,Aaban,256.0,127.0,0.000402,dignity,nobility,prosperity,leadership,strength
1,Aadam,2638.0,343.0,0.000315,first human,earth,life,creation,origin
2,Aadan,139.0,136.0,0.000253,spiritual,strong,traditional,wise,noble
3,Aadarsh,266.0,246.0,0.00036,ideal,perfect,model,exemplary,principle
4,Aaden,5109.0,5061.0,0.003249,fire,warmth,light,passion,strength


In [3]:
# Lookup table: name -> n_max, built by pairing the two columns row by row.
# If a name occurs more than once, the last row's n_max is the one kept.
name_to_max_count = {
    name: n_max
    for name, n_max in zip(name_meta['name'], name_meta['n_max'])
}

# Peek at the first five (name, n_max) pairs as a sanity check
list(name_to_max_count.items())[:5]

[('Aaban', 127.0),
 ('Aadam', 343.0),
 ('Aadan', 136.0),
 ('Aadarsh', 246.0),
 ('Aaden', 5061.0)]

In [4]:
# Load the pairwise name distances; index_col=0 uses the CSV's first column as the row index.
distances_csv_path = '../interim-data/recent-us-and-uk-names-distance.csv'
name_distances = pd.read_csv(distances_csv_path, index_col=0)
name_distances.head()

Unnamed: 0,name_1,name_2,distance
0,Aaban,Aadam,4.312159
1,Aaban,Aadan,3.383403
2,Aadam,Aadan,3.474888
3,Aaban,Aadarsh,4.604571
4,Aadam,Aadarsh,4.10018


In [5]:
# Mirror the pair table so every pair is present in both directions.
# Swapping the two column labels turns each (name_1, name_2, distance) row into
# (name_2, name_1, distance) without touching the underlying values.
swapped_labels = {'name_1': 'name_2', 'name_2': 'name_1'}
name_distances_renamed = name_distances.rename(columns=swapped_labels)

# Stack the original rows on top of the mirrored rows. pd.concat aligns on
# column labels, and ignore_index=True gives the result a fresh 0..n-1 index.
# No de-duplication is performed here.
frames_to_stack = [name_distances, name_distances_renamed]
full_distances = pd.concat(frames_to_stack, ignore_index=True)

# Preview the combined table
full_distances.head()

Unnamed: 0,name_1,name_2,distance
0,Aaban,Aadam,4.312159
1,Aaban,Aadan,3.383403
2,Aadam,Aadan,3.474888
3,Aaban,Aadarsh,4.604571
4,Aadam,Aadarsh,4.10018


In [6]:
# Row/column counts before and after mirroring (the mirrored table should have twice the rows).
for frame in (name_distances, full_distances):
    print(frame.shape)

(452207701, 3)
(904415402, 3)


In [7]:
# Add a column holding the n_max value of each row's name_2.
#
# Series.map performs the dictionary lookup for the whole column in one
# vectorised call, so no manual chunk loop is needed. Unlike label-based
# `.loc[i:j]` slicing, this does not depend on full_distances having a default
# 0..n-1 index. Names missing from name_to_max_count come back as NaN.
full_distances['name2_nmax'] = full_distances['name_2'].map(name_to_max_count)

# Display the first few rows to verify the new column
full_distances.head()

Mapping n_max values:   0%|          | 0/904415402 [00:00<?, ?it/s]

Unnamed: 0,name_1,name_2,distance,name2_nmax
0,Aaban,Aadam,4.312159,343.0
1,Aaban,Aadan,3.383403,136.0
2,Aadam,Aadan,3.474888,136.0
3,Aaban,Aadarsh,4.604571,246.0
4,Aadam,Aadarsh,4.10018,246.0


In [8]:
# Minimum n_max a name must reach to be kept as a candidate neighbour.
nmax_threshold = 500  # You can adjust this threshold as needed

# Lookup table: name -> True when that name's n_max is at or above the threshold
name_to_above_threshold = {
    name: (count >= nmax_threshold) for name, count in name_to_max_count.items()
}

# Display a few examples to verify
print("Examples of name_to_above_threshold:")
for name, is_above in list(name_to_above_threshold.items())[:5]:
    print(f"{name}: {is_above} (n_max: {name_to_max_count[name]})")

# Flag every row by looking its name_2 up in the table above.
# A single Series.map call handles the whole column. When every name is found,
# the result is a compact bool column; filling a new column slice by slice
# would leave it as object dtype instead. Names missing from the table come
# back as NaN.
full_distances['name2_threshold'] = full_distances['name_2'].map(name_to_above_threshold)

# Display the first few rows to verify the new columns
full_distances.head()

Examples of name_to_above_threshold:
Aaban: False (n_max: 127.0)
Aadam: False (n_max: 343.0)
Aadan: False (n_max: 136.0)
Aadarsh: False (n_max: 246.0)
Aaden: True (n_max: 5061.0)


Mapping n_max values:   0%|          | 0/904415402 [00:00<?, ?it/s]

Unnamed: 0,name_1,name_2,distance,name2_nmax,name2_threshold
0,Aaban,Aadam,4.312159,343.0,False
1,Aaban,Aadan,3.383403,136.0,False
2,Aadam,Aadan,3.474888,136.0,False
3,Aaban,Aadarsh,4.604571,246.0,False
4,Aadam,Aadarsh,4.10018,246.0,False


In [9]:
# Write the full mirrored distances dataframe to a CSV file.
# The path is defined once so the confirmation message always reports the
# file that was actually written. index=False omits the row index.
mirrored_csv_path = '../interim-data/recent-us-and-uk-names-distance-mirrored.csv'
full_distances.to_csv(mirrored_csv_path, index=False)

print(f"Saved mirrored distances to '{mirrored_csv_path}'")
print(f"Shape of exported dataframe: {full_distances.shape}")

Saved mirrored distances to '../interim-data/recent-us-and-uk-names-distance-mirrored.csv'
Shape of exported dataframe: (904415402, 5)


In [2]:
# Reuse full_distances when this kernel session already holds a non-empty
# DataFrame under that name; otherwise reload the mirrored table from the CSV
# written earlier in the notebook.
# globals().get(...) returns None when the name is undefined, so the check is
# explicit and no exception has to be raised and caught to steer control flow.
if isinstance(globals().get('full_distances'), pd.DataFrame) and not full_distances.empty:
    print(f"Using existing full_distances dataframe with shape: {full_distances.shape}")
else:
    print("Loading full_distances from CSV file...")
    full_distances = pd.read_csv('../interim-data/recent-us-and-uk-names-distance-mirrored.csv')
    print(f"Loaded full_distances with shape: {full_distances.shape}")

Loading full_distances from CSV file...
Loaded full_distances with shape: (904415402, 5)


In [3]:
print("starting filter...")

# Keep only the rows whose name2_threshold flag is set
keep_mask = full_distances["name2_threshold"]
filtered_distances = full_distances.loc[keep_mask]

# Report how many rows the filter kept and dropped
print(f"Original dataframe shape: {full_distances.shape}")
print(f"Filtered dataframe shape: {filtered_distances.shape}")
print(f"Removed {full_distances.shape[0] - filtered_distances.shape[0]} rows")

# Preview the filtered table
filtered_distances.head()

starting filter...
Original dataframe shape: (904415402, 5)
Filtered dataframe shape: (535299400, 5)
Removed 369116002 rows


Unnamed: 0,name_1,name_2,distance,name2_nmax,name2_threshold
6,Aaban,Aaden,4.267766,5061.0,True
7,Aadam,Aaden,4.085665,5061.0,True
8,Aadan,Aaden,3.719972,5061.0,True
9,Aadarsh,Aaden,4.344443,5061.0,True
36,Aaban,Aadhya,5.51541,2931.0,True


In [4]:
# For every name_1, keep its 10 nearest candidates (smallest distance first).
#
# Sorting once by (name_1, distance) and taking the first 10 rows of each group
# selects the same rows as a per-group nsmallest(10, "distance"), but without a
# Python-level call per group. It also avoids groupby.apply on a frame that
# still contains the grouping column, which pandas has deprecated.
# The multi-column sort is stable, so exact distance ties keep their original
# row order. Note that sort_values materialises a sorted copy of
# filtered_distances.
closest_rows = (
    filtered_distances
    .sort_values(["name_1", "distance"])
    .groupby("name_1", sort=False)
    .head(10)
    # Rank within each name_1 group (1 = closest); rows are already in distance order
    .assign(rank=lambda rows: rows.groupby("name_1").cumcount() + 1)
)

# Pivot the data so that each name_1 gets its own row with one column per rank
closest_df = closest_rows.pivot(
    index="name_1", columns="rank", values="name_2"
).reset_index()

# Rename columns to match the desired output: name, closest_1, ..., closest_10
closest_df.columns = ["name"] + [f"closest_{i}" for i in range(1, closest_df.shape[1])]

print(f"Created dataframe with {len(closest_df)} rows")
closest_df.head()

  0%|          | 0/30074 [00:00<?, ?it/s]

Created dataframe with 30074 rows


Unnamed: 0,name,closest_1,closest_2,closest_3,closest_4,closest_5,closest_6,closest_7,closest_8,closest_9,closest_10
0,Aaban,Griffith,Levon,Titus,Elimelech,Junius,Ajani,Alexius,Atha,Athan,Demetrus
1,Aadam,Janya,Gaia,Adem,Dunia,Genisis,Hawa,Verdia,Genesis,Genessis,Gennesis
2,Aadan,Thedore,Alvan,Efrem,Joachim,Mateus,Nataniel,Zacharias,Seferino,Cirilo,Isidoro
3,Aadarsh,Adarsh,Aadya,Skylor,Mackenzy,Siddhant,Primo,Benton,Stevon,Juston,Elige
4,Aaden,Aeden,Edan,Elene,Kasai,Aedan,Vermell,Ray,Leoma,Dawna,Ember


In [5]:
# Persist the nearest-neighbour table. index=True is the to_csv default, spelled
# out here: the row index is written as the file's first (unnamed) column.
proximity_csv_path = '../output-data/recent-us-and-uk-names-proximity-v2.csv'
closest_df.to_csv(proximity_csv_path, index=True)