In [13]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

names = pd.read_csv('../output-data/recent-us-and-uk-names.csv')
names.head()

Unnamed: 0,name,sex,year,n,n_us,percent_per_year,connotation_1,connotation_2,connotation_3,connotation_4,connotation_5
0,Aaban,M,2022,256.0,127.0,0.000402,dignity,nobility,prosperity,leadership,strength
1,Aadam,M,1995,2638.0,343.0,0.000315,first human,earth,life,creation,origin
2,Aadan,M,2003,139.0,136.0,0.000253,spiritual,strong,traditional,wise,noble
3,Aadarsh,M,2001,266.0,246.0,0.00036,ideal,perfect,model,exemplary,principle
4,Aaden,M,2020,5109.0,5061.0,0.003249,fire,warmth,light,passion,strength


In [14]:
names.shape

(30075, 11)

In [15]:
# Load the word vectors from the JSON file
with open('../word_vectors.json', 'r') as f:
    word_vectors = json.load(f)

# Create a DataFrame from the word vectors
# First, create a list of records where each record is a word and its vector components
records = []
for word, vector in word_vectors.items():
    # Create a dictionary with the word and each dimension of its vector
    record = {'word': word}
    for i, value in enumerate(vector):
        record[f'd{i+1}'] = value
    records.append(record)

# Convert the list of records to a DataFrame
word_vectors_df = pd.DataFrame(records)

# Set the word column as the index
word_vectors_df.set_index('word', inplace=True)

# Display the first few rows to verify
word_vectors_df.head()

Unnamed: 0_level_0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dignity,-0.32382,-0.024203,-0.014129,0.364,-0.21679,-0.044356,0.2341,0.19194,-0.053919,2.6278,...,0.064851,-0.67542,-0.27378,-0.33281,-0.21162,0.52054,0.34484,0.29199,-0.28942,0.10225
radiance,0.078989,0.34494,0.24588,0.32392,-0.12872,-0.098345,0.38827,0.41738,-0.34803,-0.070135,...,-0.18911,-0.017001,0.20106,0.11273,-0.22736,0.17021,-0.074105,0.37581,-0.2968,0.581
spiritual,0.31617,0.18871,0.40116,0.61069,0.16471,0.34084,0.078078,0.5463,0.15994,2.9505,...,0.025738,-0.26824,-0.51527,0.21614,-0.38723,0.26537,0.16933,0.47888,-0.042282,-0.55875
ideal,0.32784,0.50207,-0.14487,-0.1316,0.57249,-0.12798,0.12616,0.074294,0.1124,1.557,...,0.071131,-0.081611,0.17925,-0.19036,0.33111,0.42373,0.21261,-0.33616,-0.064959,-0.14182
fire,-0.1392,0.10877,0.42009,-0.015987,-0.53519,-0.34612,0.04269,0.1412,-0.368,2.5701,...,-0.34045,0.013386,0.26044,0.2166,-0.70998,0.22047,-0.586,0.030956,-0.24704,0.056905


In [13]:
# Create a new dataframe that merges names that appear as both M and F -- This should no longer be necessary but we can keep for robustness to unknown unknown
# Group by name, sum n and percent_per_year, and keep connotations
filtered_names = names.groupby('name').agg({
    'n': ['sum'],
    'n_us': 'max',
    'percent_per_year': 'sum',
    'connotation_1': 'first',
    'connotation_2': 'first',
    'connotation_3': 'first',
    'connotation_4': 'first',
    'connotation_5': 'first'
}).reset_index()

# Flatten the multi-level columns
filtered_names.columns = ['name', 'n', 'n_max', 'percent_per_year', 'connotation_1', 
                         'connotation_2', 'connotation_3', 'connotation_4', 'connotation_5']

# Filter out names with n < 100
filtered_names = filtered_names[filtered_names['n'] >= 100]
filtered_names.reset_index()

# Display the first few rows to verify
filtered_names.head()

Unnamed: 0,name,n,n_max,percent_per_year,connotation_1,connotation_2,connotation_3,connotation_4,connotation_5
0,Aaban,256.0,127.0,0.000402,dignity,nobility,prosperity,leadership,strength
1,Aadam,2638.0,343.0,0.000315,first human,earth,life,creation,origin
2,Aadan,139.0,136.0,0.000253,spiritual,strong,traditional,wise,noble
3,Aadarsh,266.0,246.0,0.00036,ideal,perfect,model,exemplary,principle
4,Aaden,5109.0,5061.0,0.003249,fire,warmth,light,passion,strength


In [14]:
filtered_names.to_csv('../interim-data/filtered-names.csv')

In [31]:
# Create a new dataframe to store the name positions
name_position = filtered_names.copy()

# Function to calculate the average position of a name based on its connotations
def calculate_name_position(row):
    # Get the connotations for this name
    connotations = [row[f'connotation_{i}'] for i in range(1, 6) if pd.notna(row[f'connotation_{i}'])]
    
    # Filter out connotations that don't exist in word_vectors_df
    valid_connotations = [c for c in connotations if c in word_vectors_df.index]
    
    # If no valid connotations, return NaN for all dimensions
    if not valid_connotations:
        return pd.Series([np.nan] * 300, index=[f'd{i+1}' for i in range(300)])
    
    # Calculate the average position across all valid connotations
    avg_position = word_vectors_df.loc[valid_connotations].mean()
    
    return avg_position

# Apply the function to each row in the names dataframe
name_vectors = filtered_names.apply(calculate_name_position, axis=1)

# Concatenate the original names dataframe with the calculated positions
name_position = pd.concat([filtered_names, name_vectors], axis=1)

# Display the first few rows to verify
name_position.head()

Unnamed: 0,name,n,n_max,percent_per_year,connotation_1,connotation_2,connotation_3,connotation_4,connotation_5,common,...,d291,d292,d293,d294,d295,d296,d297,d298,d299,d300
0,Aaban,256.0,127.0,0.000402,dignity,nobility,prosperity,leadership,strength,False,...,-0.108071,-0.00157,-0.105568,-0.165162,-0.219768,0.508232,0.26499,0.297585,-0.072312,-0.162678
1,Aadam,2638.0,343.0,0.000315,first human,earth,life,creation,origin,False,...,-0.305493,-0.01838,-0.109842,-0.015208,-0.088794,0.132612,-0.092879,0.277908,-0.182016,-0.206481
2,Aadan,139.0,136.0,0.000253,spiritual,strong,traditional,wise,noble,False,...,-0.342989,-0.120146,-0.371107,0.024761,-0.1202,0.406658,0.361648,0.048312,0.058716,-0.195326
3,Aadarsh,266.0,246.0,0.00036,ideal,perfect,model,exemplary,principle,False,...,-0.020291,0.062312,0.239624,0.06727,0.191837,-0.041893,0.105918,-0.032114,-0.001908,-0.046741
4,Aaden,5109.0,5061.0,0.003249,fire,warmth,light,passion,strength,True,...,-0.235671,0.136806,-0.064909,-0.145342,-0.402592,0.184426,0.10004,0.068847,-0.15472,0.228309


In [32]:
# Save the filtered dataframe to CSV
output_path = '../interim-data/recent-us-and-uk-names-300dim.csv'
name_position.to_csv(output_path, index=False)

In [33]:
len(name_position)

30074

In [38]:
# Create a new dataframe to store pairwise distances between names
name_distances = pd.DataFrame(columns=['name_1', 'name_2', 'distance'])

# Extract name vectors (dimensions d1-d300)
name_vectors = name_position.iloc[:, name_position.columns.get_loc('d1'):name_position.columns.get_loc('d300')+1].values  # Columns d1-d300
names = name_position['name'].values

# Calculate distances efficiently using vectorized operations
print(f"Calculating distances for {len(names)} names...")

# Process in batches to manage memory usage
batch_size = 100
total_names = len(names)
distance_data = []

for i in tqdm(range(0, total_names, batch_size), desc="Processing batches"):
    # Get the current batch
    batch_end = min(i + batch_size, total_names)
    batch_names = names[i:batch_end]
    batch_vectors = name_vectors[i:batch_end]
    
    # Calculate distances between this batch and all subsequent names
    for j in range(i, total_names):
        # Skip same name comparisons and already processed pairs
        if j < batch_end:
            # Only process within batch where j > i to avoid duplicates
            for k in range(i, j):
                name1 = names[k]
                name2 = names[j]
                # Calculate Euclidean distance
                distance = np.linalg.norm(name_vectors[k] - name_vectors[j])
                distance_data.append({
                    'name_1': name1,
                    'name_2': name2,
                    'distance': distance
                })
        else:
            # Process between current batch and name j
            for k in range(i, batch_end):
                name1 = names[k]
                name2 = names[j]
                # Calculate Euclidean distance
                distance = np.linalg.norm(name_vectors[k] - name_vectors[j])
                distance_data.append({
                    'name_1': name1,
                    'name_2': name2,
                    'distance': distance
                })
    
    # Update the dataframe periodically to save memory
    if len(distance_data) >= 10000 or batch_end == total_names:
        name_distances = pd.concat([name_distances, pd.DataFrame(distance_data)], ignore_index=True)
        # print(f"Processed {len(name_distances)} name pairs so far...")
        distance_data = []

print(f"Completed calculating distances for {len(name_distances)} name pairs.")


Calculating distances for 30074 names...


  name_distances = pd.concat([name_distances, pd.DataFrame(distance_data)], ignore_index=True)
Processing batches: 100%|██████████| 301/301 [44:39<00:00,  8.90s/it]   

Completed calculating distances for 452207701 name pairs.





In [39]:
name_distances.head()

Unnamed: 0,name_1,name_2,distance
0,Aaban,Aadam,4.312159
1,Aaban,Aadan,3.383403
2,Aadam,Aadan,3.474888
3,Aaban,Aadarsh,4.604571
4,Aadam,Aadarsh,4.10018


In [41]:
name_distances.to_csv('../interim-data/recent-us-and-uk-names-distance.csv')

In [26]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm

name_distances = pd.read_csv('../interim-data/recent-us-and-uk-names-distance.csv')

# Find entries where name_1 is the same as name_2
same_name_entries = name_distances[name_distances['name_1'] == name_distances['name_2']]

# Display the count of such entries
print(f"Number of entries where name_1 equals name_2: {len(same_name_entries)}")

# Display the first few entries if any exist
if not same_name_entries.empty:
    print("\nSample of entries where name_1 equals name_2:")
    print(same_name_entries.head())
else:
    print("\nNo entries found where name_1 equals name_2.")


Number of entries where name_1 equals name_2: 0

No entries found where name_1 equals name_2.


In [27]:
name_distances = name_distances[['name_1', 'name_2', 'distance']]

In [28]:
name_distances

Unnamed: 0,name_1,name_2,distance
0,Aaban,Aadam,4.312159
1,Aaban,Aadan,3.383403
2,Aadam,Aadan,3.474888
3,Aaban,Aadarsh,4.604571
4,Aadam,Aadarsh,4.100180
...,...,...,...
452207696,willow-grace,wynter-rose,2.626772
452207697,willow-mae,wynter-rose,4.055928
452207698,willow-rae,wynter-rose,3.719902
452207699,willow-rose,wynter-rose,2.153839


In [29]:
# Create a copy of name_distances with name_1 and name_2 swapped
swapped_distances = name_distances.copy()
swapped_distances.columns = ['name_2', 'name_1', 'distance']  # Swap column names

# Concatenate original and swapped dataframes to create bidirectional pairs
name_dist_doubled = pd.concat([name_distances, swapped_distances], ignore_index=True)

# Display the last few rows to verify
name_dist_doubled.tail(100)

Unnamed: 0,name_1,name_2,distance
904415302,winter-rose,tia-rose,2.613055
904415303,winter-rose,tiarna,3.613580
904415304,winter-rose,tiggy,4.296437
904415305,winter-rose,tilly-mae,
904415306,winter-rose,tilly-may,4.278597
...,...,...,...
904415397,wynter-rose,willow-grace,2.626772
904415398,wynter-rose,willow-mae,4.055928
904415399,wynter-rose,willow-rae,3.719902
904415400,wynter-rose,willow-rose,2.153839


In [30]:
name_dist_doubled.shape

(904415402, 3)

In [31]:
import pandas as pd

n_closest = 10
results = []

# Group by name_1
grouped = name_dist_doubled.groupby('name_1')

for name, group_df in grouped:
    # Sort the group by distance
    group_df = group_df.sort_values('distance')
    # Get the top n_closest
    closest_names = group_df['name_2'].head(n_closest).tolist()
    # Pad if needed
    if len(closest_names) < n_closest:
        closest_names += [None] * (n_closest - len(closest_names))
    # Build the row
    results.append([name] + closest_names)
    print(name)

# Create DataFrame of results
columns = ['name'] + [f'closest{i+1}' for i in range(n_closest)]
res_df = pd.DataFrame(results, columns=columns)


Aaban
Aadam
Aadan
Aadarsh
Aaden
Aadhav
Aadhira
Aadhiran
Aadhvik
Aadhya
Aadi
Aadil
Aadin
Aadit
Aaditya
Aadvik
Aadvika
Aadya
Aadyn
Aafiya
Aafiyah
Aahan
Aahana
Aahil
Aaiden
Aaila
Aailyah
Aaima
Aaira
Aairah
Aaisha
Aaishah
Aaiza
Aakash
Aakifah
Aalani
Aalaya
Aalayah
Aaleah
Aaleyah
Aalia
Aaliah
Aalijah
Aaliya
Aaliyah
Aaliyha
Aalliyah
Aalyah
Aalyiah
Aamani
Aamilah
Aamina
Aaminah
Aamir
Aamira
Aamirah
Aamiyah
Aamna
Aanav
Aanaya
Aanika
Aaniya
Aaniyah
Aanshi
Aanvi
Aanya
Aara
Aaradhya
Aaralyn
Aaralynn
Aaran
Aarav
Aaren
Aari
Aaria
Aariah
Aarian
Aariana
Aarianna
Aarib
Aaric
Aariel
Aarik
Aarika
Aarin
Aarion
Aariona
Aarish
Aariv
Aariya
Aariyah
Aariz
Aarna
Aarnav
Aaro
Aarohi
Aaron
Aarron
Aarti
Aarush
Aarushi
Aarvi
Aarya
Aaryan
Aaryanna
Aaryav
Aaryn
Aasha
Aashi
Aashir
Aashna
Aashrith
Aashritha
Aashvi
Aasia
Aasim
Aasir
Aasiya
Aasiyah
Aastha
Aava
Aavya
Aavyan
Aayah
Aayan
Aayansh
Aayat
Aayden
Aayla
Aayra
Aayush
Aayushi
Ab
Abagael
Abagail
Abagale
Abagayle
Abaigeal
Abbagail
Abbas
Abbe
Abbegail
Abbey
Abbi
Abbi

In [32]:
closest_names_df = res_df

closest_names_df.to_csv('../output-data/recent-us-and-uk-names-proximity.csv')
closest_names_df.to_csv('../output-data/recent-us-names-proximity.csv') # Overwriting this as well just to avoid code further down breaking

In [33]:
res_df

Unnamed: 0,name,closest1,closest2,closest3,closest4,closest5,closest6,closest7,closest8,closest9,closest10
0,Aaban,Fuquan,Yazen,Saud,Titus,Tigran,Levon,Griffith,Llewelyn,Macsen,Elimelech
1,Aadam,Janya,Gaia,Adem,Dunia,Srishti,Genese,Genisis,Hawa,Verdia,Genesis
2,Aadan,Thedore,Nataniel,Nethaniel,Efrem,Matteus,Nathanuel,Mattheus,Mateus,Alvan,Zacharias
3,Aadarsh,Adarsh,Kameel,Aadya,Akhila,Skylor,Tameem,Keval,Mackenzy,Miyoshi,Siddhant
4,Aaden,Aeden,Azar,Shola,Keahi,Anala,Edan,Elene,Kasai,Aedan,Vermell
...,...,...,...,...,...,...,...,...,...,...,...
30069,willow-mae,Nelma,Vertie,Wava,Hazelyn,lily-may,Shirly,Shayleigh,Hazelee,Orva,Olivea
30070,willow-rae,Teah,Chany,Mayeli,Lecia,Luane,Tinna,Maylee,Cera,Lahna,Wrenly
30071,willow-rose,Rosio,ella-rose,mia-rose,Jessamine,Rosalva,freya-rose,Yasmine,Yasemin,Yasmina,Yasmin
30072,winter-rose,Orchid,Sheccid,Zarrah,ebony-rose,Shakara,Selina,Ebone,Velora,Katai,Tharon
