In [1]:
import os

from nltk import ngrams

from collections import Counter, defaultdict
from tqdm import tqdm

import pandas as pd
import numpy as np

### Import surnames and get weights

In [2]:
surnames = pd.read_csv("surnames2010.csv")
print(surnames.shape)
surnames.head()

(167613, 6)


Unnamed: 0,surname,p_whi,p_bla,p_his,p_asi,p_oth
0,SMITH,0.709,0.2311,0.024,0.005,0.0308
1,JOHNSON,0.5897,0.3463,0.0236,0.0054,0.035
2,WILLIAMS,0.4575,0.4768,0.0249,0.0046,0.0363
3,BROWN,0.5795,0.356,0.0252,0.0051,0.0342
4,JONES,0.5519,0.3848,0.0229,0.0044,0.0361


In [3]:
surnames_counts = pd.read_csv("Names_2010Census.csv")
surnames_counts['weight'] = surnames_counts['prop100k'] / 100000
surnames_counts = surnames_counts[['name', 'weight']]
surnames_counts = surnames_counts.rename(columns={"name": "surname", "weight": "weight"})
print(surnames_counts.shape)
surnames_counts.head()

(162254, 2)


Unnamed: 0,surname,weight
0,SMITH,0.008282
1,JOHNSON,0.006552
2,WILLIAMS,0.00551
3,BROWN,0.004872
4,JONES,0.004832


In [4]:
surnames = pd.merge(surnames, surnames_counts, on=['surname'])
surnames.head()

Unnamed: 0,surname,p_whi,p_bla,p_his,p_asi,p_oth,weight
0,SMITH,0.709,0.2311,0.024,0.005,0.0308,0.008282
1,JOHNSON,0.5897,0.3463,0.0236,0.0054,0.035,0.006552
2,WILLIAMS,0.4575,0.4768,0.0249,0.0046,0.0363,0.00551
3,BROWN,0.5795,0.356,0.0252,0.0051,0.0342,0.004872
4,JONES,0.5519,0.3848,0.0229,0.0044,0.0361,0.004832


### Function to count trygram occurrences

In [5]:
def get_counts(surnames):
    whi_model = defaultdict(lambda: defaultdict(lambda: 0))
    bla_model = defaultdict(lambda: defaultdict(lambda: 0))
    his_model = defaultdict(lambda: defaultdict(lambda: 0))
    asi_model = defaultdict(lambda: defaultdict(lambda: 0))
    oth_model = defaultdict(lambda: defaultdict(lambda: 0))

    for ii in tqdm(range(0, surnames.shape[0])):
        try:
            surname = list(surnames['surname'][ii].upper())
            surname_ngrams = ngrams(surnames['surname'][ii], n = 3, 
                                     pad_right = True, pad_left = True, 
                                     left_pad_symbol='<', right_pad_symbol='>')
            for l1, l2, l3 in surname_ngrams:
                whi_model[(l1, l2)][l3] += surnames['p_whi'][ii]*surnames['weight'][ii]
                bla_model[(l1, l2)][l3] += surnames['p_bla'][ii]*surnames['weight'][ii]
                his_model[(l1, l2)][l3] += surnames['p_his'][ii]*surnames['weight'][ii]
                asi_model[(l1, l2)][l3] += surnames['p_asi'][ii]*surnames['weight'][ii]
                oth_model[(l1, l2)][l3] += surnames['p_oth'][ii]*surnames['weight'][ii]
        except:
            print("Error on case {}".format(ii))
            
    return whi_model, bla_model, his_model, asi_model, oth_model

In [6]:
whi_model, bla_model, his_model, asi_model, oth_model = get_counts(surnames)

  3%|▎         | 5062/162253 [00:04<02:23, 1098.36it/s]

Error on case 4909


100%|██████████| 162253/162253 [02:55<00:00, 922.91it/s] 


### Convert counts to conditional probabilities 

In [7]:
for l1_l2 in whi_model:
    whi_total_count = float(sum(whi_model[l1_l2].values()))
    bla_total_count = float(sum(bla_model[l1_l2].values()))
    his_total_count = float(sum(his_model[l1_l2].values()))
    asi_total_count = float(sum(asi_model[l1_l2].values()))
    oth_total_count = float(sum(oth_model[l1_l2].values()))
    
    for l3 in whi_model[l1_l2]:
        whi_model[l1_l2][l3] /= whi_total_count
        bla_model[l1_l2][l3] /= bla_total_count
        his_model[l1_l2][l3] /= his_total_count
        asi_model[l1_l2][l3] /= asi_total_count
        oth_model[l1_l2][l3] /= oth_total_count

  bla_model[l1_l2][l3] /= bla_total_count
  asi_model[l1_l2][l3] /= asi_total_count
  oth_model[l1_l2][l3] /= oth_total_count
  whi_model[l1_l2][l3] /= whi_total_count


### Function to get P(R|S) for a single name and race

In [8]:
def get_name_prob(surname, model, return_probs = False):
    probs = defaultdict()
    surname = surname.upper()
    p_surname = 1
    surname_ngrams = ngrams(surname, n = 3, 
                                 pad_right = True, pad_left = True, 
                                 left_pad_symbol='<', right_pad_symbol='>')
    for l1, l2, l3 in surname_ngrams:
        cond_prob = model[(l1, l2)][l3]
        probs[(l1, l2, l3)] = cond_prob
        p_surname *= cond_prob
    
    if return_probs:
        return p_surname, probs
    else:
        return p_surname

In [10]:
get_name_prob("Smith", whi_model, return_probs = True)

(0.0006297469929376308,
 defaultdict(None,
             {('<', '<', 'S'): 0.10301164973888023,
              ('<', 'S', 'M'): 0.11203091254685146,
              ('S', 'M', 'I'): 0.7781645677040535,
              ('M', 'I', 'T'): 0.3960144375590823,
              ('I', 'T', 'H'): 0.3663670976402352,
              ('T', 'H', '>'): 0.4833291080316953,
              ('H', '>', '>'): 1.0}))

In [11]:
get_name_prob("Smith", bla_model, return_probs = True)

(0.0020614759943320266,
 defaultdict(None,
             {('<', '<', 'S'): 0.08376543645490296,
              ('<', 'S', 'M'): 0.22566689407628474,
              ('S', 'M', 'I'): 0.8963251034236016,
              ('M', 'I', 'T'): 0.5706655660271983,
              ('I', 'T', 'H'): 0.4802278038253288,
              ('T', 'H', '>'): 0.4439673634165719,
              ('H', '>', '>'): 1.0}))

In [95]:
models = {
    "whi": whi_model,
    "bla": bla_model,
    "his": his_model,
    "asi": asi_model,
    "oth": oth_model
}

### Get full distribution over race groups

In [96]:
def get_race_probs(surname, models):
    results = defaultdict()
    for race, model in models.items():
        results[race] = get_name_prob(surname, model)
    total = sum(results.values())
    for race in results:
        results[race] /= total
    return results

In [97]:
print(surnames.head())
get_race_probs("smith", models)

    surname   p_whi   p_bla   p_his   p_asi   p_oth    weight
0     SMITH  0.7090  0.2311  0.0240  0.0050  0.0308  0.008282
1   JOHNSON  0.5897  0.3463  0.0236  0.0054  0.0350  0.006552
2  WILLIAMS  0.4575  0.4768  0.0249  0.0046  0.0363  0.005510
3     BROWN  0.5795  0.3560  0.0252  0.0051  0.0342  0.004872
4     JONES  0.5519  0.3848  0.0229  0.0044  0.0361  0.004832


defaultdict(None,
            {'whi': 0.17779520314835587,
             'bla': 0.5820123753000933,
             'his': 0.0014056554570174005,
             'asi': 0.0008225314773946631,
             'oth': 0.2379642346171387})

In [98]:
get_race_probs("johnson", models)

defaultdict(None,
            {'whi': 0.07758016566850848,
             'bla': 0.7179600975892731,
             'his': 0.0010928060188440149,
             'asi': 0.0005604610304707167,
             'oth': 0.20280646969290372})

In [99]:
get_name_prob("johnson", bla_model, return_probs = True)

(0.0067298366923897535,
 defaultdict(None,
             {('<', '<', 'J'): 0.06834777322076013,
              ('<', 'J', 'O'): 0.625683436532375,
              ('J', 'O', 'H'): 0.48490907287117685,
              ('O', 'H', 'N'): 0.9326873905538675,
              ('H', 'N', 'S'): 0.9372152664113177,
              ('N', 'S', 'O'): 0.4883239499946842,
              ('S', 'O', 'N'): 0.9699644308808777,
              ('O', 'N', '>'): 0.7838372426315613,
              ('N', '>', '>'): 1.0}))

In [100]:
get_name_prob("johnson", whi_model, return_probs = True)

(0.0007272017585248665,
 defaultdict(None,
             {('<', '<', 'J'): 0.02624867441121226,
              ('<', 'J', 'O'): 0.5710038122841791,
              ('J', 'O', 'H'): 0.5187862857295419,
              ('O', 'H', 'N'): 0.7095044680298122,
              ('H', 'N', 'S'): 0.6607486559777127,
              ('N', 'S', 'O'): 0.31101019827161863,
              ('S', 'O', 'N'): 0.9027292881134743,
              ('O', 'N', '>'): 0.7105529775243876,
              ('N', '>', '>'): 1.0}))

### Save out the models

In [101]:
# Save out white model
df = pd.DataFrame.from_dict(whi_model).transpose()
df.to_csv("whi_cond_probs.csv")

In [102]:
# Save out black model
df = pd.DataFrame.from_dict(bla_model).transpose()
df.to_csv("bla_cond_probs.csv")

In [103]:
df = pd.DataFrame.from_dict(his_model).transpose()
df.to_csv("his_cond_probs.csv")

In [104]:
df = pd.DataFrame.from_dict(asi_model).transpose()
df.to_csv("asi_cond_probs.csv")

In [105]:
df = pd.DataFrame.from_dict(oth_model).transpose()
df.to_csv("oth_cond_probs.csv")