# Meddling around with gendered names

**NOTES:** 

fasttext-wiki and w2v-googlenews have both upper- and lower-case names

glove-twitter and glove-wiki are lowercased everywhere

conceptnet has no names

In [5]:
import gc
import gensim
import gensim.downloader as download_api
import numpy as np
import pandas as pd
from sklearn import cluster
from IPython.core.display import display, HTML
# Notebook display tweaks: full-width container and room for wide frames.
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', 100)

# Load the name/gender table, add a lowercased copy of each name (several
# embedding vocabularies are lowercase-only) and encode sex as F -> 0, M -> 1.
df = pd.read_csv("us-likelihood-of-gender-by-name-in-2014.csv").assign(
    nameLower=lambda frame: frame['name'].str.lower(),
    sex=lambda frame: frame['sex'].map({'F': 0, 'M': 1}),
)
print("# names: ", df['name'].unique().size)

# names:  11316


In [6]:
# gensim models from https://github.com/RaRe-Technologies/gensim-data
# Each entry is [gensim-data model name, df column whose values are looked up].
# Entries that share a model are kept adjacent so the model is loaded only once.
model_names = [
    ['glove-twitter-50', "nameLower"],
    ['glove-twitter-200', "nameLower"],
    ['glove-wiki-gigaword-50', "nameLower"],
    ['glove-wiki-gigaword-300', "nameLower"],
    ['word2vec-google-news-300', "nameLower"],
    ['word2vec-google-news-300', "name"],
    ['fasttext-wiki-news-subwords-300', "nameLower"],
    ['fasttext-wiki-news-subwords-300', "name"],
]

# Fixed seed so the k-means assignment is reproducible across runs.
KMEANS_SEED = 0

model, loaded_model_name = None, None
for model_name, name_col in model_names:
    # Reuse the embeddings when consecutive entries name the same model.
    if model_name != loaded_model_name:
        # Release the previous model before loading the next one so two large
        # models are never held in memory at the same time.
        model = None
        gc.collect()
        model = download_api.load(model_name)
        loaded_model_name = model_name

    # Look up every distinct name once; collect vectors in a list instead of
    # writing each row through a boolean mask over the whole index.
    knownNames, vectors = [], []
    oovNames = []  # out of model vocab names
    for name in df[name_col].unique():
        try:
            vectors.append(model[name])
        except KeyError:
            oovNames.append(name)
        else:
            knownNames.append(name)
    embeddings = np.asarray(vectors, dtype=np.float64)

    # Split the in-vocabulary names into two clusters. `k_means` returns
    # (centroids, labels, inertia); only the labels are kept.
    # NOTE: cluster ids (0/1) are arbitrary and are NOT aligned with the
    # F=0 / M=1 encoding of `sex`, nor with each other across models.
    genderClusters = cluster.k_means(embeddings, 2, random_state=KMEANS_SEED)[1]

    result_col = model_name + "_" + name_col
    dn = pd.DataFrame({name_col: knownNames, result_col: genderClusters})

    # Merge results into final table. Every key in `dn` comes from `df`, so a
    # left join yields the same rows as an outer join (NaN for oov names)
    # while keeping the original row order. Dropping a pre-existing result
    # column first makes re-running this cell safe.
    df = df.drop(columns=[result_col], errors="ignore").merge(
        dn, on=name_col, how="left")

    # cleanup: never trust python GC with big objects
    embeddings, vectors, dn = None, None, None
    gc.collect()
    print('"{0}"'.format(model_name), " OOV words:", len(oovNames))

# Release the last model as well.
model = None
gc.collect()

"glove-twitter-50"  OOV words: 2573
"glove-twitter-200"  OOV words: 2573
"glove-wiki-gigaword-50"  OOV words: 3656
"glove-wiki-gigaword-300"  OOV words: 3656
"word2vec-google-news-300"  OOV words: 8795
"word2vec-google-news-300"  OOV words: 881
"fasttext-wiki-news-subwords-300"  OOV words: 7705
"fasttext-wiki-news-subwords-300"  OOV words: 2413


In [7]:
# Preview the merged table: one column per (model, name column) pair holding
# the k-means cluster id, NaN where the name was out of vocabulary.
df.head(10)

Unnamed: 0,sex,name,gender.prob,nameLower,glove-twitter-50_nameLower,glove-twitter-200_nameLower,glove-wiki-gigaword-50_nameLower,glove-wiki-gigaword-300_nameLower,word2vec-google-news-300_nameLower,word2vec-google-news-300_name,fasttext-wiki-news-subwords-300_nameLower,fasttext-wiki-news-subwords-300_name
0,0,Elaine,1.000000,elaine,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
1,0,Cathy,1.000000,cathy,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0,Heidi,1.000000,heidi,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
3,0,Vicki,1.000000,vicki,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
4,0,Melinda,1.000000,melinda,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
5,0,Roberta,1.000000,roberta,0.0,1.0,0.0,0.0,,1.0,,1.0
6,0,Charlene,1.000000,charlene,1.0,0.0,1.0,0.0,,1.0,1.0,1.0
7,0,Jeanne,1.000000,jeanne,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0
8,0,Miranda,1.000000,miranda,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0,Marcia,1.000000,marcia,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0


In [8]:
# Pairwise correlation between `sex`, `gender.prob` and each model's cluster
# assignment. Numeric columns are selected explicitly because recent pandas
# versions no longer drop the string columns (`name`, `nameLower`) silently.
# Cluster ids are arbitrary, so compare the magnitude of a correlation with
# `sex`, not its sign.
df.select_dtypes(include="number").corr()

Unnamed: 0,sex,gender.prob,glove-twitter-50_nameLower,glove-twitter-200_nameLower,glove-wiki-gigaword-50_nameLower,glove-wiki-gigaword-300_nameLower,word2vec-google-news-300_nameLower,word2vec-google-news-300_name,fasttext-wiki-news-subwords-300_nameLower,fasttext-wiki-news-subwords-300_name
sex,1.0,-0.06559,-0.145386,0.16491,-0.271286,-0.276963,-0.121442,-0.328894,0.141074,0.097341
gender.prob,-0.06559,1.0,0.098041,-0.100032,0.099066,0.102503,0.02207,0.054302,-0.034314,-0.054071
glove-twitter-50_nameLower,-0.145386,0.098041,1.0,-0.926019,0.537039,0.556711,0.32217,0.185854,-0.011744,-0.053715
glove-twitter-200_nameLower,0.16491,-0.100032,-0.926019,1.0,-0.566617,-0.590283,-0.331469,-0.184194,0.055156,0.082886
glove-wiki-gigaword-50_nameLower,-0.271286,0.099066,0.537039,-0.566617,1.0,0.909108,0.26015,0.192232,-0.38519,-0.293149
glove-wiki-gigaword-300_nameLower,-0.276963,0.102503,0.556711,-0.590283,0.909108,1.0,0.278989,0.194926,-0.354364,-0.27735
word2vec-google-news-300_nameLower,-0.121442,0.02207,0.32217,-0.331469,0.26015,0.278989,1.0,-0.172425,0.026901,-0.122391
word2vec-google-news-300_name,-0.328894,0.054302,0.185854,-0.184194,0.192232,0.194926,-0.172425,1.0,0.069137,0.266324
fasttext-wiki-news-subwords-300_nameLower,0.141074,-0.034314,-0.011744,0.055156,-0.38519,-0.354364,0.026901,0.069137,1.0,0.596941
fasttext-wiki-news-subwords-300_name,0.097341,-0.054071,-0.053715,0.082886,-0.293149,-0.27735,-0.122391,0.266324,0.596941,1.0
