In [1]:
import numpy as np
import pandas as pd

# Analysis

## 0. Loading & Accessing Data

In [2]:
# Location of the preprocessed dataset (one row per name/job pair, with its embedding)
EMBEDDINGS_PATH = './Data/final_processed_with_embeddings.pkl'

# NOTE(review): unpickling can run arbitrary code — only load this file from a trusted source.
df = pd.read_pickle(EMBEDDINGS_PATH)

# Preview five random rows (no random_state is set, so the rows shown change between runs)
df.sample(5)

Unnamed: 0,name,race,gender,text,length,embedding
110,Tia,asian,F,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
3725,Raquel,hispanic,F,dentist,1,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
2744,Marlon,black,M,firefighter,1,"[0.007844897, -0.00611186, -0.03914195, -0.017..."
5919,Edward,white,M,software engineer,2,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."
5530,Marvin,white,M,electrician,1,"[0.011973024, -0.06220785, 0.059820715, 0.0343..."


In [3]:
# Boolean row masks, one per demographic attribute, for quick subsetting

gender_col = df['gender']
race_col = df['race']

male = gender_col.eq('M')
female = gender_col.eq('F')
asian = race_col.eq('asian')
black = race_col.eq('black')
hispanic = race_col.eq('hispanic')
white = race_col.eq('white')
# NOTE(review): presumably 'A' never occurs as a gender, which makes this an
# all-True mask selecting every row — confirm against the data.
whole_dataset = gender_col.ne('A')

# Masks combine with `&` and are applied by indexing the frame, e.g.:
# df[male]
# df[female & hispanic]

In [4]:
# Pair every subgroup label with its mask, then split into the two parallel
# lists (`masks`, `labels`) that the analysis cells iterate over with zip().

subgroups = [
    ("whole_dataset", whole_dataset),
    ("male", male),
    ("female", female),
    ("asian", asian),
    ("black", black),
    ("hispanic", hispanic),
    ("white", white),
    ("male & asian", male & asian),
    ("male & black", male & black),
    ("male & hispanic", male & hispanic),
    ("male & white", male & white),
    ("female & asian", female & asian),
    ("female & black", female & black),
    ("female & hispanic", female & hispanic),
    ("female & white", female & white),
]
masks = [subgroup_mask for _, subgroup_mask in subgroups]
labels = [subgroup_label for subgroup_label, _ in subgroups]

In [5]:
# The inner product equals cosine similarity only for unit-length vectors;
# the embeddings from USE are stated to be already normalized.
def similarity(embedding1, embedding2):
    """Return the inner product of two embeddings (or arrays of embeddings).

    No normalization is applied here, so the result is a cosine similarity
    only when both inputs are already unit-length.
    """
    inner_product = np.inner(embedding1, embedding2)
    return inner_product


# A dissimilarity measure, potentially useful for clustering analysis
def distance(embedding1, embedding2):
    """Return one minus the inner product of the two embeddings.

    The inputs are not normalized here, so this is a cosine distance only
    when both embeddings are already unit-length.
    """
    inner_product = np.inner(embedding1, embedding2)
    return 1.0 - inner_product

## 1. Spread of Multidimensional Data

The spread of multidimensional data can be summarized by the trace (total variance) or the determinant (generalized variance) of its covariance matrix; this notebook uses the trace. 
The embeddings produced by Google's Universal Sentence Encoder have 512 dimensions, so the covariance matrix has dimension 512 x 512.

In [6]:
def getSpread(df):
    """Return the total variance (trace of the covariance matrix) of the embeddings.

    The trace of a covariance matrix is the sum of its diagonal, i.e. the sum
    of the per-dimension sample variances. Computing those variances directly
    gives the same number without materializing the full
    (n_dims x n_dims) covariance matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'embedding' column whose entries are equal-length vectors.

    Returns
    -------
    numpy.float64
        Sum over embedding dimensions of the sample variance (ddof=1, the same
        normalization np.cov uses by default). NaN when `df` has a single row.

    Raises
    ------
    ValueError
        From np.stack when `df` has no rows.
    """
    # One row per record, one column per embedding dimension
    embeddings = np.stack(df['embedding'].values)
    # Accumulate in float64 so lower-precision embeddings lose no accuracy
    # (np.cov also promotes its input to at least float64).
    per_dimension_variance = embeddings.var(axis=0, ddof=1, dtype=np.float64)
    return per_dimension_variance.sum()

In [7]:
# Spread of each subgroup's embeddings, kept as (spread, label) pairs.
# The former `score := (...)` assignment expression was dropped: it bound the
# whole tuple to a name that was never read before being overwritten below.
spreads = [(getSpread(df[mask]), label) for mask, label in zip(masks, labels)]

# Report the subgroups from largest to smallest spread
for score, label in sorted(spreads, key=lambda pair: pair[0], reverse=True):
    print("{:0.4f}".format(score), label)

0.6433 whole_dataset
0.6378 male
0.6311 white
0.6300 hispanic
0.6278 black
0.6243 female
0.6242 male & black
0.6201 male & hispanic
0.6182 male & white
0.6093 female & hispanic
0.6063 female & white
0.5858 male & asian
0.5794 asian
0.5777 female & black
0.5502 female & asian


In [8]:
def getMostFrequentJobs(df, n):
    """Return the `n` most common values of the 'text' column, most frequent first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'text' column.
    n : int
        Maximum number of values to return; fewer come back when `df` has
        fewer distinct 'text' values.

    Returns
    -------
    numpy.ndarray
        The selected 'text' values, ordered by descending row count.
    """
    rows_per_job = df.groupby('text').size()
    ranked_jobs = rows_per_job.sort_values(ascending=False)
    return ranked_jobs.iloc[:n].index.values


def getDataForJobs(df, jobs):
    """Return the rows of `df` whose 'text' value is one of `jobs`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'text' column.
    jobs : list-like
        The 'text' values to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with their original index.
    """
    is_requested_job = df['text'].isin(jobs)
    return df.loc[is_requested_job]


def getDataForMostFrequentJobs(df, n):
    """Return the rows of `df` that belong to its `n` most frequent jobs.

    The ranking is computed on `df` itself, so passing a subgroup ranks jobs
    within that subgroup only.
    """
    top_jobs = getMostFrequentJobs(df, n)
    return getDataForJobs(df, top_jobs)

In [9]:
# Sanity check: ten random rows drawn from the asian subgroup's ten most frequent jobs
asian_top_jobs = getDataForMostFrequentJobs(df[asian], 10)
asian_top_jobs.sample(10)

Unnamed: 0,name,race,gender,text,length,embedding
277,Hong,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
599,Sonal,asian,F,accountant,1,"[-0.052521203, -0.07172108, -0.012731302, -0.0..."
680,Yong,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
20,Bibi,asian,F,chef,1,"[-0.04405946, -0.026716324, 0.035623394, 0.016..."
564,Yu,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
212,Nikita,asian,F,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
526,Parul,asian,F,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
1336,Huy,asian,M,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
1227,Hoang,asian,M,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
813,Sandeep,asian,M,software engineer,2,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."


In [10]:
# Number of most frequent jobs kept per subgroup before measuring spread
top_n_jobs = 10

# Spread of each subgroup's embeddings, restricted to that subgroup's own most
# frequent jobs, kept as (spread, label) pairs.
# The former `score := (...)` assignment expression was dropped: it bound the
# whole tuple to a name that was never read before being overwritten below.
top10_spreads = [
    (getSpread(getDataForMostFrequentJobs(df[mask], top_n_jobs)), label)
    for mask, label in zip(masks, labels)
]

# Sort and print, largest spread first
for score, label in sorted(top10_spreads, key=lambda pair: pair[0], reverse=True):
    print("{:0.4f}".format(score), label)

0.5847 whole_dataset
0.5832 hispanic
0.5813 male & hispanic
0.5768 female & hispanic
0.5735 female
0.5645 male
0.5598 male & black
0.5546 black
0.5473 male & white
0.5427 female & white
0.5366 white
0.5180 male & asian
0.5143 female & black
0.4887 asian
0.4433 female & asian
