In [1]:
import numpy as np
import pandas as pd

# Analysis

## 0. Loading & Accessing Data

In [20]:
df = pd.read_pickle('./Data/final_processed_with_embeddings.pkl')

In [24]:
# Subset data by race and gender
black = df.query('race == "black"')
asian = df.query('race == "asian"')
hispanic = df.query('race == "hispanic"')
white = df.query('race == "white"')

male = df.query('gender == "M"')
female = df.query('gender == "F"')

## 1. Spread of Multidimensional Data

Spread of multidimensional data can be represented as the trace / determinant of the covariance matrix. 
The embeddings via Google's Universal Sentence Encoder has 512 dimensions, so we expect a covariance matrix of dimension 512 x 512.

In [100]:
def getSpread(embeddings):
    return np.trace(np.cov(embeddings))

In [109]:
def bootstrapCI(data, num_bootstrap=1000, ci=95):
    bootstrap_spreads = []
    
    for _ in range(num_bootstrap):
        
        sample = data.sample(frac=1, replace=True) # Resampling with replacement
        # embeddings = np.stack(sample['embedding'].values) # Calculate spread for the bootstrap sample
        spread = getSpread(np.stack(sample['embedding'].values))
        bootstrap_spreads.append(spread)
    
    lower_percentile = (100 - ci) / 2
    upper_percentile = 100 - lower_percentile
    confidence_interval = np.percentile(bootstrap_spreads, [lower_percentile, upper_percentile])
    
    return confidence_interval

In [111]:
black_ci = bootstrapCI(black, num_bootstrap = 1000, ci = 95)
asian_ci = bootstrapCI(asian, num_bootstrap = 1000, ci = 95)
hispanic_ci = bootstrapCI(hispanic, num_bootstrap = 1000, ci = 95)
white_ci = bootstrapCI(white, num_bootstrap = 1000, ci = 95)

male_ci = bootstrapCI(male, num_bootstrap = 1000, ci = 95)
female_ci = bootstrapCI(female, num_bootstrap = 1000, ci = 95)

In [112]:
# Formatting and printing the confidence intervals
print("Confidence Intervals for Spread:")
print("Black: {:.4f} - {:.4f}".format(*black_ci))
print("Asian: {:.4f} - {:.4f}".format(*asian_ci))
print("Hispanic: {:.4f} - {:.4f}".format(*hispanic_ci))
print("White: {:.4f} - {:.4f}".format(*white_ci))
print("Male: {:.4f} - {:.4f}".format(*male_ci))
print("Female: {:.4f} - {:.4f}".format(*female_ci))

Confidence Intervals for Spread:
Black: 2.9096 - 2.9105
Asian: 2.9110 - 2.9117
Hispanic: 2.9209 - 2.9218
White: 2.9136 - 2.9144
Male: 5.8371 - 5.8381
Female: 5.8186 - 5.8199


## Focus on Top Jobs

In [8]:
def getMostFrequentJobs(df, n):
    return df.groupby('text').size().sort_values(ascending=False)[:n].index.values


def getDataForJobs(df, jobs):
    return df.loc[df['text'].isin(jobs)]


def getDataForMostFrequentJobs(df, n):
    return getDataForJobs(df, getMostFrequentJobs(df, n))

In [9]:
getDataForMostFrequentJobs(df[asian], 10).sample(10)

Unnamed: 0,name,race,gender,text,length,embedding
277,Hong,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
599,Sonal,asian,F,accountant,1,"[-0.052521203, -0.07172108, -0.012731302, -0.0..."
680,Yong,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
20,Bibi,asian,F,chef,1,"[-0.04405946, -0.026716324, 0.035623394, 0.016..."
564,Yu,asian,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
212,Nikita,asian,F,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
526,Parul,asian,F,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
1336,Huy,asian,M,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
1227,Hoang,asian,M,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
813,Sandeep,asian,M,software engineer,2,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."


In [10]:
top10_spreads = []

# Getting the spread of data based on embeddings
for mask, label in zip(masks,labels):
    top10_spreads.append(score:=(getSpread(getDataForMostFrequentJobs(df[mask], 10)), label))    

# Sort and print
for score, label in sorted(top10_spreads, reverse=True, key=lambda x:x[0]):
    print("{:0.4f}".format(score), label)

0.5847 whole_dataset
0.5832 hispanic
0.5813 male & hispanic
0.5768 female & hispanic
0.5735 female
0.5645 male
0.5598 male & black
0.5546 black
0.5473 male & white
0.5427 female & white
0.5366 white
0.5180 male & asian
0.5143 female & black
0.4887 asian
0.4433 female & asian
