In [1]:
import numpy as np
import pandas as pd

# Analysis

## 0. Loading & Accessing Data

In [2]:
# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file,
# so only load this pickle if it comes from a trusted source.
df = pd.read_pickle('./processed_data_with_embeddings.pkl')

# Preview a few random rows; the fixed seed makes the preview identical on
# every Restart & Run All instead of changing with each execution.
df.sample(5, random_state=0)

Unnamed: 0,name,race,gender,text,embedding
1209,Hoang,asian,M,software developer,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
4477,Fidel,hispanic,M,translator,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
2021,Jasmin,black,F,teacher,"[-0.031633124, 0.018472634, 0.06361438, 0.0197..."
5341,Duke,white,M,monarch,"[0.020329619, -0.07263715, 0.0143735055, -0.00..."
1719,Tanisha,black,F,dentist,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."


In [3]:
# Boolean row masks, one per demographic attribute value, so that subgroups
# can be selected (and combined with `&`) without repeating the comparisons.

# Gender masks
male = df['gender'].eq('M')
female = df['gender'].eq('F')

# Race masks
asian = df['race'].eq('asian')
black = df['race'].eq('black')
hispanic = df['race'].eq('hispanic')
white = df['race'].eq('white')

# True for every row whose gender is not 'A'.
# NOTE(review): presumably no row uses 'A', which would make this an
# all-True mask covering the whole dataset -- confirm against the data.
whole_dataset = df['gender'].ne('A')

# Usage Examples:
# df[male]
# df[female & hispanic]

In [4]:
# Collect all subgroups with labels.
# Each label is written next to its mask so the two parallel lists derived
# below always share the same order.
labelled_masks = [
    ("whole_dataset", whole_dataset),
    ("male", male),
    ("female", female),
    ("asian", asian),
    ("black", black),
    ("hispanic", hispanic),
    ("white", white),
    ("male & asian", male & asian),
    ("male & black", male & black),
    ("male & hispanic", male & hispanic),
    ("male & white", male & white),
    ("female & asian", female & asian),
    ("female & black", female & black),
    ("female & hispanic", female & hispanic),
    ("female & white", female & white),
]

masks = [mask for _label, mask in labelled_masks]
labels = [label for label, _mask in labelled_masks]

In [5]:
# The inner product equals the cosine similarity here because the embeddings
# from USE are already normalized.
def similarity(embedding1, embedding2):
    """Return the inner product of two embeddings.

    For unit-length vectors this value is their cosine similarity. No
    normalization is performed here, so inputs that are not unit-length
    yield a plain (unnormalized) inner product.

    Args:
        embedding1: array-like embedding vector (or array of vectors).
        embedding2: array-like embedding vector (or array of vectors).

    Returns:
        The result of ``np.inner`` on the two inputs: a scalar for two 1-D
        vectors.
    """
    inner_product = np.inner(embedding1, embedding2)
    return inner_product


# A distance measure could be useful for clustering analysis.
def distance(embedding1, embedding2):
    """Return one minus the inner product of two embeddings.

    With unit-length embeddings this is the cosine distance: 0 for
    identical directions, 1 for orthogonal ones, 2 for opposite ones.
    No normalization is performed here.

    Args:
        embedding1: array-like embedding vector (or array of vectors).
        embedding2: array-like embedding vector (or array of vectors).

    Returns:
        ``1.0 - np.inner(embedding1, embedding2)``.
    """
    inner_product = np.inner(embedding1, embedding2)
    return 1.0 - inner_product

## 1. Spread of Multidimensional Data

The spread of multidimensional data can be summarized by the trace of its covariance matrix, i.e. the sum of the variances along each dimension. 
The embeddings produced by Google's Universal Sentence Encoder have 512 dimensions, so we expect a covariance matrix of shape 512 x 512.

In [6]:
def getSpread(df, mask):
    """Return the trace of the covariance matrix of a subgroup's embeddings.

    The trace is the sum of the per-dimension variances, giving a single
    number for how spread out the selected embeddings are.

    Args:
        df: DataFrame with an 'embedding' column.
            Assumes every entry is a 1-D vector of the same length, which
            ``np.stack`` requires -- TODO confirm for new data sources.
        mask: boolean selector applied as ``df[mask]`` to pick the subgroup.

    Returns:
        ``np.trace`` of ``np.cov`` computed over the selected embeddings.
    """
    selected_embeddings = df[mask]['embedding'].values
    # One row per selected sample, one column per embedding dimension.
    sample_matrix = np.stack(selected_embeddings)
    # np.cov treats rows as variables by default, so transpose to make each
    # embedding dimension a variable and each sample an observation.
    covariance = np.cov(sample_matrix.T)
    return np.trace(covariance)

In [7]:
# Compute the embedding spread of every subgroup, keeping (spread, label)
# pairs for later ranking and printing each pair as soon as it is computed.
spreads = []

for mask, label in zip(masks, labels):
    score = (getSpread(df, mask), label)
    spreads.append(score)
    print("{:0.4f}".format(score[0]), score[1])

0.6433 whole_dataset
0.6378 male
0.6243 female
0.5794 asian
0.6278 black
0.6300 hispanic
0.6311 white
0.5858 male & asian
0.6242 male & black
0.6201 male & hispanic
0.6182 male & white
0.5502 female & asian
0.5777 female & black
0.6093 female & hispanic
0.6063 female & white


In [8]:
# Same values as above, ranked from the largest spread to the smallest.

ranked_spreads = sorted(spreads, key=lambda pair: pair[0], reverse=True)
for score, label in ranked_spreads:
    print("{:0.4f}".format(score), label)

0.6433 whole_dataset
0.6378 male
0.6311 white
0.6300 hispanic
0.6278 black
0.6243 female
0.6242 male & black
0.6201 male & hispanic
0.6182 male & white
0.6093 female & hispanic
0.6063 female & white
0.5858 male & asian
0.5794 asian
0.5777 female & black
0.5502 female & asian
