In [7]:
import numpy as np
import pandas as pd

# Analysis

## 0. Loading & Accessing Data

In [8]:
data = pd.read_pickle("./Data/final_processed_with_embeddings.pkl")

In [9]:
df = pd.read_pickle('./Data/final_processed_with_embeddings.pkl')
df.sample(5)

Unnamed: 0,name,race,gender,text,length,embedding
1680,Yolanda,black,F,dentist,1,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
5766,Warren,white,M,investor,1,"[-0.03837071, -0.046607483, 0.052726462, 0.030..."
5032,Tracy,white,F,dentist,1,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
3440,Juliana,hispanic,F,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
2223,Desiree,black,F,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."


In [10]:
# Create masks for easy access

male = df['gender'] == 'M'
female = df['gender'] == 'F'
asian = df['race'] == 'asian'
black = df['race'] == 'black'
hispanic = df['race'] == 'hispanic'
white = df['race'] == 'white'
whole_dataset = df['gender'] != 'A'

# Usage Examples:
# df[male]
# df[female & hispanic]

In [11]:
# Collect all subgroups with labels

masks = [whole_dataset, male, female, 
         asian, black, hispanic, white, 
         male & asian, male & black, male & hispanic, male & white,
         female & asian,female & black,female & hispanic, female & white]
labels = ["whole_dataset", "male", "female", 
          "asian", "black", "hispanic", "white", 
          "male & asian", "male & black", "male & hispanic", "male & white",
          "female & asian","female & black","female & hispanic", "female & white"]

In [14]:
most_common_jobs= {}
most_common_masks = []
    #get x most common jobs of each group
for mask,label in zip(masks,labels):
    values, counts = np.unique(df[mask]["text"],return_counts=True)
    ind = np.argpartition(-counts, kth=10)[:10]
    most_common_jobs[label] = values[ind]  # prints the 10 most frequent elements
    print(values[ind])
    most_common_masks.append((np.isin(df[mask]["text"],test_elements =most_common_jobs[label])))
    #embeddings= df2[\"embedding\"][:]\n",
    #print(np.unique(embeddings))\n",
    #print(embeddings)\n",
    print(most_common_masks)

['graphic designer' 'accountant' 'dentist' 'electrician' 'translator'
 'software developer' 'teacher' 'architect' 'software engineer'
 'librarian']
[array([False,  True, False, ...,  True, False, False])]
['architect' 'plumber' 'chef' 'accountant' 'software developer'
 'software engineer' 'translator' 'electrician' 'teacher'
 'graphic designer']
[array([False,  True, False, ...,  True, False, False]), array([False, False, False, ..., False, False,  True])]
['accountant' 'teacher' 'dentist' 'librarian' 'translator' 'nurse'
 'graphic designer' 'architect' 'data analyst' 'lawyer']
[array([False,  True, False, ...,  True, False, False]), array([False, False, False, ..., False, False,  True]), array([False,  True, False, ...,  True,  True,  True])]
['software developer' 'chef' 'translator' 'engineer' 'graphic designer'
 'accountant' 'software engineer' 'data analyst' 'interpreter' 'dentist']
[array([False,  True, False, ...,  True, False, False]), array([False, False, False, ..., False, Fal

In [5]:
# inner product gives cosine similarity as embeddiings from USE are already normalized
def similarity(embedding1, embedding2):
    return np.inner(embedding1, embedding2)


# distance could be useful for clustering analysis
def distance(embedding1, embedding2):
    return 1.0 - np.inner(embedding1, embedding2)

## 1. Spread of Multidimensional Data

Spread of multidimensional data can be represented as the trace of the covariance matrix. 
The embeddings via Google's Universal Sentence Encoder has 512 dimensions, so we expect a covariance matrix of dimension 512 x 512.

In [6]:
def getSpread(df, mask):
    return np.trace(np.cov(np.stack(df[mask]['embedding'].values).T))

In [7]:
spreads = []

# Getting the spread of data based on embeddings
for mask, label in zip(masks,labels):
    spreads.append(score:=(getSpread(df, mask), label))
    print("{:0.4f}".format(score[0]), score[1])

0.6433 whole_dataset
0.6378 male
0.6243 female
0.5794 asian
0.6278 black
0.6300 hispanic
0.6311 white
0.5858 male & asian
0.6242 male & black
0.6201 male & hispanic
0.6182 male & white
0.5502 female & asian
0.5777 female & black
0.6093 female & hispanic
0.6063 female & white


In [8]:
# Same thing, sorted!

for score, label in sorted(spreads, reverse=True, key=lambda x:x[0]):
    print("{:0.4f}".format(score), label)

0.6433 whole_dataset
0.6378 male
0.6311 white
0.6300 hispanic
0.6278 black
0.6243 female
0.6242 male & black
0.6201 male & hispanic
0.6182 male & white
0.6093 female & hispanic
0.6063 female & white
0.5858 male & asian
0.5794 asian
0.5777 female & black
0.5502 female & asian


In [15]:
def getSpreadModified(data):
    return np.trace(np.cov(np.stack(data)))

In [17]:
most_common_spreads = []
for data,mask, label in zip(masks,most_common_masks,labels):
    data = df[data]
    unique_data = [list(x) for x in set(tuple(x) for x in data["embedding"].values)]
    #print(unique_data)\n",
    
    most_common_spreads.append(score:=(getSpreadModified(unique_data), label))
    print("{:0.4f}".format(score[0]), score[1])

0.4728 whole_dataset
0.3166 male
0.3087 female
0.2325 asian
0.2267 black
0.1895 hispanic
0.2305 white
0.1563 male & asian
0.1544 male & black
0.1348 male & hispanic
0.1270 male & white
0.1426 female & asian
0.1368 female & black
0.1153 female & hispanic
0.1661 female & white


In [18]:
# Same thing, sorted!

for score, label in sorted(most_common_spreads, reverse=True, key=lambda x:x[0]):
    print("{:0.4f}".format(score), label)

0.4728 whole_dataset
0.3166 male
0.3087 female
0.2325 asian
0.2305 white
0.2267 black
0.1895 hispanic
0.1661 female & white
0.1563 male & asian
0.1544 male & black
0.1426 female & asian
0.1368 female & black
0.1348 male & hispanic
0.1270 male & white
0.1153 female & hispanic
