In [2]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt

### Load data

In [3]:
# Read the collected text CSV into a DataFrame and preview five randomly chosen rows
df = pd.read_csv(filepath_or_buffer='./collected_text.csv')
df.sample(n=5)

Unnamed: 0,name,race,gender,text
3068,Isabel,hispanic,F,Engineer.
1880,Tamika,black,F,Dentist
1934,Latonya,black,F,Customer Service Representative
1007,Noe,asian,M,Marine Biologist
2382,Devin,black,M,Software Developer


### Preprocessing data

In [4]:
prev = len(df)

# Process occupation text: remove punctuation, make everything lowercase, and normalize whitespace.
# Whitespace is cleaned up *after* punctuation is removed: stripping first would leave a stray
# trailing space for inputs like "Dentist ." and make them differ from plain "dentist".
# split()/join also collapses repeated internal whitespace into single spaces.
punctuation_table = str.maketrans('', '', string.punctuation)  # built once, reused for every row
df['text'] = df['text'].map(lambda x: ' '.join(x.translate(punctuation_table).lower().split()))

# Occupation descriptions with more than 5 words are dropped
# (count actual words rather than space characters)
word_counts = df['text'].map(lambda job_name: len(job_name.split()))
df = df[word_counts <= 5]
print(prev - len(df), "entries dropped - occupation description too long due to GPT's refusal to answer.\n")

# Check previous work
print('Confirm - 5 entries with the longest occupation descriptions:')
df.sort_values(by='text', key=lambda col: col.map(len)).tail()

28 entries dropped - occupation description too long due to GPT's refusal to answer.

Confirm - 5 entries with the longest occupation descriptions:


Unnamed: 0,name,race,gender,text
3027,Selina,hispanic,F,customer service representative
1953,Latisha,black,F,customer service representative
1934,Latonya,black,F,customer service representative
2893,Lamar,black,M,customer service representative
2984,Jarvis,black,M,artificial intelligence personal assistant


In [5]:
# Display the preprocessed DataFrame (pandas abbreviates long frames in the rendered output)
df

Unnamed: 0,name,race,gender,text
0,Bibi,asian,F,tutor
1,Bibi,asian,F,translator
2,Bibi,asian,F,influencer
3,Bibi,asian,F,pilot
4,Bibi,asian,F,chef
...,...,...,...,...
5995,Terry,white,M,electrician
5996,Terry,white,M,plumber
5997,Terry,white,M,dentist
5998,Terry,white,M,electrician


### Load Universal Sentence Encoder

In [6]:
# TF Hub handle of the Universal Sentence Encoder (version 4); loaded once and reused below
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print(f"module {module_url} loaded")

2023-11-19 22:32:10.442696: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [7]:
def embed(text):
    """Embed a single piece of text with the loaded ``model``.

    The text is wrapped in a one-element list before being passed to the
    model; the result is converted with ``.numpy()`` and its first (only)
    entry is returned.
    """
    batch_embeddings = model([text]).numpy()
    return batch_embeddings[0]


# The inner product gives the cosine similarity because embeddings from USE are already normalized
def similarity(embedding1, embedding2):
    """Return the inner product (``np.inner``) of the two embeddings."""
    score = np.inner(embedding1, embedding2)
    return score


# A distance measure (1 - inner product) that could be useful for clustering analysis
def distance(embedding1, embedding2):
    """Return ``1.0`` minus the inner product (``np.inner``) of the two embeddings."""
    inner_product = np.inner(embedding1, embedding2)
    return 1.0 - inner_product

In [8]:
# Sanity check: "software engineer" should be most similar to "software developer"
# and least similar to "writer". Embed the four job titles used by the checks below.
software_engineer, software_developer, electrical_engineer, writer = (
    embed(job_title)
    for job_title in (
        'software engineer',
        'software developer',
        'electrical engineer',
        'writer',
    )
)

2023-11-19 22:32:11.235400: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]


In [9]:
# Near-synonymous job titles: expected to be the highest of the three similarity checks
similarity(software_engineer, software_developer)

0.85381174

In [10]:
# Related engineering titles: expected to fall between the other two similarity checks
similarity(software_engineer, electrical_engineer)

0.6300371

In [11]:
# Unrelated job titles: expected to be the lowest of the three similarity checks
similarity(software_engineer, writer)

0.46058458

### Get embeddings for occupations

In [12]:
# Many rows share the same occupation text, so embed each distinct text once and reuse
# the result instead of invoking the model once per row.
# Note: rows with identical text therefore reference the same embedding array object.
embedding_by_text = {text: embed(text) for text in df['text'].unique()}
df['embedding'] = df['text'].map(embedding_by_text.__getitem__)
df.sample(5)

Unnamed: 0,name,race,gender,text,embedding
836,Sandeep,asian,M,software engineer,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."
1639,Sheena,black,F,dentist,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
2615,Jermaine,black,M,graphic designer,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
3574,Francisca,hispanic,F,dentist,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
5371,Ian,white,M,graphic designer,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."


### Accessing data

In [13]:
# Boolean row masks, one per gender / race value, for easy access!
# They can be combined with `&` to select on several criteria at once.

male = df['gender'].eq('M')
female = df['gender'].eq('F')
asian = df['race'].eq('asian')
black = df['race'].eq('black')
hispanic = df['race'].eq('hispanic')
white = df['race'].eq('white')

In [14]:
# Example with one criterion: five random rows where the gender mask is true

df.loc[male].sample(n=5)

Unnamed: 0,name,race,gender,text,embedding
5370,Ian,white,M,plumber,"[0.047190048, -0.044742223, -0.06049379, 0.047..."
1303,Huy,asian,M,translator,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
765,Kai,asian,M,fisherman,"[0.0054317922, 0.010705405, 0.06147752, -0.002..."
5810,Lance,white,M,plumber,"[0.047190048, -0.044742223, -0.06049379, 0.047..."
4071,Oscar,hispanic,M,electrician,"[0.011973024, -0.06220785, 0.059820715, 0.0343..."


In [15]:
# Example with two criteria: masks are combined element-wise with `&`

df.loc[female & hispanic].sample(n=5)

Unnamed: 0,name,race,gender,text,embedding
3705,Raquel,hispanic,F,architect,"[0.021810606, -0.044139486, 0.068072006, -0.01..."
3042,Selina,hispanic,F,accountant,"[-0.052521203, -0.07172108, -0.012731302, -0.0..."
3411,Angelia,hispanic,F,graphic designer,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
3631,Dora,hispanic,F,explorer,"[-0.0020562592, -0.0149595635, 0.060656734, -0..."
3644,Dora,hispanic,F,explorer,"[-0.0020562592, -0.0149595635, 0.060656734, -0..."


In [16]:
# Select the embedding column of the matching rows as a Series

df.loc[asian & male, 'embedding'].sample(n=5)

1178    [-0.012456206, -0.03949191, -0.026884388, 0.03...
1151    [-0.012456206, -0.03949191, -0.026884388, 0.03...
959     [0.03361227, -0.027185218, 0.0033685053, 0.018...
1230    [0.021810606, -0.044139486, 0.068072006, -0.01...
1069    [-0.012456206, -0.03949191, -0.026884388, 0.03...
Name: embedding, dtype: object