In [1]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# Preparation

### Load data

In [2]:
df = pd.read_csv('./collected_text.csv')
df.sample(5)

Unnamed: 0,name,race,gender,text
4646,Jocelyn,white,F,Accountant
4244,Andreas,hispanic,M,Software Engineer
4769,Victoria,white,F,Graphic Designer
671,Yong,asian,F,Translator
2463,Reginald,black,M,Botanist


### Preprocessing data

In [3]:
prev = len(df)

# Process occupation text: remove whitespace and punctuation, make everything lowercase
df['text'] = df['text'].map(lambda x: x.strip().translate(str.maketrans('', '', string.punctuation)).lower())

# Occupation descriptions with more than 5 words are dropped
df = df[ df['text'].map(lambda job_name: job_name.count(' ')) < 5 ]
print(prev - len(df), "entries dropped - occupation description too long due to GPT's refusal to answer.\n")

# Check previous work
print('Confirm - 5 entries with the longest occupation descriptions:')
df.sort_values(by='text', key=lambda col: col.map(len)).tail()

28 entries dropped - occupation description too long due to GPT's refusal to answer.

Confirm - 5 entries with the longest occupation descriptions:


Unnamed: 0,name,race,gender,text
3027,Selina,hispanic,F,customer service representative
1953,Latisha,black,F,customer service representative
1934,Latonya,black,F,customer service representative
2893,Lamar,black,M,customer service representative
2984,Jarvis,black,M,artificial intelligence personal assistant


In [4]:
df

Unnamed: 0,name,race,gender,text
0,Bibi,asian,F,tutor
1,Bibi,asian,F,translator
2,Bibi,asian,F,influencer
3,Bibi,asian,F,pilot
4,Bibi,asian,F,chef
...,...,...,...,...
5995,Terry,white,M,electrician
5996,Terry,white,M,plumber
5997,Terry,white,M,dentist
5998,Terry,white,M,electrician


### Load Universal Sentence Encoder

In [5]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)

2023-11-20 00:43:59.990213: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [6]:
def embed(text):
    return model([text]).numpy()[0]


# inner product gives cosine similarity as embeddiings from USE are already normalized
def similarity(embedding1, embedding2):
    return np.inner(embedding1, embedding2)


# distance could be useful for clustering analysis
def distance(embedding1, embedding2):
    return 1.0 - np.inner(embedding1, embedding2)

In [7]:
# Test: We expect software engineer to be most similar to software developer and least similar to writer

software_engineer = embed('software engineer')
software_developer = embed('software developer')
electrical_engineer = embed('electrical engineer')
writer = embed('writer')

2023-11-20 00:44:00.820690: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]


In [8]:
similarity(software_engineer,software_developer)

0.85381174

In [9]:
similarity(software_engineer,electrical_engineer)

0.6300371

In [10]:
similarity(software_engineer,writer)

0.46058458

### Get embeddings for text

In [11]:
df['embedding'] = df['text'].map(embed)
df.sample(5)

Unnamed: 0,name,race,gender,text,embedding
1030,Noe,asian,M,programmer,"[0.044612817, -0.016822815, 0.023837492, 0.023..."
3057,Isabel,hispanic,F,teacher,"[-0.031633124, 0.018472634, 0.06361438, 0.0197..."
2455,Reginald,black,M,gardener,"[-0.0038754933, -0.03784765, 0.044317666, 0.04..."
1817,Tamara,black,F,dentist,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
2072,Wanda,black,F,accountant,"[-0.052521203, -0.07172108, -0.012731302, -0.0..."


### Accessing data

In [12]:
# Create masks for easy access!

male = df['gender'] == 'M'
female = df['gender'] == 'F'
asian = df['race'] == 'asian'
black = df['race'] == 'black'
hispanic = df['race'] == 'hispanic'
white = df['race'] == 'white'

In [13]:
# Examples with one criterion

df[male].sample(5)

Unnamed: 0,name,race,gender,text,embedding
4006,Nestor,hispanic,M,caretaker,"[0.0064367373, 0.0013597801, -0.032702334, 0.0..."
4308,Isidro,hispanic,M,translator,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
2381,Devin,black,M,graphic designer,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
1372,Ravi,asian,M,software developer,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
5347,Duke,white,M,monarch,"[0.020329619, -0.07263715, 0.0143735055, -0.00..."


In [14]:
# Example with two criterions

df[female & hispanic].sample(5)

Unnamed: 0,name,race,gender,text,embedding
3142,Viviana,hispanic,F,translator,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
3111,Viviana,hispanic,F,translator,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
3678,Esperanza,hispanic,F,teacher,"[-0.031633124, 0.018472634, 0.06361438, 0.0197..."
3147,Viviana,hispanic,F,dentist,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
3339,Karina,hispanic,F,data analyst,"[0.056828827, -0.005620034, -0.015279198, 0.00..."


In [15]:
# Get the embedding column as Series

df[asian & male]['embedding'].sample(5)

847     [0.03044688, 0.01658366, 0.051026687, 0.046492...
994     [0.03361227, -0.027185218, 0.0033685053, 0.018...
1238    [0.03044688, 0.01658366, 0.051026687, 0.046492...
973     [0.03044688, 0.01658366, 0.051026687, 0.046492...
1360    [0.012382303, -0.0047106287, 0.024845174, 0.02...
Name: embedding, dtype: object

### Export for future use

In [16]:
df.to_pickle("./processed_data_with_embeddings.pkl")