In [1]:
import string
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# Preparation

### Load data

In [2]:
# Read the preprocessed records (name, race, gender, occupation text) from disk.
df = pd.read_csv('./Data/preprocessed_text.csv')
# Preview five randomly chosen rows.
df.sample(n=5)

Unnamed: 0,name,race,gender,text,length
912,Samir,asian,M,engineer,1
903,Samir,asian,M,accountant,1
1316,Huy,asian,M,graphic designer,2
2746,Marlon,black,M,chef,1
2320,Tyrone,black,M,plumber,1


### Preprocessing data

In [None]:
# prev = len(df)

# # Process occupation text: remove whitespace and punctuation, make everything lowercase
# df['text'] = df['text'].map(lambda x: x.strip().translate(str.maketrans('', '', string.punctuation)).lower())

# # Occupation descriptions with more than 5 words are dropped
# df = df[ df['text'].map(lambda job_name: job_name.count(' ')) < 5 ]
# print(prev - len(df), "entries dropped - occupation description too long due to GPT's refusal to answer.\n")

# # Check previous work
# print('Confirm - 5 entries with the longest occupation descriptions:')
# df.sort_values(by='text', key=lambda col: col.map(len)).tail()

In [None]:
# df

### Load Universal Sentence Encoder

In [3]:
# Universal Sentence Encoder (version 4), loaded through TensorFlow Hub.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
# Confirmation message so the reader can see which module is in use.
print("module %s loaded" % module_url)

2023-11-20 20:36:23.411686: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [4]:
def embed(text):
    """Return the sentence-encoder embedding of a single string.

    The module-level ``model`` works on batches, so ``text`` is wrapped in a
    one-element list and the first (and only) row of the result is returned
    as a NumPy array.
    """
    batch = [text]
    vectors = model(batch).numpy()
    return vectors[0]


def similarity(embedding1, embedding2):
    """Return the inner product of two embeddings.

    Embeddings from the Universal Sentence Encoder are already normalized,
    so the inner product is their cosine similarity.
    """
    inner_product = np.inner(embedding1, embedding2)
    return inner_product


def distance(embedding1, embedding2):
    """Return one minus the inner product of two embeddings.

    Small values mean the embeddings are similar; this form could be useful
    as the dissimilarity measure for clustering analysis.
    """
    inner_product = np.inner(embedding1, embedding2)
    return 1.0 - inner_product

In [5]:
# Sanity check: 'software engineer' should score highest against
# 'software developer' and lowest against 'writer'.
software_engineer, software_developer, electrical_engineer, writer = (
    embed(job_title)
    for job_title in (
        'software engineer',
        'software developer',
        'electrical engineer',
        'writer',
    )
)

2023-11-20 20:36:31.151437: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string
	 [[{{node inputs}}]]


In [6]:
# Expected to be the highest of the three checks.
similarity(software_engineer, software_developer)

0.85381174

In [7]:
# Expected to fall between the 'software developer' and 'writer' scores.
similarity(software_engineer, electrical_engineer)

0.6300371

In [8]:
# Expected to be the lowest of the three checks.
similarity(software_engineer, writer)

0.46058458

### Get embeddings for text

In [9]:
# Embed each distinct occupation string once, then fan the vectors out to the
# rows. The sampled rows show the same occupations recurring with identical
# vectors, so calling the encoder once per row repeats identical work.
unique_embeddings = {
    occupation: embed(occupation) for occupation in df['text'].unique()
}
# .copy() gives every row its own array, exactly as a per-row embed() call
# would, so editing one cell can never silently change another.
df['embedding'] = df['text'].map(
    lambda occupation: unique_embeddings[occupation].copy()
)
# Preview five randomly chosen rows with the new column.
df.sample(5)

Unnamed: 0,name,race,gender,text,length,embedding
3023,Selina,hispanic,F,paralegal,1,"[-0.053451147, -0.054388534, 0.022549136, -0.0..."
966,Tuan,asian,M,software engineer,2,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."
4683,Esther,white,F,accountant,1,"[-0.052521203, -0.07172108, -0.012731302, -0.0..."
959,Tuan,asian,M,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."
3796,Angelo,hispanic,M,carpenter,1,"[0.0019878338, -0.030702533, 0.069768734, 0.06..."


### Accessing data

In [10]:
# Boolean row masks, one per gender and race value, for easy access.
# Combine them with `&` to select an intersection of groups.

male = df['gender'].eq('M')
female = df['gender'].eq('F')
asian = df['race'].eq('asian')
black = df['race'].eq('black')
hispanic = df['race'].eq('hispanic')
white = df['race'].eq('white')

In [11]:
# Example with a single criterion: five random rows where gender is 'M'.

df.loc[male].sample(n=5)

Unnamed: 0,name,race,gender,text,length,embedding
2529,Quincy,black,M,lawyer,1,"[-0.055707406, 0.0051456937, -0.013252552, 0.0..."
5492,Erick,white,M,software engineer,2,"[0.012382303, -0.0047106287, 0.024845174, 0.02..."
1481,Jae,asian,M,data analyst,2,"[0.056828827, -0.005620034, -0.015279198, 0.00..."
5620,Arnold,white,M,electrician,1,"[0.011973024, -0.06220785, 0.059820715, 0.0343..."
2590,Jermaine,black,M,software developer,2,"[0.03044688, 0.01658366, 0.051026687, 0.046492..."


In [12]:
# Example with two criteria: five random rows matching both masks.

df.loc[female & hispanic].sample(n=5)

Unnamed: 0,name,race,gender,text,length,embedding
3440,Juliana,hispanic,F,graphic designer,2,"[0.02208694, -0.06250166, 0.06601684, 0.022112..."
3704,Raquel,hispanic,F,dentist,1,"[-0.03493506, -0.035563823, 0.05591933, 0.0161..."
3387,Angelia,hispanic,F,statistician,1,"[0.028326634, -0.09130431, -0.05983043, -0.001..."
3283,Karina,hispanic,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."
3404,Angelia,hispanic,F,translator,1,"[-0.012456206, -0.03949191, -0.026884388, 0.03..."


In [13]:
# Select only the embedding column (as a Series) for the rows matching both masks.

df.loc[asian & male, 'embedding'].sample(n=5)

1133    [-0.012456206, -0.03949191, -0.026884388, 0.03...
876     [-0.020740997, -0.06392108, -0.019993724, 0.03...
1458    [-0.031633124, 0.018472634, 0.06361438, 0.0197...
1273    [-0.012456206, -0.03949191, -0.026884388, 0.03...
1182    [0.012382303, -0.0047106287, 0.024845174, 0.02...
Name: embedding, dtype: object

### Export for future use

In [15]:
# Save the full frame, including the array-valued `embedding` column, for future use.
# NOTE(review): a pickle file should only ever be loaded from a trusted source.
df.to_pickle("./Data/final_processed_with_embeddings.pkl")