In [1]:
import numpy as np
import json
from google.colab import drive
# Mount Google Drive so the project files below are reachable from the Colab VM.
drive.mount('/content/gdrive')

# Text-format word embeddings; passed to load_legacy_w2v further down.
word = "/content/gdrive/My Drive/NLP/Project/reddit.US.txt.tok.clean.cleanedforw2v_3.w2v"
# JSON vocabulary file; read by the load_* helpers (analogy templates,
# defining sets, test terms).
vocab = "/content/gdrive/My Drive/NLP/Project/race_attributes_optm.json"
# Output directory and file prefix. Neither is used in the cells shown here --
# presumably consumed by a later export step; TODO confirm.
path = "/content/gdrive/My Drive/NLP/Project/output/race/w2_v3/"
outprefix = 'race'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def load_legacy_w2v(w2v_file, dim=50):
    """Load word embeddings from a whitespace-separated text file.

    Each usable line is ``<token> <c1> <c2> ... <c_dim>``. Lines whose number
    of components differs from ``dim`` are silently skipped (this also drops a
    leading ``<vocab_size> <dim>`` header line when ``dim != 1``). Blank or
    whitespace-only lines are ignored.

    Args:
        w2v_file: Path of the embedding file (read as UTF-8 text).
        dim: Expected number of components per vector.

    Returns:
        A ``(vectors, dim)`` tuple; ``vectors`` maps each token to a 1-D numpy
        float array of length ``dim`` and ``dim`` is the argument passed in.

    Raises:
        ValueError: If a component on any non-blank line is not a valid float.
    """
    vectors = {}
    with open(w2v_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            # An empty line has no token; indexing fields[0] would raise
            # IndexError, so skip it instead of aborting the whole load.
            if not fields:
                continue
            token = fields[0]
            components = np.array([float(x) for x in fields[1:]])
            if dim == len(components):
                vectors[token] = components

    return vectors, dim

In [0]:
# Read the embeddings from the file at `word` (path set in the first cell).
# load_legacy_w2v is called with its default dim=50, so only 50-component
# vectors are kept and embedding_dim is 50.
word_vectors, embedding_dim  = load_legacy_w2v(word)

In [0]:
def load_analogy_templates(json_filepath, mode):
    """Return the analogy templates stored under ``mode`` in a vocabulary JSON file.

    Args:
        json_filepath: Path of the JSON file to read.
        mode: Key to select inside the file's ``"analogy_templates"`` object.

    Returns:
        The value found at ``data["analogy_templates"][mode]``.

    Raises:
        KeyError: If ``"analogy_templates"`` or ``mode`` is missing.
    """
    with open(json_filepath, "r") as handle:
        config = json.load(handle)
    templates_by_mode = config["analogy_templates"]
    return templates_by_mode[mode]

def load_test_terms(json_filepath):
    """Return the ``"testTerms"`` entry of a vocabulary JSON file.

    Args:
        json_filepath: Path of the JSON file to read.

    Returns:
        The value stored under the top-level ``"testTerms"`` key.

    Raises:
        KeyError: If the file has no ``"testTerms"`` key.
    """
    with open(json_filepath, "r") as handle:
        config = json.load(handle)
    return config["testTerms"]

def load_eval_terms(json_filepath, mode):
    """Return the evaluation targets and the template values for ``mode``.

    Args:
        json_filepath: Path of the JSON file to read.
        mode: Key to select inside the file's ``"analogy_templates"`` object.

    Returns:
        A 2-tuple ``(eval_targets, template_values)`` where ``eval_targets`` is
        the file's ``"eval_targets"`` entry and ``template_values`` is the
        ``.values()`` view of ``data["analogy_templates"][mode]``.

    Raises:
        KeyError: If ``"eval_targets"``, ``"analogy_templates"`` or ``mode`` is
            missing.
    """
    with open(json_filepath, "r") as handle:
        config = json.load(handle)
    targets = config["eval_targets"]
    template_values = config["analogy_templates"][mode].values()
    return targets, template_values

def load_def_sets(json_filepath):
    """Return the file's ``"definite_sets"`` entries keyed by their position.

    Args:
        json_filepath: Path of the JSON file to read.

    Returns:
        A dict mapping ``0, 1, 2, ...`` to the successive elements of the
        ``"definite_sets"`` entry.

    Raises:
        KeyError: If the file has no ``"definite_sets"`` key.
    """
    with open(json_filepath, "r") as handle:
        config = json.load(handle)
    return dict(enumerate(config["definite_sets"]))

In [5]:
# Load everything needed from the vocabulary JSON at `vocab`.
print("Loading vocabulary from {}".format(vocab))

# Analogy templates for the 'role' mode, the defining sets (keyed by index),
# and the test terms.
analogyTemplates = load_analogy_templates(vocab, 'role')
defSets = load_def_sets(vocab)
testTerms = load_test_terms(vocab)
# Flatten every template's word list into one list; these are the words that
# get neutralized later.
neutral_words = []
for value in analogyTemplates.values():
    neutral_words.extend(value)

Loading vocabulary from /content/gdrive/My Drive/NLP/Project/race_attributes_optm.json


In [0]:
def pruneWordVecs(wordVecs):
    """Return a new dict holding only the entries whose key passes isValidWord.

    Args:
        wordVecs: Mapping from word to vector.

    Returns:
        A new dict with the same word -> vector pairs, in the same order,
        restricted to words for which ``isValidWord(word)`` is true. The input
        mapping is not modified.
    """
    return {
        token: vector
        for token, vector in wordVecs.items()
        if isValidWord(token)
    }

def isValidWord(word):
    """Return True when every character of ``word`` is alphabetic.

    Uses ``str.isalpha`` per character, so Unicode letters count as valid.
    An empty string has no offending character and therefore returns True.
    """
    for ch in word:
        if not ch.isalpha():
            return False
    return True

In [7]:
# Drop every entry whose key contains a non-alphabetic character and report
# the vocabulary size before and after. Note that this rebinds `word_vectors`
# to the pruned dict.
print("Pruning Word Vectors... Starting with", len(word_vectors))
word_vectors = pruneWordVecs(word_vectors)
print("\tEnded with", len(word_vectors))

Pruning Word Vectors... Starting with 44895
	Ended with 44895


In [0]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import torch

def normalize(word_vectors):
    """Rescale every vector in ``word_vectors`` to unit L2 norm, in place.

    Each value is replaced by a new array ``v / ||v||``; the dict object itself
    is mutated and nothing is returned. Vectors whose norm is exactly zero are
    left unchanged.

    Args:
        word_vectors: Dict mapping word -> numpy vector.
    """
    for k, v in word_vectors.items():
        norm = np.linalg.norm(v)
        # A zero vector has no direction; dividing by its norm would replace it
        # with NaNs that then poison every later similarity computation.
        if norm == 0:
            continue
        word_vectors[k] = v / norm

def _vectors_in_vocab(vocab, words):
    """Return the vectors of the ``words`` that ``vocab`` contains, in order."""
    found = []
    for w in words:
        try:
            found.append(vocab[w])
        except KeyError:
            # Out-of-vocabulary defining words are simply ignored.
            pass
    return found


def identify_bias_subspace(vocab, def_sets, subspace_dim, embedding_dim):
    """Estimate a bias subspace from sets of defining words via PCA.

    For each defining set, the vectors of its in-vocabulary words are centered
    on that set's own mean; the centered vectors of all sets are stacked and a
    PCA with ``subspace_dim`` components is fitted on the stack.

    Args:
        vocab: Mapping word -> vector (lookup must raise KeyError when absent).
        def_sets: Dict whose values are iterables of defining words.
        subspace_dim: Number of principal components to keep.
        embedding_dim: Unused; kept so existing callers keep working.

    Returns:
        The fitted PCA's ``components_`` attribute.

    Raises:
        ValueError: If no defining set has any word in ``vocab``.
    """
    centered_sets = []
    for words in def_sets.values():
        members = _vectors_in_vocab(vocab, words)
        # A set with no known word has no mean; skipping it avoids a NaN mean
        # and a shape mismatch when the per-set matrices are concatenated.
        if not members:
            continue
        set_vectors = np.array(members)
        centered_sets.append(set_vectors - np.mean(set_vectors, axis=0))

    if not centered_sets:
        raise ValueError("no defining-set word was found in the vocabulary")

    matrix = np.concatenate(centered_sets)

    pca = PCA(n_components=subspace_dim)
    pca.fit(matrix)

    return pca.components_

In [9]:
# Fit a 2-component bias subspace from the defining sets loaded above.
print("Identifying bias subspace")
subspace = identify_bias_subspace(word_vectors, defSets, 2, embedding_dim)


Identifying bias subspace


In [0]:
def neutralize_and_equalize(vocab, words, eq_sets, bias_subspace, embedding_dim):
    """Hard-debias embeddings: neutralize ``words``, then equalize ``eq_sets``.

    Steps:
      1. Neutralize: for each of ``words`` found in ``vocab``, remove its
         projection onto ``bias_subspace`` and rescale the rest to unit norm.
      2. Normalize every vector of the working copy to unit norm.
      3. Equalize: for each set in ``eq_sets``, replace every in-vocabulary
         member by ``upsilon + sqrt(1 - ||upsilon||^2) * unit(v_b - mean_b)``,
         where ``upsilon`` is the set mean with its bias component removed,
         and ``v_b`` / ``mean_b`` are the bias components of the member and of
         the set mean.

    Degenerate cases that would otherwise yield NaN vectors are skipped: a word
    lying entirely inside the bias subspace is not neutralized, an equalization
    set with no in-vocabulary member is ignored, and a member whose bias
    component equals the set mean's (e.g. a single-member set) keeps its
    normalized vector.

    Args:
        vocab: Dict mapping word -> vector; it is copied, not modified.
        words: Iterable of words to neutralize.
        eq_sets: Iterable of word collections to equalize.
        bias_subspace: 1-D vector or 2-D array whose rows span the subspace.
        embedding_dim: Length of the embedding vectors.

    Returns:
        A new dict with the debiased vectors.

    Raises:
        ValueError: If ``bias_subspace`` is neither 1-D nor 2-D.
    """
    if bias_subspace.ndim == 1:
        bias_subspace = np.expand_dims(bias_subspace, 0)
    elif bias_subspace.ndim != 2:
        raise ValueError("bias subspace should be either a matrix or vector")

    new_vocab = vocab.copy()

    # --- Neutralize -------------------------------------------------------
    for w in words:
        if w not in vocab:
            continue
        v = vocab[w]
        residual = v - project_onto_subspace(v, bias_subspace)
        residual_norm = np.linalg.norm(residual)
        # Nothing is left once the bias component is removed; dividing by the
        # zero norm would store a NaN vector, so keep the original instead.
        if residual_norm == 0:
            continue
        new_vocab[w] = residual / residual_norm

    normalize(new_vocab)

    # --- Equalize ---------------------------------------------------------
    for eq_set in eq_sets:
        # Only members that actually have a vector take part.
        cleanEqSet = [w for w in eq_set if w in new_vocab]
        if not cleanEqSet:
            continue

        mean = np.zeros((embedding_dim,))
        for w in cleanEqSet:
            mean += new_vocab[w]
        mean /= float(len(cleanEqSet))

        mean_b = project_onto_subspace(mean, bias_subspace)
        upsilon = mean - mean_b
        # Clamp at 0: rounding can push ||upsilon||^2 marginally above 1,
        # which would make the square root NaN.
        scale = np.sqrt(max(0.0, 1 - np.sum(np.square(upsilon))))

        for w in cleanEqSet:
            v_b = project_onto_subspace(new_vocab[w], bias_subspace)
            offset = v_b - mean_b
            offset_norm = np.linalg.norm(offset)
            # No direction to equalize along when the member's bias component
            # coincides with the set mean's; avoid the 0/0 division.
            if offset_norm == 0:
                continue
            new_vocab[w] = upsilon + scale * (offset / offset_norm)

    return new_vocab

def project_onto_subspace(vector, subspace):
    """Sum the projections of ``vector`` onto each row of ``subspace``.

    Computes ``sum(dot(vector, c) * c for c in subspace)``. This equals the
    orthogonal projection onto the span of the rows only when the rows are
    orthonormal.

    Args:
        vector: 1-D array-like. Integer and list inputs are accepted; the
            result is then float64. Floating inputs keep their dtype.
        subspace: Iterable of 1-D arrays with the same length as ``vector``.

    Returns:
        A new 1-D numpy array with the accumulated projection; ``vector`` is
        not modified.
    """
    vector = np.asarray(vector)
    # Accumulate in a floating dtype: with an integer accumulator the in-place
    # addition of float terms below raises a casting error.
    if np.issubdtype(vector.dtype, np.inexact):
        acc_dtype = vector.dtype
    else:
        acc_dtype = float
    v_b = np.zeros_like(vector, dtype=acc_dtype)
    for component in subspace:
        v_b += np.dot(vector, component) * component
    return v_b


In [11]:
# Hard-debias: neutralize the template words and equalize each defining set
# against the 2-component subspace computed above. `word_vectors` itself is
# left as is; the debiased copy is stored separately.
print("Neutralizing and Equalizing")
new_hard_word_vectors = neutralize_and_equalize(word_vectors, neutral_words,
                        defSets.values(), subspace, embedding_dim)

Neutralizing and Equalizing


In [0]:
from gensim.models.keyedvectors import Word2VecKeyedVectors

def convert_legacy_to_keyvec(legacy_w2v):
    """Copy a ``{word: vector}`` dict into a gensim ``Word2VecKeyedVectors``.

    The vector size is taken from the first entry of ``legacy_w2v``; every
    other vector is asserted to have that same length.

    Args:
        legacy_w2v: Non-empty dict mapping word -> vector.

    Returns:
        A ``Word2VecKeyedVectors`` instance holding all entries of the dict.

    Raises:
        IndexError: If ``legacy_w2v`` is empty.
        AssertionError: If a vector's length differs from the first one's.
    """
    first_word = list(legacy_w2v)[0]
    dim = len(legacy_w2v[first_word])
    keyed = Word2VecKeyedVectors(dim)

    tokens, embeddings = [], []
    for token, embedding in legacy_w2v.items():
        tokens.append(token)
        embeddings.append(embedding)
        assert(len(embedding) == dim)

    # replace=True: overwrite the stored vector if a word is already present.
    keyed.add(tokens, embeddings, replace=True)
    return keyed

In [0]:
def cos(a, b, x, y, vectors):
    """Cosine similarity between the offsets ``a - b`` and ``x - y``.

    Args:
        a, b, x, y: Keys looked up in ``vectors``.
        vectors: Mapping (dict, keyed vectors, ...) from key to vector.

    Returns:
        ``(A - B) . (X - Y) / (||A - B|| * ||X - Y||)``; when that denominator
        is exactly 0 it is replaced by ``1e-6``.
    """
    offset_ab = np.array(vectors[a]) - np.array(vectors[b])
    offset_xy = np.array(vectors[x]) - np.array(vectors[y])

    norm_product = np.linalg.norm(offset_ab) * np.linalg.norm(offset_xy)
    if norm_product == 0:
        # At least one offset is the zero vector; avoid dividing by zero.
        norm_product = 1e-6

    return offset_ab.dot(offset_xy) / norm_product


In [14]:
def _best_analogy_match(a, b, x, vectors):
    """Return ``(best_word, best_score)`` for the analogy ``a : b :: x : ?``.

    Scans every word of ``vectors.vocab`` and keeps the first one with the
    highest ``cos(a, b, x, candidate, vectors)``, i.e. whose offset from ``x``
    is most aligned with ``a - b``. Returns ``('', -10)`` when no candidate
    scores above -10.
    """
    best_word, best_score = '', -10
    # The loop variable is local to this helper, so it cannot overwrite the
    # notebook-level `word` variable that holds the embedding file path.
    for candidate in vectors.vocab:
        score = cos(a, b, x, candidate, vectors)
        if score > best_score:
            best_score, best_word = score, candidate
    return best_word, best_score


# Wrap the original and the debiased embeddings so both can be queried alike.
keyedVecs = convert_legacy_to_keyvec(word_vectors)
new_vectors = convert_legacy_to_keyvec(new_hard_word_vectors)

# Analogy to complete: sequence[0] : sequence[1] :: sequence[2] : ?
sequence = ['black', 'musician', 'asian']
#sequence = ['black', 'runner', 'caucasian']
# NOTE(review): the query words themselves are not excluded from the
# candidates, so the answer can be one of them -- confirm this is intended.

# Best completion in the original embedding space.
final_word, cosine = _best_analogy_match(sequence[0], sequence[1], sequence[2], keyedVecs)
print(final_word)

# Best completion after hard debiasing.
final_word1, cosine = _best_analogy_match(sequence[0], sequence[1], sequence[2], new_vectors)
print(final_word1)

programmer
musician
