# Gender Bias in Word Embeddings for Russian Language


In [None]:
import gensim
import scipy
import statistics
import numpy as np
from numpy import mean
import random
import urllib.request
import pandas as pd
import zipfile

## Loading Word Embeddings

from the site https://rusvectores.org/ru/models/


### Word embeddings used for the analysis:

**Word2vec models with Universal Dependencies tags**:


1) ruscorpora_upos_cbow_300_20_2019 (http://vectors.nlpl.eu/repository/20/180.zip)

2) ruwikiruscorpora_upos_skipgram_300_2_2019 (http://vectors.nlpl.eu/repository/20/182.zip)

3) tayga_upos_skipgram_300_2_2019 (http://vectors.nlpl.eu/repository/20/185.zip)

4) news_upos_skipgram_300_5_2019 (http://vectors.nlpl.eu/repository/20/184.zip)

**Fasttext model without tags**

1) geowac_lemmas_none_fasttextskipgram_300_5_2020 (http://vectors.nlpl.eu/repository/20/213.zip)

In [None]:
# Embedding model name -> URL of its zip archive
we_models = {
    "geowac_lemmas_none_fasttextskipgram_300_5_2020":
        "http://vectors.nlpl.eu/repository/20/213.zip",
    "ruscorpora_upos_cbow_300_20_2019":
        "http://vectors.nlpl.eu/repository/20/180.zip",
    "ruwikiruscorpora_upos_skipgram_300_2_2019":
        "http://vectors.nlpl.eu/repository/20/182.zip",
    "tayga_upos_skipgram_300_2_2019":
        "http://vectors.nlpl.eu/repository/20/185.zip",
    "news_upos_skipgram_300_5_2019":
        "http://vectors.nlpl.eu/repository/20/184.zip",
}

In [None]:
# Download one model archive into the models folder ("we_models" by default)
def get_models(model_url, model_name, path_to_save="/we_models/"):
    """Download the archive at ``model_url`` and store it as ``<model_name>.zip``.

    Args:
        model_url: URL of the zip archive to download.
        model_name: Base name of the saved file; ".zip" is appended.
        path_to_save: Destination prefix. It is concatenated with the file
            name as-is, so it should end with a path separator.

    The destination folder is created when it does not exist yet; without
    that, ``urlretrieve`` fails because it cannot open the target file.
    """
    import os  # local import: the file's import cell does not provide `os`

    model_path = path_to_save + model_name + ".zip"
    target_dir = os.path.dirname(model_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    urllib.request.urlretrieve(model_url, model_path)

# Fetch every archive listed in we_models into the default destination folder
for archive_name in we_models:
    get_models(we_models[archive_name], archive_name)

In [None]:
# The fasttext model is loaded from a folder, so its zip has to be unpacked first
fasttext_archive = "we_models/geowac_lemmas_none_fasttextskipgram_300_5_2020.zip"
fasttext_folder = "we_models/geowac_lemmas_none_fasttextskipgram_300_5_2020"
with zipfile.ZipFile(fasttext_archive, "r") as packed_model:
    packed_model.extractall(fasttext_folder)

In [None]:
# Open an embedding model from its archive (word2vec models) or folder (fasttext models)
def open_model(model_name, model_path, is_fasttext=False):
    """Load the embedding model ``model_name`` located under ``model_path``.

    Args:
        model_name: Name of the model; also the base name of its zip archive
            (word2vec) or of its unpacked folder (fasttext).
        model_path: Directory prefix, concatenated with the name as-is, so it
            should end with a path separator.
        is_fasttext: When true, load ``<model_path><model_name>/model.model``
            with ``KeyedVectors.load``; otherwise read ``model.bin`` straight
            from ``<model_path><model_name>.zip`` in binary word2vec format.

    Returns:
        The object returned by the gensim loader.
    """
    if is_fasttext:
        return gensim.models.KeyedVectors.load(model_path + model_name + "/model.model")

    with zipfile.ZipFile(model_path + model_name + ".zip", "r") as archive:
        # Open the member in its own `with` so the stream is closed explicitly
        # once gensim has finished reading it.
        with archive.open("model.bin") as stream:
            return gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

## Word lists

In [None]:
# Attribute word sets: male and female terms, written as lemma_POS tokens
male_terms = [
    "мужчина_NOUN",
    "мужской_ADJ",
    "мальчик_NOUN",
    "брат_NOUN",
    "сын_NOUN",
    "отец_NOUN",
    "папа_NOUN",
    "дедушка_NOUN",
    "дядя_NOUN",
]
female_terms = [
    "женщина_NOUN",
    "женский_ADJ",
    "девочка_NOUN",
    "сестра_NOUN",
    "дочь_NOUN",
    "мать_NOUN",
    "мама_NOUN",
    "бабушка_NOUN",
    "тетя_NOUN",
]

In [None]:
# Target word sets: one file per pair of sets, read from the folder "word_sets"
targ_weat_files = [
    "career_family.txt",
    "math_arts.txt",
    "science_arts.txt",
    "intelligence_appearence.txt",
    "strong_weak.txt",
    "tech_human.txt",
    "rational_emotional.txt",
]

### Getting the vectors for words from the models

In [None]:
# Get the vector of every word in a list from a word embedding
def get_vectors(model, list_of_words, pos_tag=True):
    """Look up each word of ``list_of_words`` in ``model``.

    Args:
        model: Object indexed by word (``model[word]``).
        list_of_words: Words, optionally carrying a ``_POS`` suffix.
        pos_tag: When true, the word is looked up as given; when false,
            everything from the first underscore on is dropped first.
            Surrounding whitespace is stripped in both cases.

    Returns:
        List with the result of ``model[...]`` for every word, in input order.

    Raises:
        Whatever ``model[...]`` raises for an unknown word.
    """
    vectors = []
    for w in list_of_words:
        key = w.strip()
        if not pos_tag:
            # partition() leaves a word without "_" untouched, whereas slicing
            # up to find("_") == -1 would chop off its last character.
            key = key.partition("_")[0]
        vectors.append(model[key])
    return vectors

# Get the vectors for several word sets at once
def get_vectors_for_sets(model, sets, pos_tag=True):
    """Return one list of vectors per word set, in the order of ``sets``.

    Each word set is passed to ``get_vectors`` together with ``model`` and
    ``pos_tag``.
    """
    return [get_vectors(model, word_set, pos_tag) for word_set in sets]

## Functions for measuring bias with the Word Embedding Association Test (WEAT) method



### Calculating test statistic, effect size and p-value

In [None]:
def cosine_vectors(v1, v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``.

    Computed as the dot product divided by both Euclidean norms. A zero
    vector therefore leads to a division by zero (NaN under numpy).
    """
    # Use np.linalg.norm: the file's visible imports provide no bare `norm`.
    return np.dot(v1, v2) / np.linalg.norm(v1) / np.linalg.norm(v2)

def similarity(w, A, B): #similarities between a word and two attribute sets
    """Return the mean cosine similarity of ``w`` to ``A`` and to ``B``.

    Args:
        w: Vector of the word.
        A: Vectors of the first attribute set.
        B: Vectors of the second attribute set.

    Returns:
        Tuple ``(mean similarity to A, mean similarity to B)``.

    Each set is averaged over its own elements, so ``A`` and ``B`` do not
    have to be the same length. An empty set raises ZeroDivisionError.
    """
    w_a_sim = sum(cosine_vectors(w, a) for a in A) / len(A)
    w_b_sim = sum(cosine_vectors(w, b) for b in B) / len(B)
    return w_a_sim, w_b_sim
    
def sets_difference(X, Y, A, B): #calculating test statistic
    """Return the test statistic and the stdev used for the effect size.

    Args:
        X: Vectors of the first target set.
        Y: Vectors of the second target set.
        A: Vectors of the first attribute set.
        B: Vectors of the second attribute set.

    Returns:
        Tuple ``(statistic, stdev_all_words)``. The statistic is the sum over
        X of (mean similarity to A - mean similarity to B) minus the same sum
        over Y. The stdev is taken over all individual mean similarities of
        the words in X and Y to A and to B.

    X and Y are processed independently, so every element of both sets
    contributes even when the sets differ in length.
    """
    x_sims = [similarity(x, A, B) for x in X]
    y_sims = [similarity(y, A, B) for y in Y]

    x_sim = sum(a_sim - b_sim for a_sim, b_sim in x_sims)
    y_sim = sum(a_sim - b_sim for a_sim, b_sim in y_sims)

    # NOTE(review): the usual WEAT effect size divides by the stdev of the
    # per-word differences (sim to A - sim to B); this pools the individual
    # similarities instead. Kept as in the original -- confirm it is intended.
    w_A_B_all = [sim for pair in x_sims + y_sims for sim in pair]
    stdev_all_words = statistics.stdev(w_A_B_all) #for calculating effect size
    return x_sim - y_sim, stdev_all_words

def calc_p_value(X, Y, A, B, original_v, n_iterations=100000): #randomization test
    """Estimate a one-sided p-value for the test statistic by randomization.

    The words of X and Y are pooled and repeatedly split at random into a
    first half and the remainder. The returned value is the fraction of
    splits whose statistic is greater than or equal to ``original_v``.

    Args:
        X: Vectors of the first target set.
        Y: Vectors of the second target set.
        A: Vectors of the first attribute set.
        B: Vectors of the second attribute set.
        original_v: Test statistic of the original (X, Y) split.
        n_iterations: Number of random splits (default 100000).

    Returns:
        Fraction of random splits with statistic >= ``original_v``.

    Raises:
        ValueError: If ``n_iterations`` is smaller than 1.
    """
    if n_iterations < 1:
        raise ValueError("n_iterations must be at least 1")

    # list(...) keeps this a concatenation even if X or Y is a numpy array.
    all_X_Y = list(X) + list(Y)

    # The association of a word (mean similarity to A minus mean similarity
    # to B) does not depend on the split, so compute it once instead of
    # recomputing every cosine on each iteration.
    associations = []
    for w in all_X_Y:
        w_a_sim, w_b_sim = similarity(w, A, B)
        associations.append(w_a_sim - w_b_sim)
    total = sum(associations)

    indices_old = list(range(len(all_X_Y)))
    half = len(all_X_Y) // 2
    n_extreme = 0
    for _ in range(n_iterations):
        x_sum = sum(associations[j] for j in random.sample(indices_old, half))
        y_sum = total - x_sum  # every word not sampled for X belongs to Y
        if x_sum - y_sum >= original_v:
            n_extreme += 1
    return n_extreme / n_iterations

def calc_bias(model, attr1, attr2, pos_tag=True):
    """Return ``(effect_size, p_value)`` for two target word sets.

    The target sets ``attr1`` and ``attr2`` are compared against the
    module-level ``male_terms`` and ``female_terms``. The effect size is the
    test statistic divided by the size of the first target set and by the
    stdev returned from ``sets_difference``.
    """
    word_sets = [attr1, attr2, male_terms, female_terms]
    X, Y, A, B = get_vectors_for_sets(model, word_sets, pos_tag)

    statistic, pooled_stdev = sets_difference(X, Y, A, B)
    mean_statistic = statistic / len(X)
    effect_size = mean_statistic / pooled_stdev

    return effect_size, calc_p_value(X, Y, A, B, statistic)

## Measuring gender bias with the WEAT method 

In [None]:
# Create (or overwrite) the results file and write its CSV header row
file_results = "results.csv"
results_header = "model,attr1,attr2,effect_size,p_value\n"
with open(file_results, "w") as results_out:
    results_out.write(results_header)

In [None]:
we_models_names = ["ruscorpora_upos_cbow_300_20_2019","ruwikiruscorpora_upos_skipgram_300_2_2019", 
                   "tayga_upos_skipgram_300_2_2019", "news_upos_skipgram_300_5_2019",
                   "geowac_lemmas_none_fasttextskipgram_300_5_2020"]

# Run the WEAT measurement for every model and every pair of target word sets,
# appending one CSV row per (model, word-set file) to the results file.
# NOTE(review): "/we_models/" and "/word_sets/" are absolute paths, kept as in
# the original -- confirm they match where the files actually live.
for we in we_models_names:
    # Derive both flags from the model name on every iteration, so a value
    # set for one model never leaks into the next one.
    is_fasttext = "fasttext" in we  # fasttext models are opened from an unpacked folder
    pos_tag = "upos" in we  # only the "upos" models key their words as lemma_POS

    model = open_model(we, "/we_models/",
                       is_fasttext=is_fasttext)

    for attr_weat_f in targ_weat_files:
        file_path = "/word_sets/{}".format(attr_weat_f)
        attr_weat = pd.read_csv(file_path)
        # The first two columns hold the two target word sets; the column
        # headers are used as the set names in the results.
        attr1, attr2 = list(attr_weat.iloc[:,0]), list(attr_weat.iloc[:,1])
        attr1_name, attr2_name = attr_weat.columns[0], attr_weat.columns[1]
        effect_size, p_value = calc_bias(model, attr1, attr2, pos_tag=pos_tag)

        # Append right after each measurement so finished results are kept
        # even if a later, long-running measurement fails.
        with open(file_results, "a") as f:
            f.write("{},{},{},{},{}\n".format(we, attr1_name, attr2_name, effect_size, p_value))