In [1]:
import math
import gensim
import umap
import hdbscan
import numpy as np
import pandas as pd
from ast import literal_eval
from scipy.stats import norm
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

## Load wordvectors

In [2]:
vecs_w2v=gensim.models.KeyedVectors.load('wordvectors/riigikogu_word2vec.wordvectors')

In [3]:
np.expand_dims(vecs_w2v['õed'], axis=0).shape

(1, 300)

## Helpers

In [4]:
#SC-WEAT function
#source: https://github.com/wolferobert3/gender_bias_swe_aies2022/blob/main/gender_association_collector.py
def SC_WEAT(w, A, B, permutations):
    w_normed = w / np.linalg.norm(w)
    A_normed = A / np.linalg.norm(A,axis=-1,keepdims=True)
    B_normed = B / np.linalg.norm(B,axis=-1,keepdims=True)

    A_associations = w_normed @ A_normed.T
    B_associations = w_normed @ B_normed.T
    joint_associations = np.concatenate((A_associations,B_associations),axis=-1)

    test_statistic = np.mean(A_associations) - np.mean(B_associations)
    effect_size = test_statistic / np.std(joint_associations,ddof=1)

    midpoint = len(A)
    sample_distribution = np.array([np.random.permutation(joint_associations) for _ in range(permutations)])
    sample_associations = np.mean(sample_distribution[:,:midpoint],axis=1) - np.mean(sample_distribution[:,midpoint:],axis=1)
    p_value = 1 - norm.cdf(test_statistic,np.mean(sample_associations),np.std(sample_associations,ddof=1))

    return effect_size, p_value

## Get word vectors

In [6]:
#Attribute Words
female_stimuli = ['naise','naine','tüdruk','tüdruku','õde', 'naiste','naistele','naist','õed','tütar', 'naisterahvas', 'tüdrukud', 'tüdrukute', 'ema', 'emad', 'naistel', 'üksikema']
male_stimuli = ['mehe','mees','poiss','poisi', 'vend', 'meeste','meestele','meest','vennad','poeg', 'meesterahvas', 'poisid', 'poiste', 'isa', 'isad', 'meestel', 'üksikisa']

len(set(female_stimuli))==len(set(male_stimuli))

True

In [7]:
def get_word_vecs(model, words, print_every=1000):
    embeds=np.expand_dims(model[words[0]], axis=0)
    for i, word in enumerate(words[1:]):
        if i%print_every==0:
            print(f'working on word {i}')
        embeds=np.append(embeds, np.expand_dims(model[word], axis=0),  axis=0)
    return embeds

In [8]:
female_embeddings=get_word_vecs(vecs_w2v, female_stimuli)
male_embeddings=get_word_vecs(vecs_w2v, male_stimuli)

working on word 0
working on word 0


In [9]:
female_embeddings.shape, male_embeddings.shape

((17, 300), (17, 300))

## Get target words - use frequency and remove stopwords

In [10]:
df=pd.read_csv('data/interim/stenos_cleaned_topics_vecs.csv')
df.shape

(209949, 25)

In [11]:
df.tokes[0]

"['auväärt', 'kolleegid', ',', 'auväärt', 'riigikogu', ',', 'tere', 'hommikust', '!', 'kui', 'te', 'nüüd', 'korraks', 'jälle', 'keskendute', 'meie', 'suurepärase', 'istungi', 'rakendamisele', 'ja', 'räägite', 'omavahelisi', 'jutte', 'edasi', 'pärast', ',', 'siis', 'on', 'mul', 'suur', 'rõõm', 'avada', 'riigikogu', 'täiskogu', 'ii', 'istungjärgu', '9.', 'töönädala', 'teisipäevane', 'istung', '.', 'kõigepealt', 'on', 'eelnõude', 'ja', 'arupärimiste', 'üleandmine', ',', 'kui', 'selleks', 'on', 'soovi', '.', 'eelnõude', 'ja', 'arupärimiste', 'üleandmise', 'soovi', 'ma', 'ei', 'näe', '.', 'siis', 'teeme', 'palun', 'kohaloleku', 'kontrolli', '.', 'kohaloleku', 'kontroll', 'kohal', 'on', '75', 'riigikogu', 'liiget', ',', 'puudub', '26.', 'sellest', 'piisab', 'täielikult', ',', 'et', 'asuda', 'meie', 'päevakorra', 'arutelu', 'juurde', '.']"

In [12]:
with open('utils/estonian-stopwords.txt', encoding='utf-8') as f:
    stopwords=f.read().split('\n')
print(len(stopwords))  
with open('utils/estonian-stopwords_additional.txt', encoding='utf-8') as f:
    stopwords_addit=f.read().split('\n')
    
stopwords.extend(stopwords_addit)
stopwords.extend(['.', ',', '!', '?', ':', ';', ')', '(', '-', ']', '[', '...', '–,'])
stopwords.extend(female_stimuli)
stopwords.extend(male_stimuli)

5295


In [13]:
tokens=df.tokes.apply(literal_eval).tolist()
tokens=[item for sublist in tokens for item in sublist]

In [14]:
len(tokens), len(set(tokens))

(25252052, 420801)

In [15]:
%time cnt=Counter(tokens)

CPU times: total: 2.06 s
Wall time: 2.06 s


In [16]:
cnt['vanapoiss']

0

In [17]:
MIN_COUNT=20
most_common_tokens_cnt=cnt.most_common(100000)
most_common_tokens_cnt=[el for el in most_common_tokens_cnt if el[1]>=MIN_COUNT]
most_common_tokens=[el[0] for el in most_common_tokens_cnt if el[0] not in stopwords]
most_common_tokens=[el for el in most_common_tokens if el in vecs_w2v]
most_common_tokens[:10]

['euroopa',
 'seaduse',
 'raha',
 'eurot',
 'küsimusi',
 'tänan',
 'kolleeg',
 'ettepaneku',
 'inimesi',
 'fraktsiooni']

In [18]:
len(most_common_tokens)

40543

## Calculate SC-WEATS

In [20]:
STEP = 10000
N_ITERS=math.ceil(len(most_common_tokens)/STEP)
PERMUTATIONS = 10000

In [22]:
df_bias=pd.DataFrame()
for i in range(N_ITERS):
    print(i)
    targets = most_common_tokens[i*STEP:(i+1)*STEP]
    bias_array = np.array([SC_WEAT(vecs_w2v[word],
                                   female_embeddings, male_embeddings,PERMUTATIONS) for word in targets])
    bias_df_ = pd.DataFrame(bias_array, columns=['female_effect_size','female_p_value'])
    df_bias=df_bias.append(bias_df_)

0


  df_bias=df_bias.append(bias_df_)


1


  df_bias=df_bias.append(bias_df_)


2


  df_bias=df_bias.append(bias_df_)


3


  df_bias=df_bias.append(bias_df_)


4


  df_bias=df_bias.append(bias_df_)


In [23]:
df_bias.index=most_common_tokens

In [24]:
df_bias[(df_bias.female_effect_size>.5)&(df_bias.female_p_value<.05)].sort_values('female_effect_size', ascending=False).head(30)

Unnamed: 0,female_effect_size,female_p_value
juurutama,1.143367,0.000385
finantseerima,1.017046,0.001592
valmima,1.008965,0.001881
harjuma,0.98848,0.002135
taotlema,0.971773,0.002301
käivituma,0.96122,0.002586
töötama,0.952283,0.002839
määratlema,0.942401,0.003157
lisanduma,0.935941,0.002811
tõstatama,0.927435,0.003317


## Save

In [25]:
df_bias.to_csv('data/interim/gender_bias/words_female_scores_v2.csv')

## Load 

In [26]:
df_bias=pd.read_csv('data/interim/gender_bias/words_female_scores_v2.csv', index_col=0)
df_bias.shape

(40543, 2)

In [27]:
df_bias

Unnamed: 0,female_effect_size,female_p_value
euroopa,0.249238,0.239294
seaduse,0.639835,0.032584
raha,0.061697,0.427547
eurot,0.286011,0.204118
küsimusi,-0.547189,0.945468
...,...,...
lasketiiru,0.227060,0.251788
käsimüügiravimite,0.128099,0.356420
baskini,-0.336560,0.834577
c2,0.381899,0.131180


In [28]:
df_bias_female=df_bias[(df_bias.female_effect_size>=.5)&(df_bias.female_p_value<=.05)].sort_values('female_effect_size', ascending=False)
df_bias_female.shape

(707, 2)

In [29]:
df_bias_female.head(20)

Unnamed: 0,female_effect_size,female_p_value
juurutama,1.143367,0.000385
finantseerima,1.017046,0.001592
valmima,1.008965,0.001881
harjuma,0.98848,0.002135
taotlema,0.971773,0.002301
käivituma,0.96122,0.002586
töötama,0.952283,0.002839
määratlema,0.942401,0.003157
lisanduma,0.935941,0.002811
tõstatama,0.927435,0.003317


In [30]:
df_bias_male=df_bias[(df_bias.female_effect_size<=-.5)&(df_bias.female_p_value>=.95)].sort_values('female_effect_size', ascending=True)
df_bias_male.shape

(564, 2)

In [31]:
df_bias_male.head(20)

Unnamed: 0,female_effect_size,female_p_value
sõerdile,-1.021268,0.998612
peangi,-1.018271,0.998401
jaanus,-1.00413,0.998263
krimm,-0.989838,0.997932
okupeerinud,-0.955118,0.997177
välisministrile,-0.947992,0.996986
hansson,-0.931199,0.99675
õnne,-0.92572,0.996429
viimse,-0.916476,0.996413
andrus,-0.906184,0.996096


## Cluster female and male words

In [33]:
def cosine_similarity(vecs_ar1, vecs_ar2, words=None):
    """returns vecs_ar.shape[0] x vecs_ar.shape[0] dataframe of each word cosine similarity with other words"""
    sims= np.dot(vecs_ar1, vecs_ar2.T)/(np.linalg.norm(vecs_ar1)*np.linalg.norm(vecs_ar2.T))
    df_sims=pd.DataFrame(sims)
    if isinstance(words, list):
        df_sims.index=words
        df_sims.columns=words
    return df_sims

def cluster_sims(df_sims, n_clusters):
    clustering = AgglomerativeClustering(n_clusters=n_clusters, compute_distances=True).fit(df_sims)
    df_clusters=pd.DataFrame({'token':df_sims.index, 'cluster':clustering.labels_})
    df_clusters.index=df_clusters.token
    return clustering, df_clusters

#### word embeddings

In [34]:
df_bias_female_embs=pd.DataFrame(get_word_vecs(vecs_w2v, df_bias_female.index))
df_bias_female_embs.index=df_bias_female.index

working on word 0


In [35]:
df_bias_male_embs=pd.DataFrame(get_word_vecs(vecs_w2v, df_bias_male.index))
df_bias_male_embs.index=df_bias_male.index

working on word 0


#### cosine similarity between vectors

In [36]:
df_bias_female_embs_sims=cosine_similarity(df_bias_female_embs, df_bias_female_embs, list(df_bias_female_embs.index))

In [37]:
df_bias_male_embs_sims=cosine_similarity(df_bias_male_embs, df_bias_male_embs, list(df_bias_male_embs.index))

#### Agglomerative clustering

In [38]:
clust_clf_female_ag, df_female_clusters_ag=cluster_sims(df_bias_female_embs_sims, n_clusters=20)

In [39]:
df_female_clusters_ag.cluster.value_counts()[:20]

12    178
17    127
13     68
10     58
4      39
11     37
16     32
15     27
3      21
2      19
19     18
1      17
8      13
7      12
5      12
6      10
9       7
14      5
18      4
0       3
Name: cluster, dtype: int64

In [41]:
clust_clf_male_ag, df_male_clusters_ag=cluster_sims(df_bias_male_embs_sims, n_clusters=20)

#### HDBSCAN

In [42]:
#first reduce dimensions
reducer = umap.UMAP(random_state=42,n_components=4, metric='cosine')
bias_female_embs_sims_red=dding=reducer.fit_transform(df_bias_female_embs_sims)

In [43]:
#first reduce dimensions
reducer = umap.UMAP(random_state=42,n_components=4, metric='cosine')
bias_male_embs_sims_red=dding=reducer.fit_transform(df_bias_male_embs_sims)

In [44]:
bias_female_embs_sims_red.shape, bias_male_embs_sims_red.shape

((707, 4), (564, 4))

In [45]:
clust_hdb = hdbscan.HDBSCAN()
clust_hdb.fit(pd.DataFrame(bias_female_embs_sims_red))

HDBSCAN()

In [46]:
#number of clusters
len(set(clust_hdb.labels_))

20

In [47]:
clust_hdb_male = hdbscan.HDBSCAN()
clust_hdb_male.fit(pd.DataFrame(bias_male_embs_sims_red))

HDBSCAN()

In [48]:
df_female_clusters_ag['cluster_hdb']=clust_hdb.labels_
df_male_clusters_ag['cluster_hdb']=clust_hdb_male.labels_

In [49]:
df_female_clusters_ag.cluster_hdb.value_counts()

 0     249
-1     139
 2      33
 8      33
 14     32
 4      22
 9      21
 18     20
 17     19
 15     19
 12     18
 11     18
 5      17
 7      16
 6      10
 13      9
 10      9
 3       8
 16      8
 1       7
Name: cluster_hdb, dtype: int64

In [50]:
df_female_clusters_ag

Unnamed: 0_level_0,token,cluster,cluster_hdb
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
juurutama,juurutama,13,0
finantseerima,finantseerima,10,0
valmima,valmima,16,0
harjuma,harjuma,13,0
taotlema,taotlema,11,0
...,...,...,...
selguma,selguma,16,0
kontrolliga,kontrolliga,12,17
kasvama,kasvama,9,0
halval,halval,12,3


In [51]:
#add it to original data
df_bias_female['clust_ag']=df_female_clusters_ag.cluster
df_bias_female['cluster_hdb']=df_female_clusters_ag.cluster_hdb

In [52]:
df_bias_male['clust_ag']=df_male_clusters_ag.cluster
df_bias_male['cluster_hdb']=df_male_clusters_ag.cluster_hdb

In [53]:
vecs_w2v.most_similar('poiss', topn=20)

[('tüdruk', 0.713146448135376),
 ('mees', 0.6667014956474304),
 ('neiu', 0.6330894231796265),
 ('tütarlaps', 0.6147922277450562),
 ('poeg', 0.6108688116073608),
 ('noormees', 0.6090396642684937),
 ('naine', 0.6007523536682129),
 ('sõber', 0.6000964045524597),
 ('meesterahvas', 0.5992476940155029),
 ('tütar', 0.5898628830909729),
 ('kirikuõpetaja', 0.5848573446273804),
 ('vanaema', 0.5753918886184692),
 ('härrasmees', 0.5677305459976196),
 ('naisterahvas', 0.5645389556884766),
 ('pereisa', 0.5622426271438599),
 ('teadlane', 0.5608881711959839),
 ('vallavanem', 0.5470516681671143),
 ('toots', 0.5462425947189331),
 ('onu', 0.5456746220588684),
 ('noor', 0.5399383306503296)]

## Female clusters

In [56]:
df_bias_female.groupby('clust_ag')['female_effect_size'].median().sort_values(ascending=False).head(20)

clust_ag
14    0.793752
18    0.749313
9     0.738344
7     0.726154
13    0.702765
10    0.700774
19    0.694814
5     0.687891
16    0.677394
11    0.672608
6     0.665499
0     0.660479
15    0.644722
8     0.635444
4     0.635347
17    0.629783
1     0.620228
3     0.618650
12    0.616322
2     0.601868
Name: female_effect_size, dtype: float64

In [58]:
df_bias_female.groupby('cluster_hdb')['female_effect_size'].median().sort_values(ascending=False).head(20)

cluster_hdb
 4     0.696766
 0     0.682258
 13    0.681936
 9     0.672905
 11    0.662436
 7     0.644667
 2     0.643240
-1     0.641923
 5     0.621271
 16    0.616685
 12    0.615988
 8     0.613171
 3     0.611304
 14    0.603983
 17    0.603552
 1     0.602309
 10    0.601868
 6     0.597728
 15    0.590381
 18    0.587914
Name: female_effect_size, dtype: float64

In [59]:
df_bias_female[df_bias_female.cluster_hdb==-1]

Unnamed: 0,female_effect_size,female_p_value,clust_ag,cluster_hdb
saatekirja,0.918707,0.003713,17,-1
hindamist,0.881957,0.005014,3,-1
järeldusele,0.858506,0.006025,18,-1
koolitaja,0.854213,0.006215,12,-1
hindamise,0.820778,0.008568,3,-1
...,...,...,...,...
logopeedi,0.568674,0.048773,17,-1
kutsehaiguskindlustuse,0.566853,0.046084,2,-1
ravi-,0.565339,0.049932,12,-1
leidsime,0.562295,0.048216,18,-1


## Male clusters

In [60]:
df_bias_male.groupby('cluster_hdb')['female_effect_size'].median().sort_values(ascending=True).head(20)

cluster_hdb
 15   -0.711778
 17   -0.699287
 11   -0.663897
 21   -0.663516
 13   -0.662770
 24   -0.641110
 8    -0.637718
 5    -0.636074
 23   -0.633952
 20   -0.633385
-1    -0.631080
 1    -0.629136
 7    -0.627704
 0    -0.625395
 3    -0.624610
 22   -0.622789
 16   -0.618555
 14   -0.618506
 19   -0.617536
 10   -0.617502
Name: female_effect_size, dtype: float64

## Create 2d vectors for visualization

In [62]:
df_bias_all_embs=df_bias_female_embs.append(df_bias_male_embs)
df_bias_all_embs_sims=cosine_similarity(df_bias_all_embs, df_bias_all_embs, list(df_bias_all_embs.index))

  df_bias_all_embs=df_bias_female_embs.append(df_bias_male_embs)


In [63]:
reducer = umap.UMAP(random_state=42,n_components=2, metric='cosine')
bias_all_embs_sims_red=reducer.fit_transform(df_bias_all_embs)

In [64]:
df_bias_all_embs_sims_red=pd.DataFrame(bias_all_embs_sims_red)

In [65]:
df_bias_female['x']=df_bias_all_embs_sims_red[0].tolist()[:df_bias_female.shape[0]]
df_bias_female['y']=df_bias_all_embs_sims_red[1].tolist()[:df_bias_female.shape[0]]

In [66]:
df_bias_male['x']=df_bias_all_embs_sims_red[0].tolist()[df_bias_female.shape[0]:]
df_bias_male['y']=df_bias_all_embs_sims_red[1].tolist()[df_bias_female.shape[0]:]

In [67]:
df_bias_male

Unnamed: 0,female_effect_size,female_p_value,clust_ag,cluster_hdb,x,y
sõerdile,-1.021268,0.998612,16,5,-1.574706,1.658165
peangi,-1.018271,0.998401,1,1,2.760018,2.916470
jaanus,-1.004130,0.998263,0,10,-0.673930,2.131979
krimm,-0.989838,0.997932,1,12,0.551765,0.279491
okupeerinud,-0.955118,0.997177,1,23,1.756345,1.445126
...,...,...,...,...,...,...
aktsiisipoliitikas,-0.558500,0.950115,1,23,1.561862,0.362474
nestor,-0.558294,0.950191,9,10,-1.268597,1.833280
oskagi,-0.557308,0.950345,2,3,2.680472,2.496277
pomerants,-0.556146,0.950882,9,10,-1.137863,1.665205


## Add counts

In [68]:
df_bias_male['count']=[cnt[x] for x in df_bias_male.index]
df_bias_female['count']=[cnt[x] for x in df_bias_female.index]

## Save

In [69]:
df_bias_male.to_csv('data/interim/gender_bias/separated/bias_male_w2v.csv')

In [70]:
df_bias_female.to_csv('data/interim/gender_bias/separated/bias_female_w2v.csv')