In [1]:
import math
import gensim
import umap
import hdbscan
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter
from sklearn.cluster import AgglomerativeClustering
from src.gender_bias import SC_WEAT, get_word_vecs

## Load wordvectors

In [89]:
vecs_w2v=gensim.models.KeyedVectors.load('wordvectors/riigikogu_ft.wordvectors')

In [90]:
np.expand_dims(vecs_w2v['õed'], axis=0).shape

(1, 300)

In [155]:
vecs_w2v.most_similar('kool')

[('koolkond', 0.8596926331520081),
 ('e-kool', 0.8544713854789734),
 ('keskkool', 0.8158486485481262),
 ('koolilaps', 0.8155820965766907),
 ('koolipoiss', 0.8018566370010376),
 ('koolibuss', 0.7951183915138245),
 ('balletikool', 0.7884907722473145),
 ('kunstikool', 0.7793153524398804),
 ('algkool', 0.7788565754890442),
 ('koolivend', 0.7765803933143616)]

## Get word vectors

In [93]:
#Attribute Words
female_stimuli = ['naise','naine','tüdruk','tüdruku','õde', 'naiste','naistele','naist','õed','tütar', 'naisterahvas', 'tüdrukud', 'tüdrukute', 'ema', 'emad', 'naistel', 'üksikema']
male_stimuli = ['mehe','mees','poiss','poisi', 'vend', 'meeste','meestele','meest','vennad','poeg', 'meesterahvas', 'poisid', 'poiste', 'isa', 'isad', 'meestel', 'üksikisa']

len(set(female_stimuli))==len(set(male_stimuli))

True

In [95]:
female_embeddings=get_word_vecs(vecs_w2v, female_stimuli)
male_embeddings=get_word_vecs(vecs_w2v, male_stimuli)

working on word 0
working on word 0


In [96]:
female_embeddings.shape, male_embeddings.shape

((17, 300), (17, 300))

## Get target words - use frequency and remove stopwords

In [97]:
df=pd.read_csv('data/interim/stenos_cleaned_topics_vecs.csv')
df.shape

(209949, 25)

In [98]:
df.tokes[0]

"['auväärt', 'kolleegid', ',', 'auväärt', 'riigikogu', ',', 'tere', 'hommikust', '!', 'kui', 'te', 'nüüd', 'korraks', 'jälle', 'keskendute', 'meie', 'suurepärase', 'istungi', 'rakendamisele', 'ja', 'räägite', 'omavahelisi', 'jutte', 'edasi', 'pärast', ',', 'siis', 'on', 'mul', 'suur', 'rõõm', 'avada', 'riigikogu', 'täiskogu', 'ii', 'istungjärgu', '9.', 'töönädala', 'teisipäevane', 'istung', '.', 'kõigepealt', 'on', 'eelnõude', 'ja', 'arupärimiste', 'üleandmine', ',', 'kui', 'selleks', 'on', 'soovi', '.', 'eelnõude', 'ja', 'arupärimiste', 'üleandmise', 'soovi', 'ma', 'ei', 'näe', '.', 'siis', 'teeme', 'palun', 'kohaloleku', 'kontrolli', '.', 'kohaloleku', 'kontroll', 'kohal', 'on', '75', 'riigikogu', 'liiget', ',', 'puudub', '26.', 'sellest', 'piisab', 'täielikult', ',', 'et', 'asuda', 'meie', 'päevakorra', 'arutelu', 'juurde', '.']"

In [99]:
with open('utils/estonian-stopwords.txt', encoding='utf-8') as f:
    stopwords=f.read().split('\n')
print(len(stopwords))  
with open('utils/estonian-stopwords_additional.txt', encoding='utf-8') as f:
    stopwords_addit=f.read().split('\n')
    
stopwords.extend(stopwords_addit)
stopwords.extend(['.', ',', '!', '?', ':', ';', ')', '(', '-', ']', '[', '...', '–,'])
stopwords.extend(female_stimuli)
stopwords.extend(male_stimuli)

5295


In [100]:
tokens=df.tokes.apply(literal_eval).tolist()
tokens=[item for sublist in tokens for item in sublist]

In [101]:
len(tokens), len(set(tokens))

(25252052, 420801)

In [102]:
%time cnt=Counter(tokens)

CPU times: total: 2.25 s
Wall time: 2.25 s


In [103]:
cnt['vanapoiss']

0

In [104]:
MIN_COUNT=20
most_common_tokens_cnt=cnt.most_common(100000)
most_common_tokens_cnt=[el for el in most_common_tokens_cnt if el[1]>=MIN_COUNT]
most_common_tokens=[el[0] for el in most_common_tokens_cnt if el[0] not in stopwords]
most_common_tokens=[el for el in most_common_tokens if el in vecs_w2v]
most_common_tokens[:10]

['euroopa',
 'seaduse',
 'raha',
 'eurot',
 'küsimusi',
 'tänan',
 'kolleeg',
 'ettepaneku',
 'inimesi',
 'fraktsiooni']

In [105]:
len(most_common_tokens)

40560

## Calculate WEATS

In [107]:
STEP = 10000
N_ITERS=math.ceil(len(most_common_tokens)/STEP)
PERMUTATIONS = 10000

In [109]:
df_bias=pd.DataFrame()
for i in range(N_ITERS):
    print(i)
    targets = most_common_tokens[i*STEP:(i+1)*STEP]
    bias_array = np.array([SC_WEAT(vecs_w2v[word],
                                   female_embeddings, male_embeddings,PERMUTATIONS) for word in targets])
    bias_df_ = pd.DataFrame(bias_array, columns=['female_effect_size','female_p_value'])
    df_bias=df_bias.append(bias_df_)

0


  df_bias=df_bias.append(bias_df_)


1


  df_bias=df_bias.append(bias_df_)


2


  df_bias=df_bias.append(bias_df_)


3


  df_bias=df_bias.append(bias_df_)


4


  df_bias=df_bias.append(bias_df_)


In [110]:
df_bias.index=most_common_tokens

In [111]:
df_bias[(df_bias.female_effect_size>.5)&(df_bias.female_p_value<.05)].sort_values('female_effect_size', ascending=False).head(30)

Unnamed: 0,female_effect_size,female_p_value
psüühika-,1.337927,4.3e-05
psüühikahäirega,1.164181,0.000307
õppe-,1.086988,0.000781
psüühikahäire,1.078061,0.000909
kehvemasse,1.00758,0.001547
nooremas,0.993021,0.001899
täiskasvanu,0.960464,0.002331
täiskasvanuna,0.941072,0.003254
õppejõu,0.930382,0.003201
neiu,0.922881,0.003975


## Save

In [112]:
df_bias.to_csv('data/interim/gender_bias/words_female_scores_ft_v2.csv')

## Load 

In [113]:
df_bias=pd.read_csv('data/interim/gender_bias/words_female_scores_ft_v2.csv', index_col=0)
df_bias.shape

(40560, 2)

In [114]:
df_bias

Unnamed: 0,female_effect_size,female_p_value
euroopa,0.162927,0.320740
seaduse,0.167700,0.311534
raha,-0.516739,0.933825
eurot,0.107782,0.379906
küsimusi,-0.215929,0.737552
...,...,...
lasketiiru,-0.219089,0.739115
käsimüügiravimite,0.232356,0.249476
baskini,-0.367254,0.857242
c2,0.699231,0.020415


In [115]:
df_bias_female=df_bias[(df_bias.female_effect_size>=.5)&(df_bias.female_p_value<=.05)].sort_values('female_effect_size', ascending=False)
df_bias_female.shape

(331, 2)

In [116]:
df_bias_female.head(20)

Unnamed: 0,female_effect_size,female_p_value
psüühika-,1.337927,4.3e-05
psüühikahäirega,1.164181,0.000307
õppe-,1.086988,0.000781
psüühikahäire,1.078061,0.000909
kehvemasse,1.00758,0.001547
nooremas,0.993021,0.001899
täiskasvanu,0.960464,0.002331
täiskasvanuna,0.941072,0.003254
õppejõu,0.930382,0.003201
neiu,0.922881,0.003975


In [156]:
df_bias_male=df_bias[(df_bias.female_effect_size<=-.5)&(df_bias.female_p_value>=.95)].sort_values('female_effect_size', ascending=True)
df_bias_male.shape

(1330, 2)

In [157]:
df_bias_male.head(20)

Unnamed: 0,female_effect_size,female_p_value
loosungiga,-1.244283,0.999865
tormata,-1.202379,0.999771
ideena,-1.183721,0.999731
nimekiri,-1.166723,0.999666
toompea,-1.161083,0.999634
tuld,-1.160888,0.999632
lipu,-1.151933,0.999635
käekiri,-1.131399,0.999541
top-up,-1.11227,0.999385
barack,-1.102836,0.999403


## Cluster female and male words

In [197]:
def cosine_similarity(vecs_ar1, vecs_ar2, words=None):
    """returns vecs_ar.shape[0] x vecs_ar.shape[0] dataframe of each word cosine similarity with other words"""
    sims= np.dot(vecs_ar1, vecs_ar2.T)/(np.linalg.norm(vecs_ar1)*np.linalg.norm(vecs_ar2.T))
    df_sims=pd.DataFrame(sims)
    if isinstance(words, list):
        df_sims.index=words
        df_sims.columns=words
    return df_sims

def cluster_sims(df_sims, n_clusters):
    clustering = AgglomerativeClustering(n_clusters=n_clusters, compute_distances=True).fit(df_sims)
    df_clusters=pd.DataFrame({'token':df_sims.index, 'cluster':clustering.labels_})
    df_clusters.index=df_clusters.token
    return clustering, df_clusters

#### word embeddings

In [198]:
df_bias_female_embs=pd.DataFrame(get_word_vecs(vecs_w2v, df_bias_female.index))
df_bias_female_embs.index=df_bias_female.index

working on word 0


In [199]:
df_bias_male_embs=pd.DataFrame(get_word_vecs(vecs_w2v, df_bias_male.index))
df_bias_male_embs.index=df_bias_male.index

working on word 0
working on word 1000


#### cosine similarity between vectors

In [200]:
df_bias_female_embs_sims=cosine_similarity(df_bias_female_embs, df_bias_female_embs, list(df_bias_female_embs.index))
# df_bias_female_embs_sims

In [201]:
df_bias_male_embs_sims=cosine_similarity(df_bias_male_embs, df_bias_male_embs, list(df_bias_male_embs.index))
# df_bias_male_embs

#### Agglomerative clustering

In [230]:
clust_clf_female_ag, df_female_clusters_ag=cluster_sims(df_bias_female_embs_sims, n_clusters=20)

In [231]:
df_female_clusters_ag.cluster.value_counts()[:20]

3     50
1     41
9     40
11    35
2     27
5     21
6     18
4     16
19    15
12    11
7     10
14    10
0      9
16     8
10     8
15     7
8      2
17     1
18     1
13     1
Name: cluster, dtype: int64

In [232]:
clust_clf_male_ag, df_male_clusters_ag=cluster_sims(df_bias_male_embs_sims, n_clusters=20)

#### HDBSCAN

In [233]:
#first reduce dimensions
reducer = umap.UMAP(random_state=42,n_components=4, metric='cosine')
bias_female_embs_sims_red=reducer.fit_transform(df_bias_female_embs_sims)

In [234]:
#first reduce dimensions
reducer = umap.UMAP(random_state=42,n_components=4, metric='cosine')
bias_male_embs_sims_red=reducer.fit_transform(df_bias_male_embs_sims)

In [235]:
bias_female_embs_sims_red.shape, bias_male_embs_sims_red.shape

((331, 4), (1330, 4))

In [236]:
clust_hdb = hdbscan.HDBSCAN()
clust_hdb.fit(pd.DataFrame(bias_female_embs_sims_red))

HDBSCAN()

In [237]:
#number of clusters
len(set(clust_hdb.labels_))

9

In [238]:
clust_hdb_male = hdbscan.HDBSCAN()
clust_hdb_male.fit(pd.DataFrame(bias_male_embs_sims_red))

HDBSCAN()

In [239]:
df_female_clusters_ag['cluster_hdb']=clust_hdb.labels_
df_male_clusters_ag['cluster_hdb']=clust_hdb_male.labels_

In [240]:
df_female_clusters_ag.cluster_hdb.value_counts()

 6    98
 7    67
 4    43
 1    42
 0    36
 5    21
 2    16
 3     7
-1     1
Name: cluster_hdb, dtype: int64

In [241]:
df_female_clusters_ag

Unnamed: 0_level_0,token,cluster,cluster_hdb
token,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
psüühika-,psüühika-,3,6
psüühikahäirega,psüühikahäirega,3,6
õppe-,õppe-,0,7
psüühikahäire,psüühikahäire,3,6
kehvemasse,kehvemasse,15,3
...,...,...,...
levima,levima,10,1
leiva,leiva,19,6
lähisuhtevägivalla,lähisuhtevägivalla,3,6
hilisemal,hilisemal,1,4


In [242]:
#add it to original data
df_bias_female['clust_ag']=df_female_clusters_ag.cluster
df_bias_female['cluster_hdb']=df_female_clusters_ag.cluster_hdb

In [243]:
df_bias_male['clust_ag']=df_male_clusters_ag.cluster
df_bias_male['cluster_hdb']=df_male_clusters_ag.cluster_hdb

## Female clusters

In [244]:
df_bias_female.groupby('clust_ag')['female_effect_size'].median().sort_values(ascending=False).head(20)

clust_ag
13    0.796828
0     0.751974
12    0.723268
2     0.707655
1     0.694861
15    0.660072
9     0.646263
3     0.645838
6     0.642157
14    0.635579
4     0.624965
17    0.622648
7     0.620554
16    0.618817
18    0.618424
5     0.617519
11    0.609718
10    0.599374
19    0.596353
8     0.590170
Name: female_effect_size, dtype: float64

In [246]:
df_bias_female.groupby('cluster_hdb')['female_effect_size'].median().sort_values(ascending=False).head(20)

cluster_hdb
 4    0.694861
 3    0.660072
 7    0.655614
 6    0.640821
 5    0.632836
 2    0.624965
 1    0.615174
 0    0.612225
-1    0.566590
Name: female_effect_size, dtype: float64

## Male clusters

In [247]:
df_bias_male.groupby('cluster_hdb')['female_effect_size'].median().sort_values(ascending=True).head(20)

cluster_hdb
 8    -0.720913
 17   -0.716080
 10   -0.698120
 6    -0.694866
 1    -0.694566
 14   -0.693734
 5    -0.692701
 26   -0.689394
 15   -0.687362
 24   -0.681173
 2    -0.680988
 4    -0.680490
 23   -0.678033
 31   -0.674806
 21   -0.670588
 18   -0.670250
 19   -0.667749
 3    -0.667253
 0    -0.658267
-1    -0.657993
Name: female_effect_size, dtype: float64

## Create 2d vectors for visualization

In [249]:
df_bias_all_embs=df_bias_female_embs.append(df_bias_male_embs)
df_bias_all_embs_sims=cosine_similarity(df_bias_all_embs, df_bias_all_embs, list(df_bias_all_embs.index))

  df_bias_all_embs=df_bias_female_embs.append(df_bias_male_embs)


In [250]:
reducer = umap.UMAP(random_state=42,n_components=2, metric='cosine')
bias_all_embs_sims_red=reducer.fit_transform(df_bias_all_embs)

In [251]:
df_bias_all_embs_sims_red=pd.DataFrame(bias_all_embs_sims_red)

In [252]:
df_bias_female['x']=df_bias_all_embs_sims_red[0].tolist()[:df_bias_female.shape[0]]
df_bias_female['y']=df_bias_all_embs_sims_red[1].tolist()[:df_bias_female.shape[0]]

In [253]:
df_bias_male['x']=df_bias_all_embs_sims_red[0].tolist()[df_bias_female.shape[0]:]
df_bias_male['y']=df_bias_all_embs_sims_red[1].tolist()[df_bias_female.shape[0]:]

In [254]:
df_bias_male

Unnamed: 0,female_effect_size,female_p_value,clust_ag,cluster_hdb,x,y
loosungiga,-1.244283,0.999865,10,2,12.384358,11.696504
tormata,-1.202379,0.999771,5,4,3.907009,10.012736
ideena,-1.183721,0.999731,3,5,10.622167,9.758638
nimekiri,-1.166723,0.999666,1,28,11.647769,10.540583
toompea,-1.161083,0.999634,2,-1,9.992006,12.730493
...,...,...,...,...,...,...
tonn,-0.560028,0.950856,1,-1,8.634193,10.261371
toe,-0.559997,0.950635,3,5,10.308892,10.477944
delegatsioon,-0.557763,0.950354,9,16,8.843691,9.980645
barjääre,-0.556434,0.950183,4,-1,7.086227,9.688588


## Add counts

In [257]:
df_bias_male['count']=[cnt[x] for x in df_bias_male.index]
df_bias_female['count']=[cnt[x] for x in df_bias_female.index]

## Save

In [258]:
df_bias_male.to_csv('data/interim/gender_bias/separated/bias_male_ft.csv')

In [259]:
df_bias_female.to_csv('data/interim/gender_bias/separated/bias_female_ft.csv')