In [1]:
# Set this to the directory that contains the downloaded historical embeddings,
# i.e. the folder holding the `eng-fiction-all` directory that the loading code
# below reads from. Example:
path_to_historical_emb = 'C:\\Users\\user\\Downloads'

In [2]:
import os
import pickle
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
from scipy.linalg import orthogonal_procrustes
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity

from noise_aware import noise_aware

%matplotlib inline

In [3]:
# load embeddings and index them
def load_historical_emb(year):
    """Load the embedding matrix, vocabulary and POS data for one year.

    Files are read from ``<path_to_historical_emb>/eng-fiction-all``:
    ``sgns/<year>-vocab.pkl``, ``sgns/<year>-w.npy`` and ``pos/<year>-pos.pkl``.

    Parameters
    ----------
    year : int or str
        Year used as the file-name prefix (converted with ``str``).

    Returns
    -------
    tuple
        ``(emb_array, words, word2idx, idx2word, pos)`` where ``emb_array`` is
        the array loaded from the ``.npy`` file, ``words`` is the vocabulary as
        a NumPy array, ``word2idx`` / ``idx2word`` map between vocabulary
        entries and their positions, and ``pos`` is the unpickled POS object.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only point this at trusted files.
    """
    # os.path.join keeps the paths valid on every OS, not only on Windows.
    base_dir = os.path.join(path_to_historical_emb, 'eng-fiction-all')
    sgns_dir = os.path.join(base_dir, 'sgns')

    # Context managers make sure the file handles are closed after reading.
    with open(os.path.join(sgns_dir, str(year) + '-vocab.pkl'), 'rb') as f:
        iw = pickle.load(f)
    emb_array = np.load(os.path.join(sgns_dir, str(year) + '-w.npy'))

    word2idx = {word: i for i, word in enumerate(iw)}
    idx2word = {i: word for i, word in enumerate(iw)}

    with open(os.path.join(base_dir, 'pos', str(year) + '-pos.pkl'), 'rb') as f:
        pos = pickle.load(f)
    return emb_array, np.array(iw), word2idx, idx2word, pos

In [4]:
# get top changed words
def get_most_changed_words(A, B, idx2word, Q, num_of_words=10, f_indices=None,
                           years=(1900, 1990)):
    """Print the words whose aligned vectors are least similar between A and B.

    For every row ``i`` the cosine similarity between ``A[i] @ Q`` and ``B[i]``
    is computed. Words are then visited from the lowest similarity upwards and
    printed if they pass the frequency and word-list filters, until
    ``num_of_words`` words have been printed or the vocabulary is exhausted.

    Parameters
    ----------
    A, B : 2-D arrays
        Embedding matrices; row ``i`` of both is compared, so they are assumed
        to be row-aligned to the same vocabulary.
    idx2word : mapping
        Maps a row index to its word.
    Q : 2-D array
        Matrix applied to the rows of ``A`` before the comparison.
    num_of_words : int, optional
        Maximum number of words to print.
    f_indices : optional
        Unused; kept so existing callers keep working.
    years : tuple, optional
        The two keys looked up in each word's frequency entry. Defaults to
        ``(1900, 1990)``.

    Notes
    -----
    Reads ``word_lists/full-nstop_nproper.pkl`` and ``freqs.pkl`` from
    ``<path_to_historical_emb>/eng-fiction-all``. ``freqs`` is indexed as
    ``freqs[word][year]``; a word missing from it raises ``KeyError``.
    ``pickle.load`` can execute arbitrary code; only use trusted files.
    """
    # Minimum frequency a word needs in both years. This must be the float
    # literal 1e-5: in Python `10^-5` is a bitwise XOR and evaluates to -15.
    threshold = 1e-5
    first_year, second_year = years

    base_dir = os.path.join(path_to_historical_emb, 'eng-fiction-all')
    # load dict of full non-stop non-proper nouns words
    with open(os.path.join(base_dir, 'word_lists', 'full-nstop_nproper.pkl'), 'rb') as f:
        full_nstop_nproper = pickle.load(f, encoding='latin1')
    # load frequencies
    with open(os.path.join(base_dir, 'freqs.pkl'), 'rb') as f:
        freqs = pickle.load(f, encoding='latin1')

    n = A.shape[0]
    distances_vector_indexed = []
    for i in range(n):
        sim = 1 - cosine(np.dot(A[i, :], Q), B[i, :])
        distances_vector_indexed.append((sim, idx2word[i]))

    # Ascending similarity: the most changed words come first.
    distances_vector_sorted = sorted(distances_vector_indexed)

    # print top changed words
    # Walk the whole ranking and count only the printed words. Advancing the
    # position unconditionally means a filtered-out word is skipped instead of
    # being re-tested forever, and the loop ends when the vocabulary runs out.
    printed = 0
    for sim, word in distances_vector_sorted:
        if printed >= num_of_words:
            break
        frequent_enough = (freqs[word][first_year] > threshold
                           and freqs[word][second_year] > threshold)
        if frequent_enough and word in full_nstop_nproper:
            print(printed, 'sim:', round(sim, 3), 'word:', word)
            printed += 1

In [5]:
# clean zeroed embeddings
def clean_zeros(nonzero_idxs, array, words):
    """Keep only the selected rows and rebuild the word/index maps.

    Parameters
    ----------
    nonzero_idxs : iterable of int
        Row indices to keep. The iteration order defines the new row order.
    array : 2-D NumPy array
        Matrix whose rows are selected.
    words : 1-D NumPy array
        Words aligned with the rows of ``array``.

    Returns
    -------
    tuple
        ``(array, word2idx, idx2word)`` for the kept rows, with indices
        renumbered from 0 in the kept order.
    """
    # Materialise the indices once so the rows and the words are always
    # selected with the exact same sequence (also for one-shot iterables).
    keep = list(nonzero_idxs)
    array = array[keep, :]
    words = words[keep]
    word2idx = {word: i for i, word in enumerate(words)}
    idx2word = {i: word for i, word in enumerate(words)}
    return array, word2idx, idx2word

Load historical embeddings

In [6]:
# Load the 1900 and 1990 embeddings; each call returns
# (matrix, words, word->index, index->word, pos).
(
    mat1900,
    words1900,
    word2idx1900,
    idx2word1900,
    pos1900,
) = load_historical_emb(1900)
(
    mat1990,
    words1990,
    word2idx1990,
    idx2word1990,
    pos1990,
) = load_historical_emb(1990)

In [7]:
# reorder matrices
# Permute the 1990 rows into the 1900 vocabulary order so that row i of both
# matrices refers to the same word. Raises KeyError if a 1900 word is missing
# from the 1990 vocabulary.
idx_list = [word2idx1990[idx2word1900[i]] for i in range (0,len(word2idx1900))]
array_1900 = mat1900
array_1990 = mat1990[idx_list, :]
# NOTE(review): words1990 is overwritten in place, so re-running this cell
# without re-running the loading cell applies the permutation twice.
words1990 = words1990[idx_list]

# clean zero embeddings
# os.path.join keeps the path valid on every OS; `with` closes the file handle.
# pickle.load can execute arbitrary code -- only use trusted files.
with open(os.path.join(path_to_historical_emb, 'eng-fiction-all', 'word_lists',
                       'full-nstop_nproper.pkl'), 'rb') as f:
    full_nstop_nproper = pickle.load(f, encoding='latin1')

# Rows that contain at least one non-zero entry in each year.
rows1900, _ = np.nonzero(array_1900)
idxs_1900 = set(rows1900)
rows1990, _ = np.nonzero(array_1990)
idxs_1990 = set(rows1990)
nonzero_idxs = idxs_1900.intersection(idxs_1990)
# Keep only rows whose word is in the non-stop, non-proper-noun word list.
nstop_nproper_idx = {i for i, word in enumerate(words1900) if word in full_nstop_nproper}
clean_idxs = nonzero_idxs.intersection(nstop_nproper_idx)

# Both matrices are filtered with the same index set so their rows stay aligned.
array_1900, word2idx_ordered, idx2word_ordered = clean_zeros(clean_idxs, array_1900, words1900)
array_1990, _, _ = clean_zeros(clean_idxs, array_1990, words1990)

n, dim = array_1900.shape
# Plain orthogonal Procrustes solution mapping array_1900 onto array_1990.
init_Q, _ = orthogonal_procrustes(array_1900, array_1990)

Noise Aware Alignment

In [8]:
# Run the noise-aware alignment on the cleaned, row-aligned matrices.
Q_pred, alpha_pred, t_indices_pred, f_indices_pred = noise_aware(
    array_1900,
    array_1990,
)

iter: 0 alpha: 0.57 sigma: 0.003 sigmay 0.003
iter: 1 alpha: 0.553 sigma: 0.003 sigmay 0.003
iter: 2 alpha: 0.548 sigma: 0.003 sigmay 0.003
iter: 3 alpha: 0.547 sigma: 0.003 sigmay 0.003
iter: 4 alpha: 0.546 sigma: 0.003 sigmay 0.003
iter: 5 alpha: 0.546 sigma: 0.003 sigmay 0.003
iter: 6 alpha: 0.545 sigma: 0.003 sigmay 0.003
iter: 7 alpha: 0.545 sigma: 0.003 sigmay 0.003
iter: 8 alpha: 0.545 sigma: 0.003 sigmay 0.003
iter: 9 alpha: 0.545 sigma: 0.003 sigmay 0.003


In [10]:
# Print the 10 least-similar words under the predicted alignment Q_pred.
get_most_changed_words(
    array_1900,
    array_1990,
    idx2word_ordered,
    Q=Q_pred,
    num_of_words=10,
    f_indices=f_indices_pred,
)

0 sim: -0.003 word: guy
1 sim: 0.073 word: 31
2 sim: 0.084 word: ignored
3 sim: 0.087 word: overdue
4 sim: 0.088 word: 2
5 sim: 0.093 word: vis
6 sim: 0.108 word: ad
7 sim: 0.115 word: notices
8 sim: 0.121 word: random
9 sim: 0.123 word: 27


In [11]:
# For a hand-picked list of words, report the cosine similarity between the
# Q_pred-aligned 1900 vector and the 1990 vector, and whether the word's row
# index appears in t_indices_pred.
words = ['wanting', 'gay', 'check', 'starting', 'major', 'actually', 'touching', 'harry', 'headed', 'romance']

for word in words:
    row = word2idx_ordered[word]
    aligned_1900 = np.dot(array_1900[row, :], Q_pred)
    sim = 1 - cosine(aligned_1900, array_1990[row, :])
    print('word:', word, 'is_clean:', row in t_indices_pred, 'sim', sim)

word: wanting is_clean: False sim 0.19206961647592302
word: gay is_clean: False sim 0.2759707895493978
word: check is_clean: False sim 0.25367264136509693
word: starting is_clean: False sim 0.2740132369190764
word: major is_clean: False sim 0.26918349630846394
word: actually is_clean: False sim 0.24110088125204576
word: touching is_clean: False sim 0.4542916169674803
word: harry is_clean: False sim 0.2627476280889731
word: headed is_clean: False sim 0.2993917518480882
word: romance is_clean: False sim 0.2974979837761651


In [12]:
# Write the words behind t_indices_pred and f_indices_pred to text files,
# one word per line. `with` closes each file right after writing, so the
# contents are flushed to disk.
unchanged_historical = [idx2word_ordered[idx] for idx in t_indices_pred]
with open("unchanged_historical.txt", "w", encoding="utf-8") as f:
    f.write('\n'.join(unchanged_historical))
changed_historical = [idx2word_ordered[idx] for idx in f_indices_pred]
with open("changed_historical.txt", "w", encoding="utf-8") as f:
    chars_written = f.write('\n'.join(changed_historical))
# Cell output: number of characters written to changed_historical.txt.
chars_written

32600