In [49]:
from collections import Counter
import numpy as np
from scipy.linalg import orthogonal_procrustes
from nltk import bigrams
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_sorted_tweets, get_target_words

In [13]:
tweets = get_sorted_tweets()
target_words = get_target_words()

In [35]:
def get_ppmi_by_year(year, dim):
    tweets = get_sorted_tweets()
    target_words = get_target_words()
    tweets = tweets[str(year)]
    words = []
    for tweet in tweets:
        words.extend(tweet['tokens'])

    cnt = Counter(words)
    common = [word[0] for word in cnt.most_common(2000)]
    w = list(set(common)) + target_words
    bigram_pairs = list(bigrams(words))
    bigram_freq = FreqDist(bigram_pairs)
    cooccur_matrix = np.zeros((len(w), len(w)))
    for i in range(len(w)):
        for j in range(len(w)):
            freq = bigram_freq[(w[i], w[j])]
            cooccur_matrix[i, j] = freq
    total_occurrences = np.sum(cooccur_matrix)
    row_sums = np.sum(cooccur_matrix, axis=1)
    col_sums = np.sum(cooccur_matrix, axis=0)

    ppmi_matrix = np.zeros_like(cooccur_matrix)

    for i in range(cooccur_matrix.shape[0]):
        for j in range(cooccur_matrix.shape[1]):
            p_x_y = cooccur_matrix[i, j] / total_occurrences
            p_x = row_sums[i] / total_occurrences
            p_y = col_sums[j] / total_occurrences
            
            pmi = np.log2(p_x_y / (p_x * p_y))
            if pmi != pmi:
                pmi = 0
            ppmi_matrix[i, j] = max(pmi, 0)
    target_word_seq = []
    for word in target_words:
        idx = w.index(word)
        target_word_seq.append(ppmi_matrix[idx])
    ppmi_matrix = np.vstack(target_word_seq)

    U, S, Vt = np.linalg.svd(ppmi_matrix, full_matrices=False)

    U_k = U[:, :dim]
    S_k = np.diag(S[:dim])
    Vt_k = Vt[:, :dim]

    reduced_matrix = np.dot(U_k, np.dot(S_k, Vt_k))
    return reduced_matrix
    

In [38]:
dim = 300
ppmi_matrix_2019 = get_ppmi_by_year(2019, dim)
ppmi_matrix_2020 = get_ppmi_by_year(2020, dim)
ppmi_matrix_2021 = get_ppmi_by_year(2021, dim)


  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


In [53]:
r_2019 = orthogonal_procrustes(ppmi_matrix_2019, ppmi_matrix_2020)
ppmi_matrix_2019 = ppmi_matrix_2019 @ r_2019[0]
sim = cosine_similarity(ppmi_matrix_2019, ppmi_matrix_2020)
1 - np.diag(sim)

1629.5569561023312


array([2.81775752e-02, 7.16158728e-03, 6.51641695e-04, 5.26622201e-01,
       1.58930229e-02, 1.27899313e-01, 5.09733447e-03, 1.07818521e-01,
       2.11466537e-01, 3.33332605e-01, 3.55947800e-01, 2.50400672e-01,
       3.66968757e-01, 5.75781943e-02, 5.22041046e-03, 3.86168108e-02,
       1.06218583e-01, 2.69006332e-01, 1.84030191e-01, 1.44408046e-01,
       1.53596237e-02, 9.31517437e-01, 8.43768682e-02, 3.96587708e-02,
       1.46531971e-01, 2.18946140e-01, 8.37373143e-02, 7.25161563e-02,
       2.70390156e-01, 2.85753212e-02, 4.80566792e-02, 1.12126165e-02,
       9.57676258e-05, 1.14963159e-01])