In [1]:
import numpy as np
import pandas as pd

from collections import Counter
from scipy.stats import pearsonr
from scipy.linalg import orthogonal_procrustes
from nltk import bigrams
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_sorted_tweets, get_target_words

In [2]:
# Corpus and evaluation targets both come from the project's utils module.
tweets = get_sorted_tweets()
target_words = get_target_words()

tsv_file_path = 'data/annotator.tsv'

# Load the header-less annotation table and view it as a NumPy array.
# NOTE(review): the separator is a single space although the file is named
# .tsv — confirm this matches the actual file layout.
df = pd.read_csv(tsv_file_path, header=None, sep=' ').to_numpy()

# Column 0 supplies the keys and column 1 the values of the lookup table.
annotator = dict(zip(df[:, 0], df[:, 1]))

# Annotator values arranged in the same order as target_words.
ground_truth = [annotator[target] for target in target_words]

In [3]:
def _cooccurrence_matrix(words, vocab):
    """Count adjacent-token (bigram) co-occurrences restricted to ``vocab``.

    ``vocab`` must not contain duplicates. Entry ``[i, j]`` is the number of
    times ``vocab[i]`` is immediately followed by ``vocab[j]`` in ``words``.
    """
    index_of = {word: i for i, word in enumerate(vocab)}
    cooccur = np.zeros((len(vocab), len(vocab)))
    # Walk the observed bigrams once instead of probing every vocabulary pair.
    for (left, right), freq in FreqDist(bigrams(words)).items():
        i = index_of.get(left)
        j = index_of.get(right)
        if i is not None and j is not None:
            cooccur[i, j] = freq
    return cooccur


def _ppmi(cooccur):
    """Return the positive pointwise mutual information (log base 2) matrix."""
    total = cooccur.sum()
    # Zero counts produce 0/0 and log2(0); those cells are mapped to 0 below,
    # so the accompanying floating-point warnings are suppressed.
    with np.errstate(divide='ignore', invalid='ignore'):
        p_xy = cooccur / total
        p_x = cooccur.sum(axis=1) / total
        p_y = cooccur.sum(axis=0) / total
        pmi = np.log2(p_xy / np.outer(p_x, p_y))
        return np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)


def get_ppmi_by_year(year, dim):
    """Build SVD-reduced PPMI vectors for the target words of one year.

    The tokens of every tweet stored under ``str(year)`` are concatenated,
    bigram counts are collected over the 2000 most frequent tokens plus the
    target words, converted to PPMI, and the target-word rows are reduced
    with a truncated SVD.

    Parameters
    ----------
    year : int or str
        Key (after ``str()``) into the mapping returned by
        ``get_sorted_tweets()``.
    dim : int
        Maximum number of singular directions to keep; must be >= 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(target_words), min(dim, k))`` where ``k`` is
        the number of singular values of the target-row PPMI matrix. Row
        ``i`` is ``U[i, :dim] * S[:dim]`` for the i-th target word.

    Raises
    ------
    ValueError
        If ``dim`` is smaller than 1 or there are no target words.
    """
    if dim < 1:
        raise ValueError(f'dim must be a positive integer, got {dim!r}')

    tweets = get_sorted_tweets()
    target_words = get_target_words()
    if len(target_words) == 0:
        raise ValueError('need at least one target word')

    # NOTE(review): tokens of all tweets are concatenated, so bigrams also
    # span tweet boundaries — kept as in the original; confirm it is intended.
    words = [token for tweet in tweets[str(year)] for token in tweet['tokens']]

    common = [word for word, _ in Counter(words).most_common(2000)]
    # Order-preserving de-duplication: a target word that is also frequent
    # must occupy a single row/column, otherwise its counts are included
    # twice in the marginals. It also keeps the column order deterministic.
    vocab = list(dict.fromkeys(common + list(target_words)))

    ppmi_matrix = _ppmi(_cooccurrence_matrix(words, vocab))

    row_of = {word: i for i, word in enumerate(vocab)}
    target_rows = ppmi_matrix[[row_of[word] for word in target_words]]

    # Truncated SVD embedding. The SVD is taken over the target rows only,
    # so at most min(len(target_words), len(vocab)) directions exist and a
    # larger ``dim`` has no further effect.
    U, S, _ = np.linalg.svd(target_rows, full_matrices=False)
    return U[:, :dim] * S[:dim]
    

In [4]:
def find_correlation(A, B):
    """Correlate per-row cosine distance between A and B with ground_truth.

    A is first mapped onto B with the orthogonal Procrustes solution; the
    cosine distance between each aligned row of A and the matching row of B
    is then compared against the module-level ``ground_truth`` sequence.

    Returns whatever ``pearsonr`` returns for (distances, ground_truth).
    """
    rotation, _scale = orthogonal_procrustes(A, B)
    aligned = A @ rotation
    # Only matching rows matter, i.e. the diagonal of the pairwise matrix.
    pairwise = cosine_similarity(aligned, B)
    row_distance = 1 - np.diag(pairwise)
    return pearsonr(row_distance, ground_truth)

In [None]:
# Accumulators over every (dimension, consecutive-year pair) combination.
p_value, correlation = [], []

for dim in range(100, 501, 100):
    # One reduced matrix per year, built in chronological order.
    ppmi_matrix_2019, ppmi_matrix_2020, ppmi_matrix_2021 = (
        get_ppmi_by_year(year, dim) for year in (2019, 2020, 2021)
    )
    print(f'dim = {dim}')
    corr_1920 = find_correlation(ppmi_matrix_2019, ppmi_matrix_2020)
    corr_2021 = find_correlation(ppmi_matrix_2020, ppmi_matrix_2021)
    # Index 0 is the correlation statistic, index 1 its p-value.
    p_value.extend([corr_1920[1], corr_2021[1]])
    correlation.extend([corr_1920[0], corr_2021[0]])
    print(corr_1920, corr_2021)

# Cell output: mean p-value and mean correlation over all runs.
sum(p_value) / len(p_value), sum(correlation) / len(correlation)