In [7]:
import numpy as np
import pandas as pd

from collections import Counter
from scipy.stats import pearsonr
from scipy.linalg import orthogonal_procrustes
from nltk import bigrams
from nltk.probability import FreqDist
from sklearn.metrics.pairwise import cosine_similarity
from utils import get_sorted_tweets, get_target_words

In [8]:
tweets = get_sorted_tweets()
target_words = get_target_words()

tsv_file_path = 'data/annotator.tsv'

# Read the TSV file into a pandas DataFrame
df = pd.read_csv(tsv_file_path, sep=' ', header=None).to_numpy()
annotator = {item[0]: item[1] for item in df}
ground_truth = [annotator[word] for word in target_words]
ground_truth

[0.54,
 0.76,
 0.33,
 0.46,
 0.28,
 0.43,
 0.05,
 0.85,
 0.32,
 0.5,
 0.81,
 0.95,
 0.4,
 0.17,
 0.64,
 0.77,
 1.0,
 0.71,
 0.16,
 0.09,
 0.48,
 0.95,
 0.94,
 0.76,
 0.68,
 0.83,
 0.9,
 0.03,
 0.6,
 0.92,
 0.89,
 0.22,
 0.61,
 0.83]

In [9]:
def get_ppmi_by_year(year, dim):
    tweets = get_sorted_tweets()
    target_words = get_target_words()
    tweets = tweets[str(year)]
    words = []
    for tweet in tweets:
        words.extend(tweet['tokens'])

    cnt = Counter(words)
    common = [word[0] for word in cnt.most_common(2000)]
    w = list(set(common)) + target_words
    bigram_pairs = list(bigrams(words))
    bigram_freq = FreqDist(bigram_pairs)
    cooccur_matrix = np.zeros((len(w), len(w)))
    for i in range(len(w)):
        for j in range(len(w)):
            freq = bigram_freq[(w[i], w[j])]
            cooccur_matrix[i, j] = freq
    total_occurrences = np.sum(cooccur_matrix)
    row_sums = np.sum(cooccur_matrix, axis=1)
    col_sums = np.sum(cooccur_matrix, axis=0)

    ppmi_matrix = np.zeros_like(cooccur_matrix)

    for i in range(cooccur_matrix.shape[0]):
        for j in range(cooccur_matrix.shape[1]):
            p_x_y = cooccur_matrix[i, j] / total_occurrences
            p_x = row_sums[i] / total_occurrences
            p_y = col_sums[j] / total_occurrences
            
            pmi = np.log2(p_x_y / (p_x * p_y))
            if pmi != pmi:
                pmi = 0
            ppmi_matrix[i, j] = max(pmi, 0)
    target_word_seq = []
    for word in target_words:
        idx = w.index(word)
        target_word_seq.append(ppmi_matrix[idx])
    ppmi_matrix = np.vstack(target_word_seq)

    U, S, Vt = np.linalg.svd(ppmi_matrix, full_matrices=False)

    U_k = U[:, :dim]
    S_k = np.diag(S[:dim])
    Vt_k = Vt[:, :dim]

    reduced_matrix = np.dot(U_k, np.dot(S_k, Vt_k))
    return reduced_matrix
    

In [11]:
def find_correlation(A, B):
    R = orthogonal_procrustes(A, B)
    A = A @ R[0]
    sim = cosine_similarity(A, B)
    dist = 1 - np.diag(sim)
    return pearsonr(dist, ground_truth)

In [15]:
for dim in [100, 200, 300, 400, 500]:
    ppmi_matrix_2019 = get_ppmi_by_year(2019, dim)
    ppmi_matrix_2020 = get_ppmi_by_year(2020, dim)
    ppmi_matrix_2021 = get_ppmi_by_year(2021, dim)
    print(f'dim = {dim}')
    print(find_correlation(ppmi_matrix_2019, ppmi_matrix_2020))
    print(find_correlation(ppmi_matrix_2020, ppmi_matrix_2021))

  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


dim = 100
PearsonRResult(statistic=0.05062434795308799, pvalue=0.7761581690362572)
PearsonRResult(statistic=-0.028424897144051078, pvalue=0.8732145276354708)


  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


dim = 200
PearsonRResult(statistic=-0.2038182381641085, pvalue=0.24760447898736498)
PearsonRResult(statistic=0.013163535086063865, pvalue=0.9410994903071626)


  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


dim = 300
PearsonRResult(statistic=0.02346701491255716, pvalue=0.8951940802111944)
PearsonRResult(statistic=-0.09048383900510194, pvalue=0.6108095738066032)


  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


dim = 400
PearsonRResult(statistic=0.015473701785429629, pvalue=0.9307854844547294)
PearsonRResult(statistic=-0.09435829077827111, pvalue=0.595553464243175)


  pmi = np.log2(p_x_y / (p_x * p_y))
  pmi = np.log2(p_x_y / (p_x * p_y))


dim = 500
PearsonRResult(statistic=0.04611301143151122, pvalue=0.7956637562269073)
PearsonRResult(statistic=-0.08139506084545639, pvalue=0.6472263130286253)
