In [2]:
import gensim
import pickle
from utils import get_sorted_tweets, get_target_words, load_annotator_labels
import gensim
from procrustes import orthogonal
import numpy as np
import pickle
from sklearn import metrics
from utils import get_target_words
from scipy.stats import pearsonr

In [3]:

# The dataset is small, so a larger k (number of negative samples) was chosen.
date_dict = get_sorted_tweets()

years = ["2019", "2020", "2021"]
for year in years:

    # Skip-gram with negative sampling; k negatives are drawn per positive pair.
    k = 5
    model = gensim.models.Word2Vec(
        vector_size=100,
        workers=4,
        sg=1,        # 1 selects skip-gram
        hs=0,        # 0 turns hierarchical softmax off; negative sampling is used
        negative=k,
    )

    # Collect the "tokens" field of every entry stored under this year.
    data_year = date_dict[year]
    sentence_list = [data["tokens"] for data in data_year]

    model.build_vocab(sentence_list)
    model.train(sentence_list, total_examples=model.corpus_count, epochs=20)

    print(f"{year}: {model}")

    # Save the word vectors on their own, then the full model next to them.
    outpath = f'model_files/sngs_{year}'
    model.wv.save(outpath)
    model.save(outpath + '.model')

2019: Word2Vec<vocab=1271, vector_size=100, alpha=0.025>
2020: Word2Vec<vocab=8861, vector_size=100, alpha=0.025>
2021: Word2Vec<vocab=7591, vector_size=100, alpha=0.025>


In [4]:
# Year whose target words are collected. This cell used to read `year` without
# defining it, relying on the loop variable left behind by the training cell
# (final value "2021"). It is set explicitly here so the cell no longer depends
# on hidden kernel state.
# NOTE(review): the vectors loaded below are the 2019 ones while `year` is
# "2021" -- confirm which year is actually intended.
year = "2021"

word2vec_path = "model_files/sngs_2019"
word2vec = gensim.models.KeyedVectors.load(word2vec_path)
w2v_vocabulary = word2vec.key_to_index

words = list(w2v_vocabulary.keys())

date_dict = get_sorted_tweets()

# Unique values of the "word" field across the entries stored under `year`.
target_words = {data["word"] for data in date_dict[year]}

# Pickle the words as a list (set iteration order, so order is not guaranteed).
with open(f'data/target_words_{year}.pkl', 'wb') as fp:
    pickle.dump(list(target_words), fp)

In [5]:
def cosine_dist(v1, v2):
    """Return the cosine distance, i.e. ``1 - cosine_similarity``, of two vectors.

    Each input is reshaped to a single-row 2-D array because
    ``metrics.pairwise.cosine_similarity`` works on matrices; the only cell of
    the resulting 1x1 similarity matrix is then turned into a distance.
    """
    row1 = v1.reshape(1, -1)
    row2 = v2.reshape(1, -1)
    similarity = metrics.pairwise.cosine_similarity(row1, row2)
    return 1 - similarity[0][0]

def get_same_words(word_set1, word_set2):
    """Return the set of words that occur in both ``word_set1`` and ``word_set2``."""
    shared_words = word_set1.intersection(word_set2)
    return shared_words

def create_matrices(word2vec1, word2vec2):
    """Build row-aligned embedding matrices over the vocabulary shared by two models.

    Args:
        word2vec1: Keyed-vectors object exposing ``index_to_key``,
            ``vector_size`` and ``get_vector(word)``.
        word2vec2: Second keyed-vectors object with the same attributes.

    Returns:
        A tuple ``(words, mat1, mat2)``. ``words`` is the sorted list of words
        present in both vocabularies; row ``i`` of ``mat1`` / ``mat2`` holds the
        vector of ``words[i]`` in the first / second model.
    """
    vocab1 = set(word2vec1.index_to_key)
    vocab2 = set(word2vec2.index_to_key)

    # Sorted so the row order is the same on every run; plain set iteration
    # order is not stable between interpreter sessions.
    shared_words = sorted(vocab1.intersection(vocab2))

    # Take the dimensionality from each model rather than assuming 100.
    mat1 = np.zeros((len(shared_words), word2vec1.vector_size))
    mat2 = np.zeros((len(shared_words), word2vec2.vector_size))

    for row, word in enumerate(shared_words):
        mat1[row] = word2vec1.get_vector(word)
        mat2[row] = word2vec2.get_vector(word)

    return shared_words, mat1, mat2

def get_consine_distance(year1, year2):
    """Compute per-target-word cosine distances between two yearly embeddings.

    The saved vectors for ``year1`` and ``year2`` are loaded from
    ``model_files/sngs_<year>``, restricted to their shared vocabulary, and
    passed to ``orthogonal`` (Procrustes). The cosine distance is then taken
    between the transformed ``year1`` row and the ``year2`` row of each target
    word.

    Args:
        year1: Year identifier used to build the first model path.
        year2: Year identifier used to build the second model path.

    Returns:
        A dict mapping each target word found in both vocabularies to its
        cosine distance. Target words missing from the shared vocabulary are
        left out.
    """
    target_words = get_target_words()

    word2vec_path1 = f"model_files/sngs_{year1}"
    word2vec_path2 = f"model_files/sngs_{year2}"
    word2vec1 = gensim.models.KeyedVectors.load(word2vec_path1)
    word2vec2 = gensim.models.KeyedVectors.load(word2vec_path2)

    intersect, mat1, mat2 = create_matrices(word2vec1, word2vec2)

    result = orthogonal(mat1, mat2, scale=True, translate=True)
    # Apply the transformation returned by the Procrustes call to the first
    # matrix so that it can be compared row by row with `result.new_b`.
    a_op = np.dot(result.new_a, result.t)

    # Word -> row lookup: constant-time access instead of a list scan per target.
    row_of = {word: row for row, word in enumerate(intersect)}

    cosine_distances = {}
    for target in target_words:
        # Skip only the expected case (target absent from the shared
        # vocabulary); any other error now propagates instead of being hidden.
        if target not in row_of:
            continue
        idx = row_of[target]
        cosine_distances[target] = cosine_dist(a_op[idx], result.new_b[idx])

    return cosine_distances

In [6]:
years = [("2019", "2020"), ("2020", "2021")]

print("pearson correlation to annotated labels") 

# Loaded once up front: the call takes no arguments, so re-running it for every
# year pair only repeated the same work.
labels = load_annotator_labels()

for year1, year2 in years:
    cd = get_consine_distance(year1, year2)

    # Paired samples in the same key order: model distance vs. annotator label.
    # A target word without an entry in `labels` raises here rather than being
    # dropped silently.
    sgns_vec = [float(distance) for distance in cd.values()]
    annotator_vec = [float(labels[key]) for key in cd]

    pearson, p_value = pearsonr(sgns_vec, annotator_vec)
    print(f"pearson coeff and p-value for year {year1}-{year2}: {pearson}, {p_value}")

pearson correlation to annotated labels
pearson coeff and p-value for year 2019-2020: 0.3560518830300495, 0.13460947157852474
pearson coeff and p-value for year 2020-2021: 0.06471005315346301, 0.8051081443290519
