In [9]:
import gensim
import pickle
from utils import get_sorted_tweets, get_target_words, load_annotator_labels
from scipy.linalg import orthogonal_procrustes
import numpy as np
from sklearn import metrics
from scipy.stats import pearsonr
from operator import itemgetter

In [10]:
# sgns hyperparameters
# Candidate numbers of negative samples; each value is passed as `negative=k`
# when the Word2Vec models are trained.
k_lst = [5, 7, 10, 12, 15]
# Candidate embedding widths; each value is passed as `vector_size=...`
# when the Word2Vec models are trained.
vector_size_lst = [100, 200, 300]

### The following cell takes a while to run because it trains 45 SGNS models (15 hyperparameter combinations × 3 years). The trained models are already saved under `model_files/`, so you can skip it if need be.

In [11]:
# Train one skip-gram/negative-sampling model per (k, vector_size, year) and
# save both the keyed vectors and the full model under model_files/.
date_dict = get_sorted_tweets()

years = ["2019", "2020", "2021"]

# The tokenised corpus of a year does not depend on the hyperparameters, so
# build it once per year instead of once per (k, vector_size, year) triple.
sentences_by_year = {
    year: [data["tokens"] for data in date_dict[year]]
    for year in years
}

for k in k_lst:
    for vector_size in vector_size_lst:
        for year in years:
            sentence_list = sentences_by_year[year]

            model = gensim.models.Word2Vec(
                sg=1, # skipgram
                hs=0, # negative sampling
                negative=k, # number of negative samples
                workers=4,
                vector_size=vector_size
            )

            # The model is created without a corpus, so the vocabulary has to
            # be built explicitly before training.
            model.build_vocab(sentence_list)
            model.train(sentence_list, total_examples=model.corpus_count, epochs=20)

            print(f"k: {k} vector_size: {vector_size} year: {year}: {model}")

            # Save the vectors and the model. The vectors-only file is what the
            # evaluation code loads back via KeyedVectors.load.
            outpath = f'model_files/sngs_{year}_{k}_{vector_size}'
            model.wv.save(outpath)
            model.save(outpath + '.model')

k: 5 vector_size: 100 year: 2019: Word2Vec<vocab=1271, vector_size=100, alpha=0.025>
k: 5 vector_size: 100 year: 2020: Word2Vec<vocab=8861, vector_size=100, alpha=0.025>
k: 5 vector_size: 100 year: 2021: Word2Vec<vocab=7591, vector_size=100, alpha=0.025>
k: 5 vector_size: 200 year: 2019: Word2Vec<vocab=1271, vector_size=200, alpha=0.025>
k: 5 vector_size: 200 year: 2020: Word2Vec<vocab=8861, vector_size=200, alpha=0.025>
k: 5 vector_size: 200 year: 2021: Word2Vec<vocab=7591, vector_size=200, alpha=0.025>
k: 5 vector_size: 300 year: 2019: Word2Vec<vocab=1271, vector_size=300, alpha=0.025>
k: 5 vector_size: 300 year: 2020: Word2Vec<vocab=8861, vector_size=300, alpha=0.025>
k: 5 vector_size: 300 year: 2021: Word2Vec<vocab=7591, vector_size=300, alpha=0.025>
k: 7 vector_size: 100 year: 2019: Word2Vec<vocab=1271, vector_size=100, alpha=0.025>
k: 7 vector_size: 100 year: 2020: Word2Vec<vocab=8861, vector_size=100, alpha=0.025>
k: 7 vector_size: 100 year: 2021: Word2Vec<vocab=7591, vector_siz

In [12]:
def cosine_dist(v1, v2):
    """Return the cosine distance (1 - cosine similarity) between two vectors.

    Each argument is reshaped into a single-row matrix before being passed to
    scikit-learn's pairwise ``cosine_similarity``, so both must support
    ``.reshape``.
    """
    row1 = v1.reshape(1, -1)
    row2 = v2.reshape(1, -1)
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    similarity = metrics.pairwise.cosine_similarity(row1, row2)
    return 1 - similarity[0][0]

def get_same_words(word_set1, word_set2):
    """Return the elements of ``word_set1`` that also occur in ``word_set2``."""
    shared = word_set1.intersection(word_set2)
    return shared

def create_matrices(word2vec1, word2vec2):
    """Stack the vectors of the vocabulary shared by two embedding spaces.

    Parameters
    ----------
    word2vec1, word2vec2
        Keyed-vector objects exposing ``index_to_key``, ``vector_size`` and
        ``get_vector(word)``.

    Returns
    -------
    tuple
        ``(words, mat1, mat2)``: ``words`` is the sorted list of words present
        in both vocabularies, and row ``i`` of ``mat1`` / ``mat2`` holds the
        vector of ``words[i]`` in ``word2vec1`` / ``word2vec2``.
    """
    # Sort the shared vocabulary so the row order is the same on every run
    # (plain set iteration order for strings varies between kernels).
    shared_words = sorted(set(word2vec1.index_to_key) & set(word2vec2.index_to_key))

    # Take the width from the models themselves rather than from a notebook
    # global that only exists as a leaked loop variable of another cell.
    mat1 = np.zeros((len(shared_words), word2vec1.vector_size))
    mat2 = np.zeros((len(shared_words), word2vec2.vector_size))

    for i, word in enumerate(shared_words):
        mat1[i] = word2vec1.get_vector(word)
        mat2[i] = word2vec2.get_vector(word)

    return shared_words, mat1, mat2

def get_consine_distance(year1, year2, k, vector_size):
    """Cosine distance of each target word between two aligned yearly models.

    Loads the saved keyed vectors for ``year1`` and ``year2`` (trained with
    ``k`` negative samples and ``vector_size`` dimensions), aligns the
    ``year1`` space onto the ``year2`` space with orthogonal Procrustes over
    their shared vocabulary, and measures how far each target word moved.

    Parameters
    ----------
    year1, year2
        Year identifiers used in the saved file names.
    k
        Number of negative samples the models were trained with.
    vector_size
        Embedding width the models were trained with.

    Returns
    -------
    dict
        Maps each target word present in both vocabularies to its cosine
        distance; target words missing from either vocabulary are omitted.
    """
    target_words = get_target_words()

    word2vec_path1 = f"model_files/sngs_{year1}_{k}_{vector_size}"
    word2vec_path2 = f"model_files/sngs_{year2}_{k}_{vector_size}"
    word2vec1 = gensim.models.KeyedVectors.load(word2vec_path1)
    word2vec2 = gensim.models.KeyedVectors.load(word2vec_path2)

    intersect, A, B = create_matrices(word2vec1, word2vec2)

    # orthogonal_procrustes returns (R, scale); only the orthogonal matrix R
    # that best maps A onto B is needed here.
    rotation, _ = orthogonal_procrustes(A, B)
    A_op = A @ rotation

    # One pass to index the shared vocabulary instead of a linear
    # list.index() scan for every target word.
    row_of = {word: idx for idx, word in enumerate(intersect)}

    cosine_distances = {}
    for target in target_words:
        idx = row_of.get(target)
        if idx is None:
            # Target word is absent from at least one corpus: skip it on
            # purpose, but let any genuine error below surface.
            continue
        cosine_distances[target] = cosine_dist(A_op[idx], B[idx])

    return cosine_distances

In [13]:
# Consecutive-year pairs whose embedding spaces are compared.
years = [("2019", "2020"), ("2020", "2021")]

# The annotator labels do not depend on the hyperparameters or the year pair,
# so load them once instead of inside the innermost loop.
labels = load_annotator_labels()

scores = []      # one (k, vector_size, pearson, p_value, year1, year2) per run
avg_scores = []  # one (k, vector_size, mean pearson, mean p_value) per setting

for k in k_lst:
    for vector_size in vector_size_lst:
        total_pearson = 0
        total_p_value = 0
        for year1, year2 in years:
            cd = get_consine_distance(year1, year2, k, vector_size)

            # Build both vectors from the same key order so entries line up.
            sgns_vec = [float(cd[key]) for key in cd]
            annotator_vec = [float(labels[key]) for key in cd]

            pearson, p_value = pearsonr(sgns_vec, annotator_vec)
            total_pearson += pearson
            total_p_value += p_value
            scores.append((k, vector_size, pearson, p_value, year1, year2))

        # Average over however many year pairs were evaluated rather than a
        # hard-coded 2. NOTE(review): the mean of p-values is only a rough
        # summary, not a p-value itself.
        n_pairs = len(years)
        avg_scores.append((k, vector_size, total_pearson / n_pairs, total_p_value / n_pairs))

# Rank hyperparameter settings by mean Pearson correlation, best first.
sorted_avg_scores = sorted(avg_scores, key=itemgetter(2), reverse=True)

for score in sorted_avg_scores:
    k, vector_size, pearson, p_value = score
    print(f"avg pearson coeff and p-value for k={k}, vector_size={vector_size}: {pearson}, {p_value}")


avg pearson coeff and p-value for k=15, vector_size=100: 0.47284280816877333, 0.054630160601486316
avg pearson coeff and p-value for k=12, vector_size=100: 0.4358772878640359, 0.07781644216479439
avg pearson coeff and p-value for k=10, vector_size=100: 0.32453833437791024, 0.22861472972834954
avg pearson coeff and p-value for k=5, vector_size=100: 0.23200998421737587, 0.3696732158956685
avg pearson coeff and p-value for k=5, vector_size=200: 0.22400907981207732, 0.4069698064214717
avg pearson coeff and p-value for k=7, vector_size=100: 0.21790866223737831, 0.3862896561031963
avg pearson coeff and p-value for k=10, vector_size=200: 0.21504609104114925, 0.424653968576419
avg pearson coeff and p-value for k=7, vector_size=200: 0.19233450259659673, 0.516732760503054
avg pearson coeff and p-value for k=15, vector_size=300: 0.15635637619077497, 0.5054529001259831
avg pearson coeff and p-value for k=12, vector_size=200: 0.1536200932435125, 0.5840827144359321
avg pearson coeff and p-value for 

In [14]:
# List every individual result (one per hyperparameter setting and year pair)
# in the order in which it was computed.
print("all scores")
for k, vector_size, pearson, p_value, year1, year2 in scores:
    print(f"pearson coeff and p-value for k={k}, vector_size={vector_size}, year={year1}-{year2}: {pearson}, {p_value}")

all scores
pearson coeff and p-value for k=5, vector_size=100, year=2019-2020: 0.2978033729975836, 0.21560374317047998
pearson coeff and p-value for k=5, vector_size=100, year=2020-2021: 0.16621659543716816, 0.5237426886208569
pearson coeff and p-value for k=5, vector_size=200, year=2019-2020: 0.3259475466241949, 0.17324308119146234
pearson coeff and p-value for k=5, vector_size=200, year=2020-2021: 0.12207061299995976, 0.640696531651481
pearson coeff and p-value for k=5, vector_size=300, year=2019-2020: 0.11815032251905985, 0.6299939163568279
pearson coeff and p-value for k=5, vector_size=300, year=2020-2021: 0.0029264380966489076, 0.9911062674264837
pearson coeff and p-value for k=7, vector_size=100, year=2019-2020: 0.23569636575114633, 0.33134520910573345
pearson coeff and p-value for k=7, vector_size=100, year=2020-2021: 0.20012095872361027, 0.44123410310065914
pearson coeff and p-value for k=7, vector_size=200, year=2019-2020: 0.34816681976429426, 0.14408174721704506
pearson coeff