In [7]:
import gensim
import pickle
from utils import get_sorted_tweets, get_target_words, load_annotator_labels
import gensim
from procrustes import orthogonal
import numpy as np
import pickle
from sklearn import metrics
from utils import get_target_words
from scipy.stats import pearsonr
from operator import itemgetter

In [2]:
# SGNS (skip-gram with negative sampling) hyperparameter grid.
# Each entry of k_lst is passed to Word2Vec as `negative` (number of negative samples).
k_lst = [5, 7, 10, 12, 15]
# Each entry of vector_size_lst is passed to Word2Vec as `vector_size` (embedding dimensionality).
vector_size_lst = [100, 200, 300]

In [30]:
# Train one SGNS model for every (k, vector_size, year) combination and save
# both its word vectors and the full model under model_files/.
date_dict = get_sorted_tweets()

years = ["2019", "2020", "2021"]

# A year's tokenised corpus does not depend on the hyperparameters, so build
# each sentence list once instead of once per grid point.
sentences_by_year = {
    year: [data["tokens"] for data in date_dict[year]]
    for year in years
}

for k in k_lst:
    for vector_size in vector_size_lst:
        for year in years:
            sentence_list = sentences_by_year[year]

            model = gensim.models.Word2Vec(
                sg=1,  # skipgram
                hs=0,  # negative sampling
                negative=k,  # number of negative samples
                workers=4,
                vector_size=vector_size
            )

            model.build_vocab(sentence_list)
            model.train(sentence_list, total_examples=model.corpus_count, epochs=20)

            print(f"k: {k} vector_size: {vector_size} year: {year}: {model}")

            # Save the word vectors on their own (this is the file that
            # KeyedVectors.load reads back) and the full model next to them.
            outpath = f'model_files/sngs_{year}_{k}_{vector_size}'
            model.wv.save(outpath)
            model.save(outpath + '.model')

k: 5 vector_size: 100 year: 2019: Word2Vec<vocab=1271, vector_size=100, alpha=0.025>
k: 5 vector_size: 100 year: 2020: Word2Vec<vocab=8861, vector_size=100, alpha=0.025>
k: 5 vector_size: 100 year: 2021: Word2Vec<vocab=7591, vector_size=100, alpha=0.025>
k: 5 vector_size: 200 year: 2019: Word2Vec<vocab=1271, vector_size=200, alpha=0.025>
k: 5 vector_size: 200 year: 2020: Word2Vec<vocab=8861, vector_size=200, alpha=0.025>
k: 5 vector_size: 200 year: 2021: Word2Vec<vocab=7591, vector_size=200, alpha=0.025>
k: 5 vector_size: 300 year: 2019: Word2Vec<vocab=1271, vector_size=300, alpha=0.025>
k: 5 vector_size: 300 year: 2020: Word2Vec<vocab=8861, vector_size=300, alpha=0.025>
k: 5 vector_size: 300 year: 2021: Word2Vec<vocab=7591, vector_size=300, alpha=0.025>
k: 7 vector_size: 100 year: 2019: Word2Vec<vocab=1271, vector_size=100, alpha=0.025>
k: 7 vector_size: 100 year: 2020: Word2Vec<vocab=8861, vector_size=100, alpha=0.025>
k: 7 vector_size: 100 year: 2021: Word2Vec<vocab=7591, vector_siz

In [9]:
def cosine_dist(v1, v2):
    """Return the cosine distance (1 - cosine similarity) between two vectors."""
    # cosine_similarity is called on 2-D inputs, so present each vector as one row.
    row1 = v1.reshape(1, -1)
    row2 = v2.reshape(1, -1)
    similarity = metrics.pairwise.cosine_similarity(row1, row2)
    # The result is a 1x1 matrix; take its only entry.
    return 1 - similarity[0][0]

def get_same_words(word_set1, word_set2):
    """Return the words that occur in both `word_set1` and `word_set2`."""
    shared_words = word_set1.intersection(word_set2)
    return shared_words

def create_matrices(word2vec1, word2vec2):
    """Build row-aligned embedding matrices over the vocabulary shared by two models.

    Args:
        word2vec1: Keyed vectors of the first model; must expose
            ``index_to_key``, ``vector_size`` and ``get_vector``.
        word2vec2: Keyed vectors of the second model, same interface.

    Returns:
        A tuple ``(words, mat1, mat2)``. ``words`` is the list of words
        present in both vocabularies, and row ``i`` of ``mat1`` / ``mat2``
        holds the vector of ``words[i]`` in the first / second model.
    """
    vocab2 = set(word2vec2.index_to_key)

    # Follow the first model's vocabulary order so the row order is identical
    # on every run (iteration order of a set of strings is not stable across
    # interpreter sessions).
    words = [word for word in word2vec1.index_to_key if word in vocab2]

    # Size the matrices from the models themselves instead of relying on a
    # module-level `vector_size` that is not a parameter of this function.
    mat1 = np.zeros((len(words), word2vec1.vector_size))
    mat2 = np.zeros((len(words), word2vec2.vector_size))

    for i, word in enumerate(words):
        mat1[i] = word2vec1.get_vector(word)
        mat2[i] = word2vec2.get_vector(word)

    return words, mat1, mat2

def get_consine_distance(year1, year2, k, vector_size):
    """Compute the cosine distance of each target word between two yearly embeddings.

    Loads the saved word vectors of both years for the given hyperparameters,
    aligns them with ``procrustes.orthogonal`` over their shared vocabulary,
    and measures the cosine distance between the aligned vectors of every
    target word.

    Args:
        year1: Year label of the first model, as used in the saved file name.
        year2: Year label of the second model.
        k: ``k`` component of the saved file name.
        vector_size: ``vector_size`` component of the saved file name.

    Returns:
        Dict mapping target word -> cosine distance. Target words that are
        not in the shared vocabulary of both models are left out.
    """
    target_words = get_target_words()

    word2vec_path1 = f"model_files/sngs_{year1}_{k}_{vector_size}"
    word2vec_path2 = f"model_files/sngs_{year2}_{k}_{vector_size}"
    word2vec1 = gensim.models.KeyedVectors.load(word2vec_path1)
    word2vec2 = gensim.models.KeyedVectors.load(word2vec_path2)

    intersect, mat1, mat2 = create_matrices(word2vec1, word2vec2)

    result = orthogonal(mat1, mat2, scale=True, translate=True)
    # Apply the fitted transformation to the first (preprocessed) matrix so its
    # rows can be compared with the rows of `result.new_b`.
    a_op = np.dot(result.new_a, result.t)

    # One dict lookup per target instead of a linear list.index() scan;
    # setdefault keeps the first occurrence, matching list.index().
    row_of = {}
    for row, word in enumerate(intersect):
        row_of.setdefault(word, row)

    cosine_distances = {}
    for target in target_words:
        idx = row_of.get(target)
        if idx is None:
            # Target is missing from the shared vocabulary: skip it. Any
            # other error is deliberately no longer swallowed.
            continue
        cosine_distances[target] = cosine_dist(a_op[idx], result.new_b[idx])

    return cosine_distances

In [10]:
# Correlate the SGNS cosine distances with the annotator labels for every
# (k, vector_size) grid point and every consecutive pair of years.
years = [("2019", "2020"), ("2020", "2021")]

# The labels do not depend on the hyperparameters or the year pair; load once.
labels = load_annotator_labels()

scores = []      # (k, vector_size, pearson, p_value, year1, year2) per year pair
avg_scores = []  # (k, vector_size, mean pearson over all year pairs)

for k in k_lst:
    for vector_size in vector_size_lst:
        total_pearson = 0
        for year1, year2 in years:
            cd = get_consine_distance(year1, year2, k, vector_size)

            # Both vectors follow the key order of `cd`, so entries line up.
            sgns_vec = [float(distance) for distance in cd.values()]
            annotator_vec = [float(labels[word]) for word in cd]

            pearson, p_value = pearsonr(sgns_vec, annotator_vec)
            total_pearson += pearson
            scores.append((k, vector_size, pearson, p_value, year1, year2))
        # Average over however many year pairs were evaluated, not a fixed 2.
        avg_scores.append((k, vector_size, total_pearson / len(years)))

# Best average correlation first.
sorted_avg_scores = sorted(avg_scores, key=itemgetter(2), reverse=True)

for k, vector_size, pearson in sorted_avg_scores:
    print(f"pearson coeff and p-value for k={k}, vector_size={vector_size}: {pearson}")

print('-' * 80)
print("all scores")
for k, vector_size, pearson, p_value, year1, year2 in scores:
    print(f"pearson coeff and p-value for k={k}, vector_size={vector_size}, year={year1}-{year2}: {pearson}, {p_value}")

1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
1161
3119
pearson coeff and p-value for k=10, vector_size=100: 0.36859002512970807
pearson coeff and p-value for k=15, vector_size=100: 0.35634412780396263
pearson coeff and p-value for k=12, vector_size=100: 0.3529861192859172
pearson coeff and p-value for k=5, vector_size=100: 0.2844202672584794
pearson coeff and p-value for k=10, vector_size=200: 0.23261616224768084
pearson coeff and p-value for k=7, vector_size=100: 0.1690126681180513
pearson coeff and p-value for k=15, vector_size=300: 0.16262132910649132
pearson coeff and p-value for k=15, vector_size=200: 0.15944589431889106
pearson coeff and p-value for k=12, vector_size=200: 0.15363872010872978
pearson coeff and p-value for k=5, vector_size=300: 0.1533837356810638
pearson coeff and p-value for k=7, vector_size=200: 0.13455327294557134
pearson coeff and p-value for k=5, vector_size=200: 0.1