In [1]:
from google.colab import drive 
# Attach Google Drive; every data/model path used below lives under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive/'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive/


In [3]:
import pandas as pd

# Evaluation words: the preview below shows WORD, BASE_YEAR, GROUND_TRUTH and NEXT_YEAR columns.
WORDS_CSV_PATH = '/content/gdrive/MyDrive/Colab Notebooks/Thesis/Data/Classification/Common/classification_words.csv'
df_words = pd.read_csv(WORDS_CSV_PATH)
df_words.head()

Unnamed: 0,WORD,BASE_YEAR,GROUND_TRUTH,NEXT_YEAR
0,железный,2006,0,2007
1,подземный,2009,1,2010
2,катарский,2003,1,2004
3,гуманитарный,2005,0,2006
4,капитальный,2012,0,2013


In [4]:
# Source: https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8
import gensim
import numpy as np

def smart_procrustes_align_gensim(base_embed, other_embed, words=None):
    """
    Rotate `other_embed` into the vector space of `base_embed` (orthogonal Procrustes),
    so that the same word can be compared across the two models.

    Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
    Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

    Steps:
      1. Restrict both models to their shared vocabulary with `intersection_align_gensim`
         (this mutates BOTH models in place). If `words` is set, the vocabulary is
         additionally intersected with `words`.
      2. Take the row-normalised embedding matrices of both models.
      3. Compute the SVD of `other.T @ base` and build the rotation `ortho = U @ Vt`.
      4. Replace `other_embed.wv.vectors` with `other_embed.wv.vectors @ ortho`.

    Parameters:
        base_embed: model whose space is the alignment target; must expose `.wv`.
        other_embed: model to be rotated; must expose `.wv`.
        words: optional iterable of words to restrict the shared vocabulary to.

    Returns:
        `other_embed`, whose `wv.vectors` have been rotated in place.
    """
    # (The old `init_sims(replace=True)` calls are intentionally not used with gensim 4.)

    # Make sure vocabulary and row indices are aligned between the two models.
    in_base_embed, in_other_embed = intersection_align_gensim(base_embed, other_embed, words=words)

    # The intersection step may have replaced `wv.vectors`; force the cached norms to be
    # recomputed so `get_normed_vectors()` does not divide by norms of the old matrix.
    in_base_embed.wv.fill_norms(force=True)
    in_other_embed.wv.fill_norms(force=True)

    # Row-normalised embedding matrices, one row per shared word, same row order.
    base_vecs = in_base_embed.wv.get_normed_vectors()
    other_vecs = in_other_embed.wv.get_normed_vectors()

    # Cross-covariance between the two spaces.
    m = other_vecs.T.dot(base_vecs)
    # numpy returns the right factor already transposed (Vt).
    u, _, vt = np.linalg.svd(m)
    # Orthogonal matrix that best maps `other` onto `base`.
    ortho = u.dot(vt)
    # Apply the rotation to the (un-normalised) vectors of the other model.
    other_embed.wv.vectors = other_embed.wv.vectors.dot(ortho)

    return other_embed

def intersection_align_gensim(m1, m2, words=None):
    """
    Restrict two gensim (4.x) word2vec models, m1 and m2, to their shared vocabulary, in place.

    Only the vocabulary present in both models is kept.
    If `words` is set (as list or set), the vocabulary is intersected with it as well.
    Rows are re-ordered 0..N-1 by descending frequency (= sum of the "count" attribute
    in m1 and m2); ties are broken by the word itself so the order is reproducible.

    After the call, for both models:
        -- `wv.vectors[i]` of m1 and `wv.vectors[i]` of m2 belong to the same word
        -- `wv.index_to_key` / `wv.key_to_index` describe the new row order
        -- per-key attributes stored in `wv.expandos` (e.g. "count") follow the new order
        -- cached `wv.norms` are discarded because they described the old matrix

    Parameters:
        m1, m2: models exposing a gensim-4 style `.wv`.
        words: optional iterable of words to keep.

    Returns:
        The tuple `(m1, m2)` (the same objects, mutated in place unless they were
        already aligned).
    """
    # Get the vocab for each model
    vocab_m1 = set(m1.wv.index_to_key)
    vocab_m2 = set(m2.wv.index_to_key)

    # Find the common vocabulary
    common_vocab = vocab_m1 & vocab_m2
    if words:
        common_vocab &= set(words)

    # Nothing to do only when both models already hold exactly the common vocabulary
    # in the SAME row order. Comparing sets alone is not enough: identical vocabularies
    # stored in different orders would leave the rows misaligned.
    if (len(m1.wv.index_to_key) == len(common_vocab)
            and list(m1.wv.index_to_key) == list(m2.wv.index_to_key)):
        return (m1, m2)

    # Otherwise sort by frequency (summed over both models), most frequent first.
    # Counts must be read here, before any index is rewritten below.
    def _sort_key(w):
        total = m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count")
        return (-total, w)

    common_vocab = sorted(common_vocab, key=_sort_key)

    # Then for each model...
    for m in (m1, m2):
        kv = m.wv
        old_size = len(kv.vectors)

        # Old row index of every kept word, in the new order.
        row_order = np.array([kv.key_to_index[w] for w in common_vocab], dtype=np.intp)

        # Replace the embedding matrix with the re-ordered, reduced one.
        kv.vectors = kv.vectors[row_order]

        # Keep per-key attribute arrays (e.g. "count") in step with the new row order;
        # otherwise get_vecattr() would return the value of whichever word used to
        # occupy that row.
        expandos = getattr(kv, "expandos", None)
        if expandos:
            for attr, values in list(expandos.items()):
                if values is not None and len(values) == old_size:
                    expandos[attr] = values[row_order]

        # Cached norms were computed for the old matrix; drop them so they are rebuilt.
        if hasattr(kv, "norms"):
            kv.norms = None

        # Replace the key <-> index mappings with the new ones.
        kv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
        kv.index_to_key = list(common_vocab)

        print(len(kv.key_to_index), len(kv.vectors))

    return (m1, m2)

In [None]:
from ast import literal_eval

# Imported here as well so this cell runs on a fresh kernel; the only other import of
# these names is in a later cell.
from gensim.models import Word2Vec
from tqdm import tqdm

# Train one skip-gram model per year with window size 10 and minimum frequency 5.
# 2000..2014 inclusive: the comparison loop further down loads `year` and `year + 1`
# for 2000..2013, so the 2014 model is needed too.
years = [i for i in range(2000, 2015)]
for year in tqdm(years):
    # Only `lemmas` is used for training, so only that column is read and parsed;
    # literal_eval turns the stored string back into a Python object (presumably a
    # list of tokens per document - confirm against the CSV).
    df_temp = pd.read_csv(f'/content/gdrive/MyDrive/Colab Notebooks/Thesis/Data/Common/News/data_{year}.csv', compression='zip',
                   usecols=['lemmas'], converters={'lemmas': literal_eval})
    model = Word2Vec(sentences=df_temp.lemmas, vector_size=300, window=10, min_count=5, 
                     sg=1, negative=5, ns_exponent=0.75, workers=4)
    model.save(f'/content/gdrive/MyDrive/Colab Notebooks/Thesis/Models/Classification/Word2vec/word2vec_5w10_{year}.model')

100%|██████████| 14/14 [1:51:57<00:00, 479.85s/it]


In [5]:
# Placeholder for the per-word cosine similarity. 0.0 is the "not computed" sentinel that
# the later `cos_similarity_w2v == 0` filter relies on. It is created as a float column
# because float similarities are written into it row by row; an integer column would make
# recent pandas versions warn about the incompatible dtype on every assignment.
df_words['cos_similarity_w2v'] = 0.0

In [6]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# For every base year, align the model of `year` to the model of `year + 1` and store,
# for each word labelled with that base year, the cosine similarity between its two vectors.
years = [i for i in range(2000, 2014)]

# Number of labelled words missing from the shared vocabulary of a year pair.
count = 0

for year in tqdm(years):
    model_0 = Word2Vec.load(f'/content/gdrive/MyDrive/Colab Notebooks/Thesis/Models/Classification/Word2vec/word2vec_5w10_{year}.model')
    model_1 = Word2Vec.load(f'/content/gdrive/MyDrive/Colab Notebooks/Thesis/Models/Classification/Word2vec/word2vec_5w10_{year + 1}.model')   
    # smart_procrustes_align_gensim intersects the two vocabularies itself (mutating both
    # models in place), so a separate intersection_align_gensim call is not needed.
    # model_1 is the alignment target; model_0 is rotated into its space and returned.
    m_0_modified = smart_procrustes_align_gensim(model_1, model_0)
    
    for i in df_words[df_words.BASE_YEAR == year].index:
        word = df_words.WORD[i]
        try:
            vec_0 = m_0_modified.wv[word]
            vec_1 = model_1.wv[word]
        except KeyError:
            # Only a missing vocabulary entry is expected here; any other error should
            # surface instead of being silently counted as "not present".
            print(f'{word} not present')
            count += 1
            continue
        df_words.loc[i, 'cos_similarity_w2v'] = cosine_similarity([vec_0], [vec_1])[0][0]

print(count)

  7%|▋         | 1/14 [00:04<00:56,  4.36s/it]

15229 15229
15229 15229
подавляющий not present
летный not present


 14%|█▍        | 2/14 [00:06<00:33,  2.82s/it]

17266 17266
17266 17266
приемный not present


 21%|██▏       | 3/14 [00:08<00:29,  2.65s/it]

17453 17453
17453 17453


 29%|██▊       | 4/14 [00:10<00:23,  2.32s/it]

18348 18348
18348 18348
южноуральский not present
уставной not present


 36%|███▌      | 5/14 [00:12<00:19,  2.16s/it]

20502 20502
20502 20502
приемный not present
мертвый not present
22745 22745
22745 22745


 43%|████▎     | 6/14 [00:14<00:17,  2.23s/it]

приемный not present
23298 23298
23298 23298


 50%|█████     | 7/14 [00:16<00:15,  2.21s/it]

подавляющий not present
24837 24837
24837 24837
приемный not present

 57%|█████▋    | 8/14 [00:20<00:16,  2.69s/it]


27672 27672
27672 27672


 64%|██████▍   | 9/14 [00:24<00:16,  3.25s/it]

саяно-шушенский not present
принятый not present
молодежный not present
28193 28193
28193 28193


 71%|███████▏  | 10/14 [00:29<00:13,  3.50s/it]

саяно-шушенский not present
27638 27638
27638 27638


 79%|███████▊  | 11/14 [00:34<00:12,  4.00s/it]

съемочный not present
28310 28310
28310 28310


 86%|████████▌ | 12/14 [00:39<00:09,  4.52s/it]

29303 29303
29303 29303


 93%|█████████▎| 13/14 [00:44<00:04,  4.61s/it]

приемный not present
принятый not present
27109 27109
27109 27109


100%|██████████| 14/14 [00:47<00:00,  3.38s/it]

приемный not present
санкт-петербургский not present
самопровозглашенный not present
надежный not present
21





In [8]:
# Number of labelled words before filtering.
len(df_words)

280

In [9]:
# Class balance of the labels before filtering.
df_words['GROUND_TRUTH'].value_counts()

0    220
1     42
2     18
Name: GROUND_TRUTH, dtype: int64

In [10]:
# Keep only rows whose similarity was actually computed (0 is the "not computed" sentinel).
has_similarity = df_words['cos_similarity_w2v'] != 0
df_words_1 = df_words[has_similarity]

In [11]:
# Class balance of the labels after dropping words without a similarity score.
df_words_1['GROUND_TRUTH'].value_counts()

0    203
1     41
2     15
Name: GROUND_TRUTH, dtype: int64

In [None]:
# Persist the filtered table. The folder is spelled 'Thesis' (capital T) like every other
# path in this notebook; the previous lower-case 'thesis' pointed at a different directory
# on a case-sensitive mount. NOTE(review): confirm no separate 'thesis' folder is intended.
df_words_1.to_csv('/content/gdrive/MyDrive/Colab Notebooks/Thesis/Data/Classification/Word2vec/df_cos_w2v.csv', index=False)

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score, precision_score, recall_score
import numpy as np
from tqdm import tqdm

# 5-fold stratified cross-validation of a random forest that predicts GROUND_TRUTH from the
# single cos_similarity_w2v feature; macro-averaged metrics are collected per fold.
rfc_w2v = RandomForestClassifier(
    n_estimators=120,
    max_depth=4,
    min_samples_split=4,
    class_weight='balanced',
    random_state=42,
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

f1_scores, balanced_accuracy_scores = [], []
precision_scores, recall_scores = [], []

similarity = df_words_1.cos_similarity_w2v
labels = df_words_1.GROUND_TRUTH

for train_index, test_index in tqdm(skf.split(similarity, labels)):
    x_train_fold = similarity.iloc[train_index]
    x_test_fold = similarity.iloc[test_index]
    y_train_fold = labels.iloc[train_index]
    y_test_fold = labels.iloc[test_index]

    # The estimator needs a 2-D feature matrix, hence the single-column reshape.
    rfc_w2v.fit(x_train_fold.to_numpy().reshape(-1, 1), y_train_fold)
    pred = rfc_w2v.predict(x_test_fold.to_numpy().reshape(-1, 1))

    f1_scores.append(f1_score(y_test_fold, pred, average='macro'))
    balanced_accuracy_scores.append(balanced_accuracy_score(y_test_fold, pred))
    precision_scores.append(precision_score(y_test_fold, pred, average='macro'))
    recall_scores.append(recall_score(y_test_fold, pred, average='macro'))

# Report the mean of each metric over the folds.
print(f'Mean F1 score: {np.mean(f1_scores):.4f}')
print(f'Mean balanced accuracy score: {np.mean(balanced_accuracy_scores):.4f}')
print(f'Mean precision score: {np.mean(precision_scores):.4f}')
print(f'Mean recall score: {np.mean(recall_scores):.4f}')

5it [00:01,  4.00it/s]

Mean F1 score: 0.4512
Mean balanced accuracy score: 0.4811
Mean precision score: 0.4418
Mean recall score: 0.4811



