In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import skew

from collections import Counter
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from data.pipData import * 
from modelSGNS import *
from dataSet import *
from visuEmbedding import *
from tool import *

# Explanation  
The selection of negative samples is an important aspect of Skip-Gram with Negative Sampling (SGNS). The original SGNS draws its negative samples according to the overall frequency of each word in the corpus, so the most frequent words are picked most often and make excellent negative examples. A child, however, cannot rely on statistics computed over a corpus as large as the ones word2vec is trained on. In this notebook, we look at different ways to select good negative samples.

# Data
Selected corpus:
`data/GoodNightGorilla_Intonation.txt`

In [None]:
# Pre-processing options for this corpus, gathered in one mapping so they can
# be read (and tuned) separately from the call that consumes them.
preprocessing_options = dict(
    language='english',
    remove_accent=True,
    remove_punct=True,
    keep_apostrophes=False,
    # Contraction -> replacement pairs, specific to this corpus.
    contraction_map={
        "that's": "thatis",
        "it's": "itis",
        "don't": "donot",
        "doesn't": "doesnot",
    },
    stop_words=["s", "n't"],
    break_line=False,
)

data = prepare_data_with_intonation(
    file_path="./data/GoodNightGorilla_Intonation.txt",
    **preprocessing_options,
)

# separate_text_intonation returns two values, bound here as the corpus
# texts and their matching intonations.
texts, intonations = separate_text_intonation(data)

## Create a dataset (positive pairs)

In [None]:
# Dataset hyper-parameters, named so they are easy to spot and tune.
# NOTE(review): presumably nb_neg is the number of negative samples drawn per
# positive pair and window_size the context window — confirm in dataSet.py.
nb_negative_samples = 5
context_window_size = 6

dataset = W2V_weighted_DataSet_v2(
    sentences=texts,
    intonations=intonations,
    nb_neg=nb_negative_samples,
    window_size=context_window_size,
)

In [None]:
# Number of most frequent words shown in the bar chart.
TOP_N_FREQUENT = 20

# Flatten the corpus (every element of `texts` is iterated as a sequence of
# words) into a single list, then count the occurrences of each word.
corpus_one_list = [word for paragraph in texts for word in paragraph]
freq_word = Counter(corpus_one_list)

# (word, count) pairs, most frequent first.
freq_sorted = freq_word.most_common()
print(freq_sorted)

# Parallel lists for plotting; both are simply empty for an empty corpus.
words = [word for word, _count in freq_sorted]
freqs = [count for _word, count in freq_sorted]

# Explicit figure/axes with a title and axis labels so the chart can be
# understood on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(words[:TOP_N_FREQUENT], freqs[:TOP_N_FREQUENT])
ax.set_title(f"Most frequent words in the corpus (top {TOP_N_FREQUENT})")
ax.set_xlabel("Word")
ax.set_ylabel("Number of occurrences")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Distribution of original SGNS

In [None]:
# Exponent applied to raw counts in the original SGNS negative-sampling
# distribution (unigram distribution raised to the 3/4 power).
UNIGRAM_POWER = 0.75
# Number of most probable words shown in the bar chart.
TOP_N_PROBA = 20

# Raw count and smoothed weight (count ** UNIGRAM_POWER) of every vocabulary
# word, in the iteration order of dataset.encoder. Words of the encoder that
# never occur in freq_word get a count (and therefore a weight) of 0.
freq_list = [freq_word.get(word, 0) for word in dataset.encoder]
unigram = [freq ** UNIGRAM_POWER for freq in freq_list]

# Pair each word with its own weight by position (zip) instead of indexing
# `unigram` with the token id: the list is ordered like the encoder's keys,
# which only coincides with the token ids when they are exactly 0..n-1 in
# insertion order. The weights are also normalised so that `word_proba` is a
# real probability distribution (sums to 1) rather than raw count**0.75.
total_weight = sum(unigram)
word_proba = {
    word: (weight / total_weight if total_weight else 0.0)
    for word, weight in zip(dataset.encoder, unigram)
}

print(word_proba)

# Same distribution, most probable word first (dicts keep insertion order).
sorted_dict = dict(
    sorted(word_proba.items(), key=lambda item: item[1], reverse=True)
)

print(sorted_dict)

top_words = list(sorted_dict.keys())[:TOP_N_PROBA]
top_probas = list(sorted_dict.values())[:TOP_N_PROBA]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top_words, top_probas)
ax.set_title(
    f"Original SGNS negative-sampling distribution (top {TOP_N_PROBA} words)"
)
ax.set_xlabel("Word")
ax.set_ylabel(f"Sampling probability (count ** {UNIGRAM_POWER}, normalised)")
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


## Use the intonation of words

In [None]:
# Importance score of every token, as stored on the dataset.
token_importance = dataset.word_importance
print(token_importance)

# Same scores, re-keyed by the word each token decodes to.
word_importance = {
    dataset.decoder[token]: importance
    for token, importance in token_importance.items()
}

print(word_importance)

In [None]:
def _plot_importance_bars(labels, values, title):
    """Draw one labelled bar chart of word-importance scores.

    Args:
        labels: Words shown on the x axis, in display order.
        values: Importance score of each word, aligned with ``labels``.
        title: Title of the figure.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(labels, values)
    ax.set_title(title)
    ax.set_xlabel("Word")
    ax.set_ylabel("Importance")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()


# How many words to show at each end of the importance ranking.
TOP_N_IMPORTANT = 60
BOTTOM_N_IMPORTANT = 20

# Words ordered by decreasing importance (dicts keep insertion order).
sorted_dict = dict(
    sorted(word_importance.items(), key=lambda item: item[1], reverse=True)
)
ranked_words = list(sorted_dict.keys())
ranked_scores = list(sorted_dict.values())

most_important_words = ranked_words[:TOP_N_IMPORTANT]
least_important_words = ranked_words[-BOTTOM_N_IMPORTANT:]

# The two charts are otherwise indistinguishable, so each gets its own title;
# the count comes from the slice because the vocabulary may be smaller than
# the requested number of words.
_plot_importance_bars(
    most_important_words,
    ranked_scores[:TOP_N_IMPORTANT],
    f"{len(most_important_words)} words with the highest importance",
)
_plot_importance_bars(
    least_important_words,
    ranked_scores[-BOTTOM_N_IMPORTANT:],
    f"{len(least_important_words)} words with the lowest importance",
)