In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from dataSet import SGNS_store_DataSet

from typing import Sequence, Optional, Callable, List, Dict

from copy import deepcopy

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

import seaborn as sns
import matplotlib.pyplot as plt

import unicodedata
import string

from visuEmbedding import components_to_fig_3D, components_to_fig_3D_animation
import tool
from data.pipData import pipe_data, prepare_data, prepare_data_with_intonation, separate_text_intonation

import numpy as np
import pandas as pd

import numpy as np
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import skew

from collections import Counter

from data.pipData import separate_text_intonation
from dataSet import W2V_weighted_DataSet

In [None]:
class SkipGramModel(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, context_dimension:int|None=None, init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)
        self.con_size = embedding_dimension if context_dimension is None else context_dimension
        self.con_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.con_size, device=device,sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)
        self.con_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:list|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words)
        context_emb:torch.Tensor = self.con_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.con_emb(neg_context) # [B, K, D]

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = - (pos_loss + neg_loss).mean()
        return loss
    
    def save_weight(self, path:str="SGNS_weights/"):
        word_weights = self.word_emb.weight.detach().cpu()
        con_weight = self.con_emb.weight.detach().cpu()
        torch.save(word_weights, path+'word_embedding.pt')
        torch.save(con_weight, path+'con_embedding.pt')

    def load_weight(self, path:str="SGNS_weights/", name_word_weights:str="word_embedding.pt", name_con_weights:str="con_embedding.pt"):
        word_weights = torch.load(path + name_word_weights)
        con_weight = torch.load(path + name_con_weights)
        self.word_emb:nn.Embedding = nn.Embedding.from_pretrained(word_weights)
        self.con_emb:nn.Embedding = nn.Embedding.from_pretrained(con_weight)

In [None]:
class OnlyOneEmb(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:list|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words)
        context_emb:torch.Tensor = self.word_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.word_emb(neg_context) # [B, K, D]

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = -(pos_loss + neg_loss).mean()
        return loss
    
    def save_weight(self, path:str="SGNS_weights/"):
        word_weights = self.word_emb.weight.detach().cpu()
        torch.save(word_weights, path+'word_embedding.pt')

    def load_weight(self, path:str="SGNS_weights/", name_word_weights:str="word_embedding.pt"):
        word_weights = torch.load(path + name_word_weights)
        self.word_emb:nn.Embedding = nn.Embedding.from_pretrained(word_weights)

In [None]:
class SGNS_Weighted(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:list|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor, weights:List|torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.word_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.word_emb(neg_context) # [B, K, D]

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)
        loss = -((pos_loss + neg_loss) * weights).mean()
        
        return loss
    

In [None]:
class OneEmbWeightedTarget(nn.Module):
    """
    This class apply a weight to target word
    
    """
    def __init__(self, emb_size:int, embedding_dimension:int=15, 
                init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, 
                                                embedding_dim=self.emb_dim, device=device, sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:torch.Tensor, pos_context:torch.Tensor,
                neg_context:torch.Tensor, weights:torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.word_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.word_emb(neg_context) # [B, K, D]
        
        weights = weights.view(-1, 1)
        
        def weight_hook(grad):
            return grad * weights
            
        words_emb.register_hook(weight_hook)

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = -(pos_loss + neg_loss).mean()
        return loss

In [None]:
class WeightedTarget(nn.Module):
    """
    This class apply a weight to target word
    
    """
    def __init__(self, emb_size:int, embedding_dimension:int=15, 
                init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, 
                                                embedding_dim=self.emb_dim, device=device, sparse=sparse)
        
        self.con_size = embedding_dimension 
        self.con_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, 
                                                 embedding_dim=self.con_size, device=device,sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)
        self.con_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:torch.Tensor, pos_context:torch.Tensor,
                neg_context:torch.Tensor, weights:torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.con_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.con_emb(neg_context) # [B, K, D]
        
        weights = weights.view(-1, 1)
        
        def weight_hook(grad):
            return grad * weights
            
        words_emb.register_hook(weight_hook)

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = -(pos_loss + neg_loss).mean()
        return loss

In [None]:
class SGwithNorm(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, context_dimension:int|None=None, init_range:float|None=None, sparse:bool=True, device="cpu"):
        """Initialisation du modèle SkipGram
        Args:
            emb_size: La taille de l'embedding, ce nombre devrais être déterminé après le process sur les data, et dépend de la taille de la fenêtre glissante.
            embedding_dimension: La taille souhaité de l'embedding. Pour notre cas d'utilisation nous préférons une taille très petit
            context_dimension: Il n'est pas recommandé de mettre un entier mais de laisser a None.
        
        """
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)

        self.con_size = embedding_dimension if context_dimension is None else context_dimension
        self.con_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.con_size, device=device,sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)
        self.con_emb.weight.data.uniform_(-init_range, init_range)

        self.scale = nn.Parameter(torch.tensor(10.0, device=device))

    def forward(self, centrals_words:list|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor):
        """Fonction du forward pour le modèle SkipGramModel
        Args:
            centrals_words: Liste des ids des tokens des mots centraux [B]
            pos_context: Liste des ids des tokens des mots dans le contexte [B]
            neg_context: Liste des ids des tokens des mots non présent dans le contexte [B, K]
        """
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.con_emb(pos_context)   # [B, D]
        neg_emb:torch.Tensor = self.con_emb(neg_context)       # [B, K, D]
        words_norm = F.normalize(words_emb, p=2, dim=1)
        context_norm = F.normalize(context_emb, p=2, dim=1)
        neg_norm = F.normalize(neg_emb, p=2, dim=2)


        pos_dot = torch.sum(words_norm * context_norm, dim=1)
        pos_score = pos_dot * self.scale # Scale up
        pos_loss = F.logsigmoid(pos_score)

        neg_dot = torch.bmm(neg_norm, words_norm.unsqueeze(-1)).squeeze(2)
        neg_score = neg_dot * self.scale # Scale up
        neg_loss = F.logsigmoid(-neg_score).sum(1)


        loss = - (pos_loss + neg_loss).mean()
        return loss

In [None]:
def train_Word2Vec(modelW2V:nn.Module, dataLoader:DataLoader, optimizer:optim.Optimizer, epochs:int, verbal:bool=True, log_interval=100, device="cpu"):
    """Train a Word2Vec model whose ``forward`` returns the loss.

    Args:
        modelW2V: Model called as ``modelW2V(centers, pos, negs)``.
        dataLoader: Iterable yielding ``(centers, pos, negs)`` batches with
            shapes [B], [B] and [B, K].
        optimizer: Optimizer bound to the parameters of ``modelW2V``.
        epochs: Number of passes over ``dataLoader``.
        verbal: When true, print progress and per-epoch averages.
        log_interval: Print the running average every ``log_interval`` steps
            (a falsy value disables step logging).
        device: Device the batches are moved to before the forward pass.

    Returns:
        Dict with ``"loss_history"`` (the loss of every batch of every epoch)
        and ``"final_epoch_loss"`` (mean batch loss of the last epoch, NaN when
        ``epochs`` is 0).
    """
    # Kept outside the epoch loop so the history spans the whole run and the
    # step counter really is global.
    loss_history = []
    global_step = 0
    # Defined up front so that epochs == 0 returns instead of raising.
    avg_epoch_loss = float("nan")

    for epoch in range(1, epochs + 1):
        epoch_loss = 0.0
        batches = 0

        modelW2V.train()

        for batch in dataLoader:
            # centers: [B], pos: [B], negs: [B, K]
            centers, pos, negs = batch
            centers = centers.to(device)
            pos = pos.to(device)
            negs = negs.to(device)

            optimizer.zero_grad()
            loss = modelW2V(centers, pos, negs)
            loss.backward()

            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            loss_history.append(batch_loss)
            batches += 1
            global_step += 1

            if verbal and log_interval and (global_step % log_interval == 0):
                print(f"Epoch {epoch} Step {global_step} AvgLoss {epoch_loss / batches:.6f}")

        # max(1, ...) protects against an empty loader.
        avg_epoch_loss = epoch_loss / max(1, batches)
        if verbal : print(f"Epoch {epoch} finished. Avg loss: {avg_epoch_loss:.6f}")

    return {"loss_history": loss_history, "final_epoch_loss": avg_epoch_loss}

In [None]:
def cosine_similarity_matrix(embeddings:nn.Embedding) -> torch.Tensor:
    """Return the pairwise cosine similarities between the rows of an embedding.

    Args:
        embeddings: Embedding layer whose ``weight`` has one vector per row.

    Returns:
        Square tensor whose entry (i, j) is the cosine between rows i and j,
        computed on a detached copy of the weights.
    """
    unit_rows = F.normalize(embeddings.weight.detach(), p=2, dim=1)
    return unit_rows @ unit_rows.t()

def update_sim_history(words: list[str], idx: List[int], cos_sim_history:Dict, similarity_matrix):
    """Append the current similarity of every word pair to its history.

    Args:
        words: Words to track.
        idx: Row/column index of each word in ``similarity_matrix``
            (``idx[i]`` belongs to ``words[i]``).
        cos_sim_history: Nested mapping ``history[word_a][word_b] -> list``;
            it is modified in place.
        similarity_matrix: 2-D structure indexable as ``matrix[row, col]``
            holding cosine similarities.

    The cosine is rescaled from [-1, 1] to a percentage in [0, 100] and rounded
    to two decimals before being stored.
    """
    for row_pos, row_word in enumerate(words):
        row_id = idx[row_pos]
        for col_pos, col_word in enumerate(words):
            cosine = similarity_matrix[row_id, idx[col_pos]]
            percent = ((cosine + 1) / 2) * 100
            cos_sim_history[row_word][col_word].append(round(float(percent), 2))

def heat_map(words:List[str], similarity_matrix, figsize=(10, 8), save_file='tmp.png'):
    """Draw an annotated heat map of a similarity matrix and save it to disk.

    Args:
        words: Labels used for both axes.
        similarity_matrix: 2-D values to plot; the colour scale is fixed to
            the range 0-100.
        figsize: Size of the matplotlib figure.
        save_file: Path of the image written with ``plt.savefig``.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    # Start from a clean pyplot state so figures do not pile up between calls.
    plt.close('all')
    plt.figure(figsize=figsize)

    heatmap_options = dict(
        annot=True, fmt=".2f", cmap="magma",
        xticklabels=words, yticklabels=words,
        cbar=True, robust=False,
        vmin=0, vmax=100,
        square=False, linewidths=0.,
    )
    sns.heatmap(similarity_matrix, **heatmap_options)

    plt.title("Matrix de Similarité Cosinus")
    for set_axis_label in (plt.xlabel, plt.ylabel):
        set_axis_label("Mots", fontstyle="italic")
    plt.savefig(save_file)
    return plt

# 1re expérience
Sur le corpus généré par GPT5, comparer le SGNS avec deux embeddings vs un seul embedding.

In [None]:
# Build the SGNS dataset for the first experiment from the GPT5-generated
# French corpus. `pipe_data` is a project helper; the meaning of each option
# is defined there (NOTE(review): `remove_accent=True` and `keep_accent=False`
# look redundant - confirm against pipe_data).
# The stop-word list mixes French and English entries.
dataset:SGNS_store_DataSet = pipe_data(
    language="french",
    dataseteur=SGNS_store_DataSet,
    window_size = 3,
    nb_neg=5,
    subsample_thresh= 1,
    vocab_size_limit=None,
    file="data/GPT5v2.txt",
    remove_accent=True,
    remove_ponct=True,
    keep_accent= False,
    contraction_map=None,
    stop_words=["le", "les", "sur", "fait", "de", "et", "la", "des", "sont"] + \
    ["the", "your", "a", "rubber"]

)
# One (center, positive, negatives) sample per step, reshuffled every epoch.
data = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Words whose pairwise similarities are inspected after training.
k = ["chat", "chien", "animal", "animaux","train", "balle", "jouer"]

# Two-table SGNS with 3-D vectors; dense gradients so plain Adam can be used.
modelW2V:SkipGramModel = SkipGramModel(dataset.vocab_size, embedding_dimension=3, init_range=None, sparse=False)
# optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.01)
optimizer = torch.optim.Adam(modelW2V.parameters(), lr=0.01)

nb_epoch = 30
for _ in range(nb_epoch):
    for sentence_nb, (centers, pos, negs) in enumerate(data):
        optimizer.zero_grad()
        loss = modelW2V(centers, pos, negs)
        loss.backward()
        optimizer.step()

# Cosine similarities rescaled from [-1, 1] to [0, 100], then restricted to the
# rows and columns of the words in `k`.
# presumably dataset.encode(k) returns the ids of the words in k - verify in dataSet
similarity = cosine_similarity_matrix(modelW2V.word_emb)
m_to_h = similarity
m_to_h = ((m_to_h + 1) / 2) * 100
m_to_h_2 = m_to_h[dataset.encode(k),:]
m_to_h_2 = m_to_h_2[:, dataset.encode(k)]
# heat_map returns the pyplot module, so this rebinding keeps `plt` usable.
plt = heat_map(words=k, similarity_matrix=m_to_h_2)
plt.show()

# 3-D view of the learned central-word vectors with the words of `k` highlighted.
components_to_fig_3D(components=modelW2V.word_emb.weight.detach().cpu().numpy(),
    encoder=dataset.encoder,
    words_display=list(dataset.encoder.keys()),
    highlight_words=k,
    nb_neighbors=2, _min=-5, _max=5, base_color={}
)

In [None]:
# Same experiment with a single shared embedding table and sparse gradients.
modelW2V:OnlyOneEmb = OnlyOneEmb(dataset.vocab_size, embedding_dimension=3, init_range=None, sparse=True)
# optimizer = torch.optim.Adam(modelW2V.parameters(), lr=0.01)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.005)

# One snapshot of the embedding matrix per epoch, used by the animation below.
emb_hist = []
nb_epoch = 30


for _ in range(nb_epoch):
    for sentence_nb, (centers, pos, negs) in enumerate(data):
        optimizer.zero_grad()
        loss = modelW2V(centers, pos, negs)
        loss.backward()
        optimizer.step()
        
    # Copy so later updates of the weights do not alter the stored snapshot.
    w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
    emb_hist.append(w)

# Heat map of the similarities (rescaled to [0, 100]) between the words of `k`,
# which is defined in the previous training cell.
similarity = cosine_similarity_matrix(modelW2V.word_emb)
m_to_h = similarity
m_to_h = ((m_to_h + 1) / 2) * 100
m_to_h_2 = m_to_h[dataset.encode(k),:]
m_to_h_2 = m_to_h_2[:, dataset.encode(k)]
plt = heat_map(words=k, similarity_matrix=m_to_h_2)
plt.show()

In [None]:
# Pair of colours per highlighted word, passed to the animation helper
# (presumably word colour / neighbour colour - confirm in visuEmbedding).
base_colors = {
    'chat': ("blue",  "cyan"),
    'chien': ("goldenrod", "yellow"),
    'balle': ("green", "lightgreen"),
    "jouer": ("magenta", "pink")
}


# Animate the per-epoch snapshots collected in `emb_hist`.
fig = components_to_fig_3D_animation(
    history_components=emb_hist,
    encoder=dataset.encoder,
    highlight_words=["chat", "chien", "balle", "jouer", "animal", "animaux"],
    nb_neighbors=6, base_color=base_colors
)

# Persist both vocabulary mappings through the project helper.
tool.DicToJson(dataset.encoder, "data/encoder")
tool.DicToJson(dataset.decoder, "data/decoder")

# Test : fixer des vecteurs

In [None]:
# Single-table model with a wider initialisation range (0.5) for this test.
modelW2V:OnlyOneEmb = OnlyOneEmb(dataset.vocab_size, embedding_dimension=3, init_range=0.5, sparse=True)
# optimizer = torch.optim.Adam(modelW2V.parameters(), lr=0.01)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.005)


# Pin two words to hand-chosen positions before training starts.
with torch.no_grad():
    modelW2V.word_emb.weight[dataset.encoder['animal']] = torch.tensor([1, 0, 0.3])
    modelW2V.word_emb.weight[dataset.encoder['train']] = torch.tensor([-1, 0, 0])

emb_hist = []
nb_epoch = 10

# The first snapshot is the initial state, including the pinned vectors.
w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
emb_hist.append(w)
for _ in range(nb_epoch):
    for sentence_nb, (centers, pos, negs) in enumerate(data):
        optimizer.zero_grad()
        loss = modelW2V(centers, pos, negs)
        loss.backward()
        optimizer.step()
    # 'animal' is reset once per epoch, so it can still drift within an epoch.
    # NOTE(review): 'train' is pinned only before training and never reset
    # here - confirm whether that asymmetry is intended.
    with torch.no_grad():
        modelW2V.word_emb.weight[dataset.encoder['animal']] = torch.tensor([1, 0, 0.3])
        
    w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
    emb_hist.append(w)

# Heat map of the similarities (rescaled to [0, 100]) between the words of `k`.
similarity = cosine_similarity_matrix(modelW2V.word_emb)
m_to_h = similarity
m_to_h = ((m_to_h + 1) / 2) * 100
m_to_h_2 = m_to_h[dataset.encode(k),:]
m_to_h_2 = m_to_h_2[:, dataset.encode(k)]
plt = heat_map(words=k, similarity_matrix=m_to_h_2)
plt.show()

In [None]:
# Animate the snapshots of the pinned-vector run; relies on `base_colors`
# defined in an earlier cell.
fig = components_to_fig_3D_animation(
    history_components=emb_hist,
    encoder=dataset.encoder,
    highlight_words=["chat", "chien", "train", "jouer", "animal", "animaux"],
    nb_neighbors=6, base_color=base_colors
)

# Nouveau jeu de données : GoodNightGorilla
Corpus plus riche, en lien avec un livre pour enfants.

In [None]:
# Rebuild `dataset` and `data` from the English GoodNightGorilla corpus; this
# overwrites the objects of the first experiment. No stop words are removed.
dataset:SGNS_store_DataSet = pipe_data(
    language="english",
    dataseteur=SGNS_store_DataSet,
    window_size = 3,
    nb_neg=5,
    subsample_thresh= 1,
    vocab_size_limit=None,
    file="data/GoodNightGorilla.txt",
    remove_accent=True,
    remove_ponct=True,
    keep_accent= False,
    contraction_map=None,
    stop_words=[]

)
# One (center, positive, negatives) sample per step, reshuffled every epoch.
data = DataLoader(dataset, batch_size=1, shuffle=True)

In [None]:
# Number of batches produced by the loader.
print(len(data))

# Count how often each word appears as a central word over one pass.
counter = Counter()
for centers, _, _ in data:
    if hasattr(centers, "flatten"):
        ids = centers.flatten().tolist()
    else:
        ids = list(centers)
    counter.update(dataset.decoder[int(idx)] for idx in ids)

# Word -> count, ordered from the most to the least frequent.
freq_central_words = dict(counter.most_common())
print(freq_central_words)

In [None]:
# Train the single-table and the two-table models side by side on the same
# batches, each with its own SparseAdam optimizer.
modelW2V:OnlyOneEmb = OnlyOneEmb(dataset.vocab_size, embedding_dimension=3,
								init_range=None, sparse=True)
modelW2V_2Emb:SkipGramModel = SkipGramModel(dataset.vocab_size, embedding_dimension=3,
								init_range=None, sparse=True)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.005)
optimizer_2Emb = torch.optim.SparseAdam(modelW2V_2Emb.parameters(), lr=0.005)

# Reset here but not filled in this cell.
emb_hist = []
nb_epoch = 5

for _ in range(nb_epoch):
	for sentence_nb, (centers, pos, negs) in enumerate(data):
		optimizer.zero_grad()
		optimizer_2Emb.zero_grad()
		loss = modelW2V(centers, pos, negs)
		loss.backward()
		loss_2Emb = modelW2V_2Emb(centers, pos, negs)
		loss_2Emb.backward()
		optimizer.step()
		optimizer_2Emb.step()
 
# Loss of the last batch for the single-table model only.
print(loss)

In [None]:
# Words inspected on the GoodNightGorilla corpus.
k = ["gorilla", "animals", "mouse", "monkey", "he", "say", "the", "zookeeper"]

# Heat map of the single-table similarities, rescaled from [-1, 1] to [0, 100]
# and restricted to the words of `k`.
similarity = cosine_similarity_matrix(modelW2V.word_emb)
m_to_h = similarity
m_to_h = ((m_to_h + 1) / 2) * 100
m_to_h_2 = m_to_h[dataset.encode(k),:]
m_to_h_2 = m_to_h_2[:, dataset.encode(k)]
plt = heat_map(words=k, similarity_matrix=m_to_h_2)
plt.show()

In [None]:
def find_nearest_neighbors(vector_word:torch.Tensor, tensor:torch.Tensor, top_n:int=5):
    """Return the rows of ``tensor`` most similar to ``vector_word``.

    Args:
        vector_word: Query vector; reshaped to a single row.
        tensor: Matrix whose rows are compared with the query.
        top_n: Number of best-scoring rows to keep.

    Returns:
        ``pandas.Series`` of cosine similarities indexed by row position,
        sorted in decreasing order and truncated to ``top_n`` entries.
    """
    query = vector_word.reshape(1, -1)
    scores = cosine_similarity(tensor, query).flatten()
    return pd.Series(scores).sort_values(ascending=False).head(top_n)

In [None]:
# Pairwise cosine similarities between all learned word vectors.
matrix_of_similarity = cosine_similarity_matrix(modelW2V.word_emb)
word_a = "banana"
# NOTE(review): the query and the candidates are rows of the similarity matrix,
# not embedding vectors, so the scores compare similarity profiles - confirm
# this second-order similarity is intended.
# NOTE(review): dataset.encode receives a single string here but a list of
# words elsewhere - confirm it accepts both.
nearest_neighbors = find_nearest_neighbors(matrix_of_similarity[dataset.encode(word_a)], matrix_of_similarity,
                                            top_n=20)
# Replace row positions by the corresponding words.
nearest_neighbors = nearest_neighbors.rename(index=lambda x: dataset.decoder[x])
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Same neighbour query for another word; reuses `matrix_of_similarity` from the
# previous cell.
word_a = "gorilla"
nearest_neighbors = find_nearest_neighbors(matrix_of_similarity[dataset.encode(word_a)], matrix_of_similarity,
                                            top_n=20)
# Replace row positions by the corresponding words.
nearest_neighbors = nearest_neighbors.rename(index=lambda x: dataset.decoder[x])
print(f"Nearest Neighbors to '{word_a}':")
print(nearest_neighbors)

In [None]:
# Embedding matrix of the single-table model as a NumPy array (one row per id).
weight = modelW2V.word_emb.weight.detach().cpu().numpy()

In [None]:
# Project helper; presumably reports anisotropy statistics of the embedding
# matrix - see tool.analyser_anisotropie_advanced for the exact output.
tool.analyser_anisotropie_advanced(weight)

In [None]:
# NOTE(review): save_weight appends the file name to this string without a
# separator, so the file is written as
# "SGNS_weights/OneEmb/GoodNightGorillaword_embedding.pt" - confirm whether a
# trailing "/" or "_" was intended.
modelW2V.save_weight("SGNS_weights/OneEmb/GoodNightGorilla")
# Persist both vocabulary mappings through the project helper.
tool.DicToJson(dataset.encoder, 'data/encoder')
tool.DicToJson(dataset.decoder, 'data/decoder')

In [None]:
# L2 norm of every word vector of the single-table model.
norms = torch.linalg.vector_norm(modelW2V.word_emb.weight, dim=1)
print(norms.mean(), norms.std())

# Row i of the embedding belongs to token id i, so each norm is labelled through
# the id -> word decoder. Alphabetically sorting the encoder keys would only
# give the right labels if ids had been assigned in alphabetical order.
df_norm_vecteur = pd.DataFrame(norms.detach().cpu().numpy(), columns=['Norme des Vecteurs'],
                               index=[dataset.decoder[token_id] for token_id in range(norms.shape[0])])

In [None]:
# L2 norm of every central-word vector of the two-table model.
norms_2emb = torch.linalg.vector_norm(modelW2V_2Emb.word_emb.weight, dim=1)
print(norms_2emb.mean(), norms_2emb.std())

# Row i of the embedding belongs to token id i, so each norm is labelled through
# the id -> word decoder. Alphabetically sorting the encoder keys would only
# give the right labels if ids had been assigned in alphabetical order.
df_norm_vecteur_2emb = pd.DataFrame(norms_2emb.detach().cpu().numpy(), columns=['Norme des Vecteurs'],
                                    index=[dataset.decoder[token_id] for token_id in range(norms_2emb.shape[0])])

In [None]:
# Words highlighted in the 3-D view of the single-table embedding.
k = ["banana", "yellow", "mouse", "he", "the", "zookeeper"]

components_to_fig_3D(components=modelW2V.word_emb.weight.detach().cpu().numpy(),
    encoder=dataset.encoder,
    words_display=list(dataset.encoder.keys()),
    highlight_words=k,
    nb_neighbors=11, base_color={}
)

# Intonation

In [None]:
# Load the corpus annotated with intonation values. `data` previously held a
# DataLoader in this notebook; from here on it holds the prepared corpus.
data = prepare_data_with_intonation(
    file_path="./data/GoodNightGorilla_Intonation.txt",
    language='english',
    remove_accent=True,
    remove_punct=True,
    keep_apostrophes=False,
    # Contractions are collapsed into single tokens.
    contraction_map={
        "that's" : "thatis",
        "it's" : "itis",
        "don't": "donot",
        "doesn't": "doesnot",},
    stop_words=["s", "n't"],
    break_line=False
)

# Split the prepared corpus into parallel token and intonation sequences
# (iterated together sentence by sentence in a later cell).
texts, intonations = separate_text_intonation(data)
data_set = W2V_weighted_DataSet(sentences=texts, intonations=intonations)
# Batches of 16 samples, served in dataset order.
loader = DataLoader(data_set, batch_size=16, shuffle=False)

## Analyse data

In [None]:
# Unpack the (central, context, intonation) triples stored by the dataset into
# three parallel lists.
all_pair = data_set.pairs
word_central = []
word_context = []
word_intonat = []
for w_cen, w_con, into in all_pair:
    word_central.append(w_cen)
    word_context.append(w_con)
    word_intonat.append(into)

# Occurrence counts of each entry as central / as context element.
freq_cent = Counter(word_central)
freq_cont = Counter(word_context)
print(freq_cent.most_common())

In [None]:
# Weighted SGNS sized with the vocabulary of the intonation dataset.
modelW2V:SGNS_Weighted = SGNS_Weighted(len(data_set.encoder.values()), embedding_dimension=3)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.005)

nb_epoch = 5
# The history starts with the untrained embedding, then one snapshot per epoch.
w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
emb_hist = [w]

for _ in range(nb_epoch):
	for sentence_nb, (centers, pos, negs, intonation) in enumerate(loader):
		optimizer.zero_grad()
		# The intonation values of the batch are the per-pair loss weights.
		loss = modelW2V(centers, pos, negs, intonation)
		loss.backward()
		optimizer.step()
	w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
	emb_hist.append(w)
 
# Loss of the last batch.
print(loss)

In [None]:
# Animate the snapshots of the weighted SGNS run.
fig = components_to_fig_3D_animation(
    history_components=emb_hist,
    encoder=data_set.encoder,
    highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
    nb_neighbors=6
)

In [None]:
# NOTE(review): this call has the same arguments as the cell right before it -
# confirm whether the duplicate is needed.
fig = components_to_fig_3D_animation(
    history_components=emb_hist,
    encoder=data_set.encoder,
    highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
    nb_neighbors=6
)

In [None]:
# Drop every token whose intonation is 0 while keeping the sentence structure
# (a sentence that loses all of its tokens stays as an empty list).
text_without_0intonation = []
intonation_without_0intonation = []

for sentence_t, sentence_i in zip(texts, intonations):
    kept = [(token, value) for token, value in zip(sentence_t, sentence_i) if int(value) != 0]
    text_without_0intonation.append([token for token, _ in kept])
    intonation_without_0intonation.append([value for _, value in kept])

# Dataset and loader restricted to the tokens that carry an intonation.
data_set_2 = W2V_weighted_DataSet(sentences=text_without_0intonation, intonations=intonation_without_0intonation)
loader_2 = DataLoader(data_set_2, batch_size=64, shuffle=False)

In [None]:
# Train on the dataset from which zero-intonation tokens were removed. The model
# is sized with, and fed from, `data_set_2` / `loader_2`, the same dataset whose
# encoder is used to plot the result.
# Fall back to the CPU when no GPU is available.
device_w0 = "cuda" if torch.cuda.is_available() else "cpu"

modelW2V_w0:SGNS_Weighted = SGNS_Weighted(len(data_set_2.encoder), embedding_dimension=3, device=device_w0)
optimizer_w0 = torch.optim.SparseAdam(modelW2V_w0.parameters(), lr=0.01)

nb_epoch = 5
for _ in range(nb_epoch):
    for sentence_nb, (center, pos, negs, intonation) in enumerate(loader_2):
        center = center.to(device_w0)
        pos = pos.to(device_w0)
        negs = negs.to(device_w0)
        intonation = intonation.to(device_w0)
        optimizer_w0.zero_grad()
        # The intonation values of the batch are the per-pair loss weights.
        loss = modelW2V_w0(center, pos, negs, intonation)
        loss.backward()
        optimizer_w0.step()

# Loss of the last batch.
print(loss)

In [None]:
# Single-frame "animation": only the final embedding of modelW2V_w0 is shown,
# labelled with the encoder of the filtered dataset.
fig = components_to_fig_3D_animation(
    history_components=[modelW2V_w0.word_emb.weight.cpu().detach().numpy()],
    encoder=data_set_2.encoder,
    highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
    nb_neighbors=6
)

# Normalize intonation

In [None]:
# Reload the intonation corpus with the same options as in the previous section.
data = prepare_data_with_intonation(
    file_path="./data/GoodNightGorilla_Intonation.txt",
    language='english',
    remove_accent=True,
    remove_punct=True,
    keep_apostrophes=False,
    contraction_map={
        "that's" : "thatis",
        "it's" : "itis",
        "don't": "donot",
        "doesn't": "doesnot",},
    stop_words=["s", "n't"],
    break_line=False
)

texts, intonations = separate_text_intonation(data)

# Rescale the intonation values with the project helper
# (presumably to a band of width 1.5 around 2.0 - confirm in tool).
intonations = tool.normalize_range_center(intonations, range_normalize=1.5, center=2.0)
print(intonations)

# Rebuild `data_set` with 3 negatives per pair and a window of 6.
data_set:W2V_weighted_DataSet = W2V_weighted_DataSet(sentences=texts, intonations=intonations, nb_neg=3, window_size=6)
# Distribution computed by the dataset itself; it is replaced further down.
distribution = data_set.unigram_dist

# Flatten the tokenised sentences to count how often each token id occurs.
all_token = []
for sentence in data_set.tokens:
    all_token.extend(sentence)
    
freq = Counter(all_token)
# Frequency of each id, in id order (0 for ids that never occur).
freq_list = [freq.get(i, 0) for i in range(len(data_set.decoder.keys()))]

# Unigram distribution raised to the power 0.75, normalised to sum to 1.
unigram = torch.tensor([f**0.75 for f in freq_list], dtype=torch.float)
unigram = unigram / unigram.sum()

for w, idx in data_set.encoder.items():
    print(f"word :{w} frequencies : {freq_list[idx]}, probability {unigram[idx]}")

# Override the dataset's distribution with the one computed above
# (presumably used for negative sampling - confirm in W2V_weighted_DataSet).
data_set.unigram_dist = unigram

loader = DataLoader(data_set, batch_size=16, shuffle=True)

## Analyse data

In [None]:
# Decode the central and context element of every stored pair back to words.
all_pair = data_set.pairs
word_central = []
word_context = []
# Initialised for symmetry with the earlier analysis cell; not filled here.
word_intonat = []
for w_cen, w_con, into in all_pair:
    word_central.append(data_set.decode(w_cen))
    word_context.append(data_set.decode(w_con))

# Occurrence counts of each word as central / as context word.
freq_cent = Counter(word_central)
freq_cont = Counter(word_context)
print(freq_cent.most_common())
print(freq_cont.most_common())

In [None]:
word_importance = data_set.word_importance
   
# Collect every negative id served by the loader during one pass.
word_neg = []
for _, _, batch_neg, _ in loader:
    for neg in batch_neg:
        word_neg.extend(neg.tolist())

print(word_neg)
# Decode the ids to words and count how often each one was drawn as a negative.
word_neg = data_set.decode(word_neg)
freq_neg = Counter(word_neg)
# Same counts as a DataFrame sorted by decreasing frequency.
df_freq_neg = (pd.DataFrame.from_dict(freq_neg, orient='index', columns=['count'])
               .reset_index()
               .rename(columns={'index': 'word'})
               .sort_values('count', ascending=False)
               .reset_index(drop=True))
# Not the last expression of the cell, so this head() is not displayed.
df_freq_neg.head()
# Tuple holding the five most frequently sampled negative words.
print(next(zip(*freq_neg.most_common(5))))
freq_neg.most_common(5)


In [None]:
# Weighted SGNS on the dataset with normalised intonations; note the very small
# learning rate (1e-5).
modelW2V:SGNS_Weighted = SGNS_Weighted(len(data_set.encoder.values()), embedding_dimension=3)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.00001)

nb_epoch = 10
# The history starts with the untrained embedding, then one snapshot per epoch.
w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
emb_hist = [w]

for _ in range(nb_epoch):
	for sentence_nb, (centers, pos, negs, intonation) in enumerate(loader):
		optimizer.zero_grad()
		# The normalised intonation values are the per-pair loss weights.
		loss = modelW2V(centers, pos, negs, intonation)
		loss.backward()
		optimizer.step()
	w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
	emb_hist.append(w)
 
# Loss of the last batch.
print(loss)

In [None]:
# Pair of colours per highlighted word, passed to the animation helper.
base_colors = {
    'banana': ("yellow",  "lightyellow"),
    'gorilla': ("gray", "lightgray"),
    'zookeeper': ("brown", "sandybrown"),
    "little": ("pink", "lightpink"),
    "yellow": ("gold", "lightgoldenrodyellow")
}

# Disabled alternative: project each snapshot to 3 dimensions with PCA when the
# embedding has more than 3 dimensions.
# histo_emb_norm = [] # Normalisation PCA for embedding > 3
# for emb in emb_hist:
#     pca = PCA(n_components=3)
#     X = pca.fit_transform(emb)
#     histo_emb_norm.append(X)
    
# fig = components_to_fig_3D_animation(
#     history_components=histo_emb_norm,
#     encoder=data_set.encoder,
#     highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
#     nb_neighbors=1, base_color=base_colors
# )

# Highlight the five words most often drawn as negatives; `freq_neg` comes from
# the negative-sampling analysis cell.
fig = components_to_fig_3D_animation( # for embedding = 3
    history_components=emb_hist,
    encoder=data_set.encoder,
    highlight_words=next(zip(*freq_neg.most_common(5))),
    nb_neighbors=1, base_color=base_colors
)

In [None]:
# Human-readable list of the samples served by the loader:
# (central word, positive word, negative words, intonation weight).
pairs = []

for pair in loader:
    centers, pos, negs, intonation = pair
    for c, p, n, i in zip(centers, pos, negs, intonation):
        word_c = data_set.decoder[int(c)]
        word_p = data_set.decoder[int(p)]
        words_n = [data_set.decoder[int(idx)] for idx in n]
        pairs.append((word_c, word_p, words_n, float(i)))
        
        
# L2 norm of every word vector of the trained model.
norm_all_vec = torch.linalg.vector_norm(modelW2V.word_emb.weight, dim=1)
print(norm_all_vec.mean(), norm_all_vec.std())

# NOTE(review): the labels are right only if the encoder's key order matches
# the token ids (row i <-> id i) - confirm in W2V_weighted_DataSet.
df_norm_vecteur = pd.DataFrame(norm_all_vec.detach().numpy(), columns=['Norme des Vecteurs'], index=list(data_set.encoder.keys()))

In [None]:
# Fall back to the CPU when no GPU is available.
train_device = "cuda" if torch.cuda.is_available() else "cpu"

# The table needs one row per vocabulary entry. Sizing it with the number of
# training pairs creates rows that never receive a token id and breaks every
# later step that pairs embedding rows with encoder words.
modelW2V:OneEmbWeightedTarget = OneEmbWeightedTarget(len(data_set.encoder), embedding_dimension=3, device=train_device)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.0001)

nb_epoch = 50
# The history starts with the untrained embedding, then one snapshot per epoch.
w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
emb_hist = [w]

# Mean batch loss of each epoch.
all_loss = []

for _ in range(nb_epoch):
    loss_inter = []
    for sentence_nb, (centers, pos, negs, intonation) in enumerate(loader):
        optimizer.zero_grad()

        # The intonation values weight the gradient of the central words.
        intonation = intonation.float().to(train_device)

        centers = centers.to(train_device)
        pos = pos.to(train_device)
        negs = negs.to(train_device)

        loss = modelW2V(centers, pos, negs, intonation)
        loss.backward()
        loss_inter.append(float(loss))
        optimizer.step()
    all_loss.append(sum(loss_inter) / len(loss_inter))
    w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
    emb_hist.append(w)

# Loss of the last batch.
print(loss)

In [None]:
# Mean batch loss of each epoch of the previous training cell.
print(all_loss)

In [None]:
# Pair of colours per highlighted word, passed to the animation helper.
base_colors = {
    'banana': ("yellow",  "lightyellow"),
    'gorilla': ("gray", "lightgray"),
    'zookeeper': ("brown", "sandybrown"),
    "little": ("pink", "lightpink"),
    "yellow": ("gold", "lightgoldenrodyellow")
}

# Disabled alternative: project each snapshot to 3 dimensions with PCA when the
# embedding has more than 3 dimensions.
# histo_emb_norm = [] # Normalisation PCA for embedding > 3
# for emb in emb_hist:
#     pca = PCA(n_components=3)
#     X = pca.fit_transform(emb)
#     histo_emb_norm.append(X)
    
# fig = components_to_fig_3D_animation(
#     history_components=histo_emb_norm,
#     encoder=data_set.encoder,
#     highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
#     nb_neighbors=11, base_color=base_colors
# )

# Animate the per-epoch snapshots; "mouse" has no entry in base_colors.
fig = components_to_fig_3D_animation( # for embedding = 3
    history_components=emb_hist,
    encoder=data_set.encoder,
    highlight_words=["banana", "gorilla", "mouse", "little", "yellow", "zookeeper"],
    nb_neighbors=11, base_color=base_colors
)

In [None]:
# Human-readable list of the samples served by the loader:
# (central word, positive word, negative words, intonation weight).
pairs = []

for pair in loader:
    centers, pos, negs, intonation = pair
    for c, p, n, i in zip(centers, pos, negs, intonation):
        word_c = data_set.decoder[int(c)]
        word_p = data_set.decoder[int(p)]
        words_n = [data_set.decoder[int(idx)] for idx in n]
        pairs.append((word_c, word_p, words_n, float(i)))
        
        
# L2 norm of every word vector of the trained model.
norm_all_vec = torch.linalg.vector_norm(modelW2V.word_emb.weight, dim=1)
print(norm_all_vec.mean(), norm_all_vec.std())

# The DataFrame needs exactly one embedding row per encoder key.
# NOTE(review): the labels are right only if the encoder's key order matches
# the token ids (row i <-> id i) - confirm in W2V_weighted_DataSet.
df_norm_vecteur = pd.DataFrame(norm_all_vec.detach().cpu().numpy(), 
                               columns=['Norme des Vecteurs'], index=list(data_set.encoder.keys()))

In [None]:
# Fall back to the CPU when no GPU is available.
train_device = "cuda" if torch.cuda.is_available() else "cpu"

# Both tables need one row per vocabulary entry. Sizing them with the number of
# training pairs creates rows that never receive a token id and breaks every
# later step that pairs embedding rows with encoder words.
modelW2V:WeightedTarget = WeightedTarget(len(data_set.encoder), embedding_dimension=3, device=train_device)
optimizer = torch.optim.SparseAdam(modelW2V.parameters(), lr=0.001)

nb_epoch = 20
# The history starts with the untrained embedding, then one snapshot per epoch.
w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
emb_hist = [w]

for _ in range(nb_epoch):
    for sentence_nb, (centers, pos, negs, intonation) in enumerate(loader):
        optimizer.zero_grad()

        # The intonation values weight the gradient of the central words.
        intonation = intonation.float().to(train_device)

        centers = centers.to(train_device)
        pos = pos.to(train_device)
        negs = negs.to(train_device)

        loss = modelW2V(centers, pos, negs, intonation)
        loss.backward()
        optimizer.step()
    w = deepcopy(modelW2V.word_emb.weight.detach().cpu().numpy())
    emb_hist.append(w)

# Loss of the last batch.
print(loss)

In [None]:
# Pair of colours per highlighted word, passed to the animation helper.
base_colors = {
    'banana': ("yellow",  "lightyellow"),
    'gorilla': ("gray", "lightgray"),
    'zookeeper': ("brown", "sandybrown"),
    "little": ("pink", "lightpink"),
    "yellow": ("gold", "lightgoldenrodyellow")
}

# Disabled alternative: project each snapshot to 3 dimensions with PCA when the
# embedding has more than 3 dimensions.
# histo_emb_norm = [] # Normalisation PCA for embedding > 3
# for emb in emb_hist:
#     pca = PCA(n_components=3)
#     X = pca.fit_transform(emb)
#     histo_emb_norm.append(X)
    
# fig = components_to_fig_3D_animation(
#     history_components=histo_emb_norm,
#     encoder=data_set.encoder,
#     highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
#     nb_neighbors=1, base_color=base_colors
# )

# Animate the per-epoch snapshots of the central-word embedding.
fig = components_to_fig_3D_animation( # for embedding = 3
    history_components=emb_hist,
    encoder=data_set.encoder,
    highlight_words=["banana", "gorilla", "little", "yellow", "zookeeper"],
    nb_neighbors=11, base_color=base_colors
)