In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

import random

from copy import deepcopy

from visuEmbedding import interactive_embedding_plot_3D, components_to_fig_3D, components_to_fig_3D_animation

In [None]:
class FixeFeatureSkipGramModel(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, init_range:float|None=None, sparse:bool=True, device="cpu"):
        """Init out model
        Args:
            emb_size: This is the number of words that our embedding will contain. 
            embedding_dimension: Number of dimension which will represent our words
        
        """
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.con_size:int = embedding_dimension # Not our goal to modifie this parameter
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)
        self.con_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.con_size, device=device,sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)
        self.con_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:list[int]|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor):
        """Forward for SkipGramModel (To modify in futur)
        For now based in SNSG
        Args:
            centrals_words: Index of central word
            pos_context: Index of similar words to the central words
            neg_context: Index of negative words to the central words. There is an additional dimension because there may be several negative words.
        """
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.con_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.con_emb(neg_context) # [B, K, D]

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)
        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = - (pos_loss + neg_loss).mean()
        return loss
    
    def save_weight(self, path:str="SGNS_weights/"):
        """Save weight with name word_embedding.pt and con_embedding.pt
        Args :
            path: Folder to save.
        """
        word_weights = self.word_emb.weight.detach().cpu()
        con_weight = self.con_emb.weight.detach().cpu()
        torch.save(word_weights, path+'word_embedding.pt')
        torch.save(con_weight, path+'con_embedding.pt')

    def load_weight(self, path:str="SGNS_weights/", name_word_weights:str="word_embedding.pt", name_con_weights:str="con_embedding.pt"):
        """Load weights
        Args :
            path: Folder where are weights files
            name_word_weights: Name for central words
            name_con_weights: Name for context words
        """
        word_weights = torch.load(path + name_word_weights)
        con_weight = torch.load(path + name_con_weights)

        self.word_emb:nn.Embedding = nn.Embedding.from_pretrained(word_weights)
        self.con_emb:nn.Embedding = nn.Embedding.from_pretrained(con_weight)
        
    def __increase_embeddings(self, new_word_w:torch.Tensor, new_con_w:torch.Tensor):
        with torch.no_grad():
            updated_word_w = torch.cat([self.word_emb.weight.data, new_word_w.unsqueeze(0)], dim=0)
            updated_con_w = torch.cat([self.con_emb.weight.data, new_con_w.unsqueeze(0)], dim=0)

        self.word_emb = nn.Embedding.from_pretrained(updated_word_w, freeze=False, sparse=self.word_emb.sparse)
        self.con_emb = nn.Embedding.from_pretrained(updated_con_w, freeze=False, sparse=self.con_emb.sparse)
        self.emb_size+=1
        
    def extend_vocabulary(self, ref_central_vector: torch.Tensor, ref_context_vector: torch.Tensor, epsilon: float = 0.05):
        """
        Extends the vocabulary by adding new embeddings initialized close to a reference vector.
        
        Args:
            ref_vector: A tensor of shape (embedding_dimension,) to use as the "center".
            epsilon: The magnitude of the random noise.
        """
        
        noise_word = torch.randn(self.emb_dim) * epsilon
        noise_con = torch.randn(self.con_size) * epsilon
        
        new_word_w = ref_central_vector + noise_word
        new_con_w = ref_context_vector + noise_con
        
        self.__increase_embeddings(new_word_w, new_con_w)

In [None]:
class FixOnlyOneEmb(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, context_dimension:int|None=None, init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, embedding_dim=self.emb_dim, device=device, sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:list|torch.Tensor, pos_context:list|torch.Tensor, neg_context:list|torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words)
        context_emb:torch.Tensor = self.word_emb(pos_context)
        neg_emb:torch.Tensor = self.word_emb(neg_context)

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = -(pos_loss + neg_loss).mean()
        return loss
    
    def __increase_embeddings(self, new_word_w:torch.Tensor):
        with torch.no_grad():
            updated_word_w = torch.cat([self.word_emb.weight.data, new_word_w.unsqueeze(0)], dim=0)

        self.word_emb = nn.Embedding.from_pretrained(updated_word_w, freeze=False, sparse=self.word_emb.sparse)
        self.emb_size+=1
        
    def extend_vocabulary(self, ref_central_vector: torch.Tensor, epsilon: float = 0.05):
        """
        Extends the vocabulary by adding new embeddings initialized close to a reference vector.
        
        Args:
            ref_vector: A tensor of shape (embedding_dimension,) to use as the "center".
            epsilon: The magnitude of the random noise.
        """
        
        noise_word = torch.randn(self.emb_dim) * epsilon
        
        new_word_w = ref_central_vector + noise_word
        
        self.__increase_embeddings(new_word_w)

In [None]:
def increase_optimizer(optimizer:torch.optim.Optimizer, old_weight:torch.nn.parameter.Parameter, new_weight:torch.nn.parameter.Parameter):
    """Make ``optimizer`` track ``new_weight`` in place of ``old_weight``, keeping its state.

    Intended for an embedding table that was rebuilt with extra rows appended.
    Per-row state buffers (for example Adam's ``exp_avg``) are zero-padded along
    dimension 0 up to ``new_weight``'s row count, so the existing rows keep their
    statistics and the new rows start from zero.

    Args:
        optimizer: Optimizer currently holding ``old_weight``; modified in place.
        old_weight: Parameter object to replace (matched by identity).
        new_weight: Replacement parameter with at least as many rows as ``old_weight``.

    Returns:
        The same ``optimizer`` object.

    Raises:
        ValueError: If ``new_weight`` has fewer rows than ``old_weight``.
    """
    old_rows = old_weight.shape[0]
    new_rows = new_weight.shape[0]
    if new_rows < old_rows:
        raise ValueError(f"new_weight has {new_rows} rows but old_weight has {old_rows}")

    if old_weight in optimizer.state:
        old_state:dict = optimizer.state.pop(old_weight)
        new_state = {}

        for key, value in old_state.items():
            is_row_buffer = (
                isinstance(value, torch.Tensor)
                and value.dim() >= 1
                and value.shape[0] == old_rows
            )
            if is_row_buffer:
                # Grow along dim 0 only; trailing dimensions are kept as they are.
                padded = value.new_zeros((new_rows, *value.shape[1:]))
                padded[:old_rows] = value
                new_state[key] = padded
            else:
                # Scalars and 0-dim tensors (e.g. Adam's `step`) are carried over untouched.
                new_state[key] = value

        optimizer.state[new_weight] = new_state

    for group in optimizer.param_groups:
        params = group['params']
        for position, candidate in enumerate(params):
            # Identity test on purpose: `in` / `.index()` would compare tensors with `==`
            # and raise as soon as another multi-element parameter precedes `old_weight`.
            if candidate is old_weight:
                params[position] = new_weight

    return optimizer

In [3]:
# Scratch cell: inspect what a freshly built SparseAdam optimizer holds before any step.
# Positional arguments map to emb_size=2, embedding_dimension=3, init_range=1, sparse=True.
remove_me = FixeFeatureSkipGramModel(2, 3, 1, True)
# `parameters` is not called here, so this prints the bound method (whose repr embeds the module).
print(remove_me.parameters)
optimizer = torch.optim.SparseAdam(remove_me.parameters())

# Keep a handle on the Parameter object: it is the key used to index `optimizer.state`.
old_weight = remove_me.word_emb.weight
print(old_weight)
# No step has been taken yet; the recorded output for this lookup is an empty dict.
print(optimizer.state[old_weight])
# NOTE(review): the recorded state_dict shows an empty entry for parameter 0 only,
# presumably inserted by the lookup on the previous line - confirm.
print(optimizer.state_dict())
# Last expression of the cell, shown as the cell's output.
type(remove_me.word_emb.weight)

<bound method Module.parameters of FixeFeatureSkipGramModel(
  (word_emb): Embedding(2, 3, sparse=True)
  (con_emb): Embedding(2, 3, sparse=True)
)>
Parameter containing:
tensor([[ 0.0644,  0.2277,  0.2839],
        [ 0.3692, -0.0864, -0.4904]], requires_grad=True)
{}
{'state': {0: {}}, 'param_groups': [{'lr': 0.001, 'betas': (0.9, 0.999), 'eps': 1e-08, 'maximize': False, 'params': [0, 1]}]}


torch.nn.parameter.Parameter

In [None]:
# def extend_vocabulary(self, num_new_words: int, optimizer: torch.optim.Optimizer, ref_vector: torch.Tensor, epsilon: float = 0.05):
#         """
#         Extends vocabulary AND updates the optimizer to keep momentum.
#         """
#         device = self.word_emb.weight.device
        
#         # --- 1. PREPARE NEW WEIGHTS (Same as before) ---
#         ref_vector = ref_vector.to(device).view(1, -1)
        
#         noise_word = torch.randn(num_new_words, self.emb_dim, device=device) * epsilon
#         noise_con = torch.randn(num_new_words, self.con_size, device=device) * epsilon

#         new_word_part = ref_vector.expand(num_new_words, -1) + noise_word
#         new_con_part = ref_vector.expand(num_new_words, -1) + noise_con

#         # --- 2. CAPTURE OLD PARAMETERS ---
#         # We need references to the old tensor objects to find them in the optimizer
#         old_word_param = self.word_emb.weight
#         old_con_param = self.con_emb.weight

#         # --- 3. UPDATE MODEL (Create new larger embeddings) ---
#         with torch.no_grad():
#             updated_word_w = torch.cat([old_word_param.data, new_word_part], dim=0)
#             updated_con_w = torch.cat([old_con_param.data, new_con_part], dim=0)

#         self.word_emb = nn.Embedding.from_pretrained(updated_word_w, freeze=False, sparse=self.word_emb.sparse)
#         self.con_emb = nn.Embedding.from_pretrained(updated_con_w, freeze=False, sparse=self.con_emb.sparse)
        
#         self.emb_size += num_new_words

#         # --- 4. UPDATE OPTIMIZER STATE ---
#         # We define a helper to handle the nasty internal dictionary surgery
#         def patch_optimizer_state(optimizer, old_param, new_param, num_new_rows):
#             # 1. Check if the old parameter actually has state (momentum, etc.)
#             if old_param in optimizer.state:
#                 old_state = optimizer.state[old_param]
#                 new_state = {}
                
#                 # Iterate over buffers (e.g., 'exp_avg', 'exp_avg_sq' in Adam)
#                 for key, value in old_state.items():
#                     if isinstance(value, torch.Tensor):
#                         # Create a new buffer of zeros with the new larger size
#                         # We assume the state tensors match the param shape (which they usually do for embeddings)
#                         new_shape = (value.shape[0] + num_new_rows, value.shape[1])
#                         new_buffer = torch.zeros(new_shape, device=value.device, dtype=value.dtype)
                        
#                         # Copy the old momentum into the top part
#                         new_buffer[:value.shape[0]] = value
                        
#                         # The bottom part (new words) starts with 0 momentum (fresh start)
#                         new_state[key] = new_buffer
#                     else:
#                         # Copy non-tensor scalars (like 'step') directly
#                         new_state[key] = value
                
#                 # Assign this new state dictionary to the NEW parameter key
#                 optimizer.state[new_param] = new_state
                
#                 # Delete the old parameter from state
#                 del optimizer.state[old_param]

#             # 2. Update param_groups (The list the optimizer iterates over)
#             for group in optimizer.param_groups:
#                 # Replace the old param object with the new one in the list
#                 if old_param in group['params']:
#                     index = group['params'].index(old_param)
#                     group['params'][index] = new_param

#         # Apply the fix to both embeddings
#         patch_optimizer_state(optimizer, old_word_param, self.word_emb.weight, num_new_words)
#         patch_optimizer_state(optimizer, old_con_param, self.con_emb.weight, num_new_words)

#         print(f"Vocabulary extended to {self.emb_size}. Optimizer momentum preserved.")

In [5]:
# Demo: extend a 5-word model by one word placed next to word 0.
model = FixeFeatureSkipGramModel(emb_size=5, embedding_dimension=3, sparse=True)
# NOTE(review): this optimizer is built before extend_vocabulary, which assigns new
# embedding modules to the model, and it is not used again in this cell.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)

print(model.word_emb.weight)

# Overwrite row 0 of both tables in place; no_grad keeps the assignment out of autograd.
with torch.no_grad():    
    model.word_emb.weight[0] = torch.tensor([1, 1, 1], dtype=float)
    model.con_emb.weight[0] = torch.tensor([1, 1, 1], dtype=float)
    
print(model.word_emb.weight)


# Detached copy of row 0, used as the reference for both the central and the context vector.
ref_central:torch.Tensor = deepcopy(model.word_emb.weight[0].detach())
print(ref_central)

# Appends one row (index 5) to each table: reference vector plus noise scaled by epsilon.
model.extend_vocabulary(
    ref_central_vector=ref_central,
    ref_context_vector=ref_central,
    epsilon=0.001
)

print(model.word_emb.weight)

Parameter containing:
tensor([[-0.1386, -0.0928,  0.0609],
        [-0.0620, -0.0965, -0.0840],
        [ 0.1020, -0.0543,  0.0769],
        [ 0.0617, -0.0991,  0.1503],
        [-0.0475,  0.0610, -0.1395]], requires_grad=True)
Parameter containing:
tensor([[ 1.0000,  1.0000,  1.0000],
        [-0.0620, -0.0965, -0.0840],
        [ 0.1020, -0.0543,  0.0769],
        [ 0.0617, -0.0991,  0.1503],
        [-0.0475,  0.0610, -0.1395]], requires_grad=True)
tensor([1., 1., 1.])
Parameter containing:
tensor([[ 1.0000,  1.0000,  1.0000],
        [-0.0620, -0.0965, -0.0840],
        [ 0.1020, -0.0543,  0.0769],
        [ 0.0617, -0.0991,  0.1503],
        [-0.0475,  0.0610, -0.1395],
        [ 1.0002,  0.9987,  1.0003]], requires_grad=True)


In [6]:
def train_Word2Vec(modelW2V:nn.Module, dataLoader:Dataset, optimizer:optim.Optimizer, epochs:int, verbal:bool=True, log_interval=100, device="cpu"):
    """Train a Word2Vec-style model whose forward pass returns the loss.

    Args:
        modelW2V: Module called as ``modelW2V(centers, pos, negs)`` and returning a
            scalar loss tensor.
        dataLoader: Iterable yielding ``(centers, pos, negs)`` batches of tensors
            (centers: [B], pos: [B], negs: [B, K]). It is iterated once per epoch.
        optimizer: Optimizer already bound to the model parameters.
        epochs: Number of passes over ``dataLoader``.
        verbal: When True, print progress and per-epoch summaries.
        log_interval: Print a progress line every ``log_interval`` steps, counted
            across all epochs; a falsy value disables these lines.
        device: Device the batches are moved to. The model itself is not moved.

    Returns:
        dict with ``"loss_history"`` (per-batch losses of every epoch, in order)
        and ``"final_epoch_loss"`` (mean batch loss of the last epoch; 0.0 when
        no batch was processed).
    """
    # Both trackers span the whole run, so they live outside the epoch loop;
    # this also keeps the return statement valid when `epochs` is 0.
    loss_history = []
    global_step = 0
    avg_epoch_loss = 0.0

    for epoch in range(1, epochs + 1):
        epoch_loss = 0.0
        batches = 0

        modelW2V.train()

        for centers, pos, negs in dataLoader:
            centers = centers.to(device)
            pos = pos.to(device)
            negs = negs.to(device)

            optimizer.zero_grad()
            loss = modelW2V(centers, pos, negs)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            loss_history.append(batch_loss)
            batches += 1
            global_step += 1

            if verbal and log_interval and (global_step % log_interval == 0):
                print(f"Epoch {epoch} Step {global_step} AvgLoss {epoch_loss / batches:.6f}")

        # max(1, ...) avoids a division by zero when the loader yields nothing.
        avg_epoch_loss = epoch_loss / max(1, batches)
        if verbal:
            print(f"Epoch {epoch} finished. Avg loss: {avg_epoch_loss:.6f}")

    return {"loss_history": loss_history, "final_epoch_loss": avg_epoch_loss}

In [None]:
# Fixed (reference) words: A, B, C
# A = [-1, 1, 0], B = [0, -1, 1], C = [-1, 0, 1]
# NOTE(review): the training cell sets A, B, C to [1, 0, 0], [0, 1, 0], [0, 0, 1] instead - confirm which is intended.
# Non-fixed words: R, S, T, U, V, W, X, Y, Z
# NOTE(review): only R, S, T have an index in `encoder` below.
# Each sample is [central word, positive context word, [negative words]].
data1 = ['R', 'A', ['B', 'C', 'T']] 
data2 = ['S', 'A', ['B', 'C', 'T']]
data3 = ['T', 'A', ['B', 'S', 'R']]
data4 = ['S', 'R', ['B', 'C', 'T']]

# Goal: bring A, X, Y, Z closer together, with Z close to C and far from X, Y (X and Y close to each other).

# Word -> row index in the embedding tables.
encoder = {
    'A' : 0,
    'B' : 1,
    'C' : 2,
    'R' : 3,
    'S' : 4,
    'T' : 5
}


In [None]:
# Start from a 3-word vocabulary holding only the reference words A, B, C.
model = FixeFeatureSkipGramModel(emb_size=3, embedding_dimension=3, sparse=True)

# Pin the reference words to the canonical basis in both tables.
# NOTE(review): nothing below freezes these rows; they stay put only as long as the
# optimizer receives no gradient for them - confirm this is the intended meaning of "fixed".
fixed_vectors = {
    'A': [1.0, 0.0, 0.0],
    'B': [0.0, 1.0, 0.0],
    'C': [0.0, 0.0, 1.0],
}
with torch.no_grad():
    for word, vector in fixed_vectors.items():
        pinned = torch.tensor(vector)
        model.word_emb.weight[encoder[word]] = pinned
        model.con_emb.weight[encoder[word]] = pinned

print(model.word_emb.weight)

# Append three words (rows 3, 4, 5, i.e. 'R', 'S', 'T' in `encoder`), each a noisy copy
# of A's central vector.
# NOTE(review): the context reference also comes from word_emb, as in the original cell;
# the two tables hold the same value for A here - confirm con_emb was not intended.
for _ in range(3):
    anchor = model.word_emb.weight[encoder['A']].detach().clone()
    model.extend_vocabulary(
        ref_central_vector=anchor,
        ref_context_vector=anchor.clone(),
        epsilon=0.01,
    )

# Build the optimizer only once the vocabulary is final: extend_vocabulary replaces the
# embedding modules, so an optimizer created earlier would keep the old parameters.
optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.8)

print(model.word_emb.weight)

# Snapshot of the central embeddings before training; clone() keeps the plotted array
# independent of the weights the optimizer updates below.
fig = components_to_fig_3D(components=model.word_emb.weight.detach().clone().numpy(), encoder=encoder, highlight_words=["A", "R"], nb_neighbors=0)
fig.show()

# One optimisation step per hand-written (center, positive, negatives) sample.
for center, pos, negs in [data1, data2, data3, data4]:
    optimizer.zero_grad()

    # unsqueeze(0) adds the batch dimension: [1], [1] and [1, K].
    central_w = torch.tensor(encoder[center], dtype=torch.long).unsqueeze(0)
    pos_w = torch.tensor(encoder[pos], dtype=torch.long).unsqueeze(0)
    negs_w = torch.tensor([encoder[idx] for idx in negs], dtype=torch.long).unsqueeze(0)

    loss = model(central_w, pos_w, negs_w)
    loss.backward()

    optimizer.step()
    



Parameter containing:
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]], requires_grad=True)
Parameter containing:
tensor([[ 1.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  1.0000e+00],
        [ 9.8660e-01,  1.1191e-02, -6.1792e-04],
        [ 9.9958e-01,  1.3668e-02,  1.5306e-02],
        [ 1.0198e+00,  6.8537e-03, -6.0352e-03]], requires_grad=True)


defaultdict(<class 'dict'>, {Parameter containing:
tensor([[ 1.0000,  0.0000,  0.0000],
        [ 0.0000,  1.0000,  0.0000],
        [ 0.0000,  0.0000,  1.0000],
        [ 0.1866, -0.7888, -0.8006],
        [ 2.1842,  1.0219, -0.9292],
        [ 0.5087,  0.5179, -0.5171]], requires_grad=True): {'step': 4, 'exp_avg': tensor([[ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0473,  0.0499,  0.0492],
        [-0.0917, -0.0664,  0.1035],
        [ 0.1034, -0.0212,  0.0060]]), 'exp_avg_sq': tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.2388e-04, 2.4946e-04, 2.4173e-04],
        [5.2326e-04, 5.5898e-04, 1.8994e-03],
        [1.0698e-03, 4.4745e-05, 3.6299e-06]])}, Parameter containing:
tensor([[ 3.2337,  2.2298,  0.0785],
        [-2.8865, -1.9910,  0.1595],
        [-2.1330, -2.2945,  1.4541],
        [ 0.5158, -0.0604,  0.05

In [12]:

# Inspect the parameter groups (tracked tensors and hyper-parameters) held by the current optimizer.
print(optimizer.param_groups)


[{'params': [Parameter containing:
tensor([[ 1.0000,  0.0000,  0.0000],
        [ 0.0000,  1.0000,  0.0000],
        [ 0.0000,  0.0000,  1.0000],
        [ 0.1866, -0.7888, -0.8006],
        [ 2.1842,  1.0219, -0.9292],
        [ 0.5087,  0.5179, -0.5171]], requires_grad=True), Parameter containing:
tensor([[ 3.2337,  2.2298,  0.0785],
        [-2.8865, -1.9910,  0.1595],
        [-2.1330, -2.2945,  1.4541],
        [ 0.5158, -0.0604,  0.0597],
        [ 0.4876, -0.5130,  0.5091],
        [-1.2066, -2.1479,  0.6345]], requires_grad=True)], 'lr': 0.8, 'betas': (0.9, 0.999), 'eps': 1e-08, 'maximize': False}]


In [9]:
# Plot the context embedding table, highlighting A and the central word of data1.
# deepcopy hands the plotting helper its own copy of the array rather than the weights' buffer.
fig = components_to_fig_3D(components=deepcopy(model.con_emb.weight.detach().numpy()), encoder=encoder, highlight_words=["A", data1[0]], nb_neighbors=0)
fig.show()