In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics.pairwise import cosine_similarity

from copy import deepcopy

import numpy as np

import numpy as np
import plotly.graph_objects as go
import plotly.colors as pc

from typing import Sequence, Optional, Callable, List, Dict

from visuEmbedding import interactive_embedding_plot_3D, components_to_fig_3D, components_to_fig_3D_animation

In [None]:
# list.extend mutates the list in place and returns None, so this prints "None".
print([1, 2, 3].extend([4, 5, 6]))

In [None]:
def visualize_normalized_gradients(
    embeddings,
    gradients=None,
    embeddings_after:Optional[List]=None,
    labels=None,
    labels_after=None,
    visual_length=0.5,
    word_to_display:Optional[List]=None,
    _min=None,
    _max=None,
):
    """
    Plot embedding vectors, optional gradient arrows and later embedding states in 3D.

    Every row of ``embeddings`` is drawn as a gray segment from the origin to the
    point given by its first three components. When ``gradients`` is provided, a
    red arrow (line + cone) starts at the tip of each vector and spans
    ``gradients[i] * visual_length``. The gradients are only scaled, not
    normalized to unit length, so the arrow length still depends on the gradient
    magnitude; the unscaled norm is shown on hover.

    Args:
        embeddings: 2D array-like, one row per word (at least 3 columns).
        gradients: optional array-like indexed like ``embeddings``.
        embeddings_after: optional sequence of arrays laid out like
            ``embeddings``; each one is drawn as its own legend group, coloured
            along the Viridis scale in sequence order.
        labels: optional sequence of names for the rows; rows without a label
            fall back to their integer index.
        labels_after: optional legend-group names for ``embeddings_after``;
            missing entries fall back to the integer position.
        visual_length: multiplier applied to each gradient before drawing.
        word_to_display: labels shown by default; every other vector starts as
            ``'legendonly'``. ``None`` shows everything.
        _min, _max: axis bounds shared by the three axes. Each bound left to
            ``None`` is derived from the largest absolute coordinate of all
            embeddings (plus a 10% margin).

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    # Defaults are None instead of [] so no mutable object is shared between calls.
    embeddings_after = [] if embeddings_after is None else list(embeddings_after)
    labels = [] if labels is None else labels
    labels_after = [] if labels_after is None else labels_after

    fig = go.Figure()
    if _min is None or _max is None:
        # Symmetric cube around the origin that contains every embedding state.
        combined = np.vstack([*embeddings_after, embeddings])
        limit = np.max(np.abs(combined)) * 1.1
        if _min is None:
            _min = -limit
        if _max is None:
            _max = limit

    nb_label = len(labels)
    for i, emb in enumerate(embeddings):
        label = labels[i] if i < nb_label else i
        to_dis = (word_to_display is None or label in word_to_display)

        fig.add_trace(
            go.Scatter3d(
                x=[0, emb[0]],
                y=[0, emb[1]],
                z=[0, emb[2]],
                mode="lines+markers+text",
                marker=dict(size=3, color="black"),
                line=dict(color="gray", width=5),
                name=f"{label}",
                legendgroup="group_start",
                # The group title is attached to the first trace of the group only.
                legendgrouptitle_text="Initial State" if i == 0 else None,
                text=["", label],
                textposition="top center",
                textfont=dict(size=15, color="black"),
                hoverinfo="none",
                visible=True if to_dis else 'legendonly'
            )
        )

        if gradients is None:
            continue

        grad_vis = gradients[i]
        grad_end = (emb + grad_vis * visual_length)
        real_grad_norm = np.linalg.norm(grad_vis)  # Actual magnitude for hover info

        # Shaft of the arrow.
        fig.add_trace(
            go.Scatter3d(
                x=[emb[0], grad_end[0]],
                y=[emb[1], grad_end[1]],
                z=[emb[2], grad_end[2]],
                mode="lines",
                line=dict(color="red", width=4),
                showlegend=False,
                hoverinfo="text",
                text=f"Gradient Strength: {real_grad_norm:.4f}",
            )
        )

        # Head of the arrow.
        fig.add_trace(
            go.Cone(
                x=[grad_end[0]],
                y=[grad_end[1]],
                z=[grad_end[2]],
                u=[grad_vis[0]],
                v=[grad_vis[1]],
                w=[grad_vis[2]],
                showscale=False,
                sizemode="absolute",
                sizeref=0.5,  # Controls size of the arrow head
                anchor="tip",  # Tip of cone is at the end of the line
                colorscale=[[0, "red"], [1, "red"]],
                name="Gradient Dir",
            )
        )

    nb_after = len(embeddings_after)
    nb_label_after = len(labels_after)
    for i, embedding in enumerate(embeddings_after):
        # Fall back to the position when no (or not enough) group names were given.
        group_name = str(labels_after[i] if i < nb_label_after else i)

        # One colour per state: it only depends on i, so compute it once per group.
        scale_val = (0.3 + 0.7 * (i / (nb_after - 1))) if nb_after > 1 else 1
        color_code = pc.sample_colorscale('Viridis', [scale_val])[0]

        for j in range(len(embedding)):
            label = labels[j] if j < nb_label else j
            to_dis = (word_to_display is None or label in word_to_display)

            emb = embedding[j]

            fig.add_trace(
                go.Scatter3d(
                    x=[0, emb[0]],
                    y=[0, emb[1]],
                    z=[0, emb[2]],
                    mode="lines+markers+text",
                    name=f"{label}",
                    text=["", label],
                    textposition="top center",
                    legendgroup=group_name,
                    legendgrouptitle_text=group_name if j == 0 else None,
                    marker=dict(size=3, color="red"),
                    line=dict(color=color_code, width=5),
                    visible=True if to_dis else 'legendonly'
                )
            )

    fig.update_layout(
        width=1000,
        height=800,
        title="3D Gradient Direction (Normalized Length)",
        scene=dict(
            xaxis=dict(nticks=4, range=[_min, _max], title="X"),
            yaxis=dict(nticks=4, range=[_min, _max], title="Y"),
            zaxis=dict(nticks=4, range=[_min, _max], title="Z"),
        ),
        scene_aspectmode="cube",
        legend=dict(
            # Clicking a legend entry toggles that trace only, not its whole group.
            groupclick="toggleitem"
        )
    )
    fig.show()

In [None]:
# Scratch cell: comprehension filtering, pairwise iteration and sorting a dict by key.
l = [["a", "aaezfee" , "1zefczc", "TRZ", "f", "0", "c", "89"]]

# Flatten the nested list, keeping only the purely alphabetic strings.
all_str = [t for s in l for t in s if t.isalpha()]
print(all_str)
# Zipping one iterator with itself yields consecutive, non-overlapping pairs.
it = iter(l[0])
for x, y in zip(it, it):
    print(x, y)
    
    
d = {
    "a": 1,
    "b": 3,
    "c": 2
}

# The key 23 appears twice in this literal: the later value ("wait") replaces "Look".
d2 = {
    1 : "guessed",
    145: "inside",
    23 : "Look",
    21 : "Now",
    89 : "get",
    65 : "aren",
    23 : "wait",
    90 : "numpy",
    12 : "at",
    34 : "the",
    
}
    
# Values ordered by ascending key. Note that `l` is rebound from a list to a dict view.
l = dict(sorted(d2.items())).values()
print(l)

In [None]:
# Scratch cell: basic NumPy / PyTorch operations used later in the notebook.
print(type(np.array(1)))

A = np.ones((3,5))
print(A)

B = np.zeros(A.shape)
print(B)

# axis=0 stacks B below A (rows are appended).
print(np.concatenate((A, B), axis=0))

listest = [A]
listest.append(B)

print(listest)


tensorToMult = torch.tensor([1, 2, 2]) * 3
print(tensorToMult)

# A 1D dot product, then the same operation row by row (element-wise product summed over dim=1).
print("dot product : ", torch.tensor([1, 2, 2]).dot(torch.tensor([0, -2, 2])))
print("sum *, dim=1 : ", torch.sum(torch.tensor([[1, 2, 2], [2, 1, 2]]) * torch.tensor([[0, -2, 2], [3, 0, -2]]), dim=1))

# unsqueeze inserts a dimension of size 1 at the given position.
print("unsqueeze : ", torch.tensor([[1, 2, 2], [2, 1, 2]]).unsqueeze(-1))
print("size unsqueeze : ", torch.tensor([[1, 2, 2], [2, 1, 2]]).unsqueeze(0).size())

# Only the label is printed here; no bmm call is made.
print('bmm : ', )



In [None]:
# scikit-learn pairwise cosine similarity (inputs are 2D: one row per vector).
print(cosine_similarity([[0, 0, 0]], [[0, 0.4, 1]]))
print(cosine_similarity([[0, 0.4, 1]], [[1, 0, 1]]))
print(cosine_similarity([[1, 0, 1]], [[0, 0.4, 1]]))
print(cosine_similarity([[1, 0, 0]], [[-1, 0, 0]]))


tensor1 = torch.tensor([1, 2, 3])
tensor2 = torch.tensor([7, 8, 9])

# `*` between tensors is element-wise, not a dot product.
print(tensor1 * tensor2)

tensor3 = torch.tensor([[7., 8., 9.], [0., 1., 0.], [0., 0., 1.]])

# Norm of each row (dim=1 reduces over the columns).
torch.linalg.vector_norm(tensor3, dim=1)

In [None]:
# Hand-written demo data for visualize_normalized_gradients.
# dtype=float is the Python float type, i.e. 64-bit tensors.
man = torch.tensor([1, 0, 0], dtype=float)
woman = torch.tensor([1, 1, 0], dtype=float)
king = torch.tensor([1, 0, 1], dtype=float)

# Hard-coded gradient values, one per word above.
grad_man = torch.tensor([0.4621, 0.7311, -0.2689], dtype=float)
grad_woman = torch.tensor([0.7311, 0.0000, 0.0000], dtype=float)
grad_king = torch.tensor([-0.2689, -0.0000, -0.0000], dtype=float)

# Two made-up "later" states of the same three words.
man2 = torch.tensor([0.5, 0, 0], dtype=float)
woman2 = torch.tensor([1, 3, 4], dtype=float)
king2 = torch.tensor([1, 0, 1], dtype=float)

man3 = torch.tensor([0.1, 0, 10], dtype=float)
woman3 = torch.tensor([3, 4, 0], dtype=float)
king3 = torch.tensor([1, 0, 1], dtype=float)

# Stack them
emb_weights = torch.stack([man, woman, king]).numpy()
grads = torch.stack([grad_man, grad_woman, grad_king]).numpy()
emb_after2 = torch.stack([man2, woman2, king2]).numpy()
emb_after3 = torch.stack([man3, woman3, king3]).numpy()

print(grads)
# Four labels for three rows: "queen" has no vector in this demo.
labels = ["man", "woman", "king", "queen"]

# Only "man" is visible at first; the other words can be enabled from the legend.
visualize_normalized_gradients(
    emb_weights,
    grads,
    embeddings_after=[emb_after2, emb_after3],
    labels=labels,
    word_to_display=["man"],
    visual_length=0.2,
)

In [None]:
# Seed the RNG so the random initial embeddings (and every figure and number
# derived from them below) are identical after Restart Kernel -> Run All.
torch.manual_seed(0)

# 10 three-dimensional vectors drawn uniformly from [-1, 1].
emb_init = nn.Embedding(10, 3)
nn.init.uniform_(emb_init.weight, -1, 1)
facteur = 1
# Alternative hand-placed initialisation, kept for reference:
# with torch.no_grad():
#     emb_init.weight[0] = torch.tensor([5, 0, -1], dtype=float, requires_grad=True) * facteur # Man
#     emb_init.weight[1] = torch.tensor([-5, 0, 1], dtype=float, requires_grad=True) * facteur # Woman
#     emb_init.weight[2] = torch.tensor([5, 5, 1], dtype=float, requires_grad=True) * facteur # King
#     emb_init.weight[3] = torch.tensor([5, -5, -1], dtype=float, requires_grad=True) * facteur # Farmer
    
    
# visualize_normalized_gradients(
#     embeddings=emb_init.weight.detach().numpy(), 
#     embeddings_after=[],
#     labels=["homme", "femme", "roi", "fermier"],
#     word_to_display=["homme", "femme", "roi", "fermier"],
#     gradients=None, labels_after=None,
# )

# print("Similar pair :")
# print(f"Loss for man and king : {F.logsigmoid(emb_init.weight[0].dot(emb_init.weight[2]))} \
#       (dot product {emb_init.weight[0].dot(emb_init.weight[2])})")
# print(f"Loss for man and farmer : {F.logsigmoid(emb_init.weight[0].dot(emb_init.weight[3]))} \
#       (dot product {emb_init.weight[0].dot(emb_init.weight[3])})")

# print("Disimilar pair :")
# print(f"Loss for king and farmer : {F.logsigmoid(emb_init.weight[2].dot(-emb_init.weight[3]))} \
#       (dot product {emb_init.weight[2].dot(-emb_init.weight[3])})")


In [None]:
# Three independent copies of the same initial embedding, one per update
# strategy compared below (batch, pair by pair, summed loss).
emb = deepcopy(emb_init)
emb2 = deepcopy(emb_init)
emb3 = deepcopy(emb_init)

# Weight history of each copy; index 0 is the shared initial state.
# .copy() is required because .numpy() shares memory with the parameter.
histo = [emb.weight.detach().numpy().copy()]
histo2 = [emb2.weight.detach().numpy().copy()]
histo3 = [emb3.weight.detach().numpy().copy()]

# Adam alternative, kept for reference:
# opti = torch.optim.Adam(emb.parameters(), lr=10.8)
# opti2 = torch.optim.Adam(emb2.parameters(), lr=10.8)
# opti3 = torch.optim.Adam(emb3.parameters(), lr=10.8)

opti = torch.optim.SGD(emb.parameters(), lr=0.8)
opti2 = torch.optim.SGD(emb2.parameters(), lr=0.8)
opti3 = torch.optim.SGD(emb3.parameters(), lr=0.8)


In [None]:
# Row-wise cosine similarity of the weight matrix with itself: each row is
# compared with the same row, not with the other rows.
loss = (F.cosine_similarity(emb.weight, emb.weight))
print(loss)
# scikit-learn version: full pairwise similarity matrix between all rows.
cosine_similarity(emb.weight.detach().numpy())


In [None]:
opti.zero_grad()
opti2.zero_grad()
opti3.zero_grad()

############################################################ Batch (moyen grad)
loss = -(F.logsigmoid(torch.sum(emb(torch.tensor([0, 1])) * emb(torch.tensor([2, 2])), dim=1))).mean()
print("Batch Loss : ", loss)
loss.backward()
print('here')
print(emb.weight.grad)
opti.step()
histo.append(deepcopy(emb.weight.detach().numpy()))

############################################################ Epoch
loss = -F.logsigmoid(torch.sum(emb2(torch.tensor([0])) * emb2(torch.tensor([2])), dim=1))
print("Epoch Loss (1): ", loss)
loss.backward()
opti2.step()
opti2.zero_grad()
loss = -F.logsigmoid(torch.sum(emb2(torch.tensor([1])) * emb2(torch.tensor([2])), dim=1))
print("Epoch Loss (2) : ", loss)
loss.backward()
opti2.step()
histo2.append(deepcopy(emb2.weight.detach().numpy()))

############################################################ Sum
loss = -(F.logsigmoid(torch.sum(emb3(torch.tensor([0])) * emb3(torch.tensor([2])), dim=1)) 
         + F.logsigmoid(torch.sum(emb3(torch.tensor([1])) * emb3(torch.tensor([2])), dim=1))).mean()
print("Sum Loss : ", loss)
loss.backward()

opti3.step()
histo3.append(deepcopy(emb3.weight.detach().numpy()))

############################################################ Poids
print("Weight init :")
print(emb_init.weight.detach().numpy()[0:3])
print("Weight after batch :")
print(emb.weight.detach().numpy()[0:3])
print("Weight after epoch by epoch :")
print(emb2.weight.detach().numpy()[0:3])
print("Weight after sum of loss :")
print(emb3.weight.detach().numpy()[0:3])


In [None]:
# Initial vectors against the latest state of each strategy (batch, pair by pair, sum).
# labels_after is None, so the legend groups are named by position.
visualize_normalized_gradients(
    embeddings=emb_init.weight.detach().numpy(), 
    embeddings_after=[histo[-1], histo2[-1], histo3[-1]],
    labels=["homme", "femme", "roi"],
    word_to_display=["homme", "femme", "roi"],
    gradients=None, labels_after=None,
)

In [None]:
# Full weight history of the "sum of losses" strategy, one legend group per recorded state.
visualize_normalized_gradients(
    embeddings=emb_init.weight.detach().numpy(), 
    embeddings_after=histo3,
    labels=["homme", "femme", "roi"],
    word_to_display=["homme", "femme", "roi"],
    gradients=None, labels_after=None,
)
    # embeddings=histo3[-2], 
    # embeddings_after=[histo3[-1]],

In [None]:
# NOTE(review): presumably a deliberate stop so that "Run All" halts here — confirm it is still wanted.
raise SystemError

In [None]:
# Gradients currently stored on the "sum" (emb3) and "pair by pair" (emb2) copies.
print("grad :", emb3.weight.grad)
print("grad :", emb2.weight.grad)

# Same summed loss as in the comparison cell, re-evaluated on the current emb3 weights.
loss = -(F.logsigmoid(torch.sum(emb3(torch.tensor([0])) * emb3(torch.tensor([2])), dim=1)) 
         + F.logsigmoid(torch.sum(emb3(torch.tensor([1])) * emb3(torch.tensor([2])), dim=1))).mean()
print(loss)


In [None]:
# Dot products of the two trained pairs, written as an element-wise product followed by a sum.
print("Dot product between w1 and w2", torch.sum(emb3.weight[1] * emb3.weight[2]))
print("Dot product between w0 and w2", torch.sum(emb3.weight[0] * emb3.weight[2]))


In [None]:
# Hand-built toy vectors used to illustrate how the dot product behaves.
man = torch.tensor([1, 0, 0], dtype=float)
woman = torch.tensor([-1, 0, 0], dtype=float)
royalty = torch.tensor([0, 0, 1], dtype=float)

king = torch.tensor([1, 0, 1], dtype=float)
queen = torch.tensor([-1, 0, 1], dtype=float)

# We have [masculine, feminine, royalty]
print("homme = king - royalty")
print("king - royalty : ", king - royalty)

# Three dimensions are enough to encode five words; each dimension acts as a kind of "meaning/concept".

# Two words that share a meaning and have no difference:
print("homme * king : ", man.dot(king).numpy())
print("king * homme : ", king.dot(man).numpy())

# Two words that are different:
print("homme * femme", man.dot(woman).numpy())
print("homme * queen", man.dot(queen).numpy())

# Two words that have nothing in common (orthogonal)
print("homme * royalty", man.dot(royalty).numpy())


# Same five vectors, in the order man, woman, royalty, king, queen.
visualize_normalized_gradients(
    embeddings=np.array([[1, 0, 0],
                        [-1, 0, 0],
                        [0, 0, 1],
                        [1, 0, 1],
                        [-1, 0, 1]]),
    labels=["man", "woman", "royalty", "king", "queen"]
    )

$$
J = - \log \sigma(\mathbf{u}_c \cdot \mathbf{v}_w) - \sum_{i=1}^{k} \log \sigma(- \mathbf{u}_{n_i} \cdot \mathbf{v}_w)
$$

$$
- \log \sigma(\mathbf{u}_c \cdot \mathbf{v}_w)
$$


In [None]:
# Positive-pair term of the skip-gram loss: -log(sigmoid(u . v)).
# The value is negated so that it matches its label (the same line was printed twice before).
print("-log(sigmoid(homme * king))", -F.logsigmoid(man.dot(king)))

In [None]:
# Observation: a word paired with itself still has a non-zero loss.
# The value is negated so that it matches its "-log(...)" label.
print("-log(sigmoid(homme * homme))", -F.logsigmoid(man.dot(man)))

In [None]:
# Variants of the toy vectors with much larger coordinates, used below to
# compare cosine similarity and log-sigmoid on large dot products.
man2 = torch.tensor([100, 0, 0], dtype=float)
woman2 = torch.tensor([10, 100, 0], dtype=float)
royalty2 = torch.tensor([0, 0, 100], dtype=float)

king2 = torch.tensor([100, 0, 100], dtype=float)
queen2 = torch.tensor([0, 100, 100], dtype=float)

In [None]:
# The label now names the pair that is actually computed (man . woman).
print("-log(sigmoid(homme * femme))", -F.logsigmoid(man.dot(woman)))

![image.png](formuleSG.png)

La formule de base du Skip-Gram (SG).

In [None]:
# `@` between two 1D tensors is the dot product.
print(queen @ man)

![image.png](cosinSim.png)

Formule de la similarité cosinus
(avec un epsilon pour éviter la division par 0).

In [None]:
# Cosine similarity next to the log-sigmoid of the dot product for the same pairs.
# cosine_similarity reduces over dim=1 by default, hence the unsqueeze(0) to get [1, 3] inputs.
# NOTE(review): the values labelled "loss" are F.logsigmoid(...) without the leading
# minus sign, i.e. the negative of the loss written in the formula above.
print("la similarité entre homme et homme", nn.functional.cosine_similarity(man.unsqueeze(0), man.unsqueeze(0)))
print("la similarité entre homme et roi ", nn.functional.cosine_similarity(man.unsqueeze(0), king.unsqueeze(0)))
print("la similarité entre homme et femme ", nn.functional.cosine_similarity(man.unsqueeze(0), woman.unsqueeze(0)))
print("la loss entre homme et homme", F.logsigmoid(man.dot(man)))
print("la loss entre homme et roi", F.logsigmoid(man.dot(king)))
print("la loss entre homme et femme (similaire) ", F.logsigmoid(man.dot(woman)))
print("la loss entre homme et femme (dissimilaire)", F.logsigmoid(woman.dot(-man)))


# Same comparison with the large-coordinate vectors (man2, woman2, king2).
print("\nValeur plus grandes")
print("la similarité entre man et man", nn.functional.cosine_similarity(man2.unsqueeze(0), man2.unsqueeze(0)))
print("la similarité entre man et roi ", nn.functional.cosine_similarity(man2.unsqueeze(0), king2.unsqueeze(0)))
print("la loss entre homme et homme", F.logsigmoid(man2.dot(man2)))
print("la loss entre homme et roi", F.logsigmoid(man2.dot(king2)))
print("la loss entre homme et femme (pos) ", F.logsigmoid(man2.dot(woman2)))
print("la loss entre homme et femme (négatif)", F.logsigmoid((-man2).dot(woman2)))

On remarque que la fonction de loss n'est pas égale à la similarité. La loss a pour objectif d'être la plus « simple » possible, afin de pouvoir traiter les mots le plus rapidement possible.  

# Init embedding

In [None]:
# Three hand-placed vectors loaded into a 3-word embedding table.
man3 = torch.tensor([1, 0, 0], dtype=float, requires_grad=True)
woman3 = torch.tensor([1, 1, 0], dtype=float, requires_grad=True)
king3 = torch.tensor([1, 0, 1], dtype=float, requires_grad=True)

# Word -> row index in the embedding table.
encoder = {
    "man": 0,
    "woman": 1,
    "king": 2,
}
base_color = {}

emb = nn.Embedding(3, 3)

# In-place row assignment on a parameter has to happen outside of autograd.
with torch.no_grad():
    emb.weight[0] = man3
    emb.weight[1] = woman3
    emb.weight[2] = king3
# Independent NumPy snapshot of the starting weights.
init_emb = deepcopy(emb.weight.detach().numpy())

# components_to_fig_3D comes from the local visuEmbedding module; its exact
# behaviour is not visible here (presumably it returns a Plotly figure — TODO confirm).
fig = components_to_fig_3D(
    components=deepcopy(emb.weight.detach().numpy()),
    encoder=encoder,
    highlight_words=["man", "woman", "king"],
    nb_neighbors=0,
    base_color=base_color
)
fig.show()


# Compute loss and see gradient

In [None]:
# Clear any gradient left from a previous run so that re-executing this cell
# does not accumulate onto it.
emb.zero_grad()

# Negative-sampling loss for the target "man" (row 0):
#   positive pair (man, king)  -> log sigmoid( man . king)
#   negative pair (man, woman) -> log sigmoid(-man . woman)
loss = -(
    F.logsigmoid(emb.weight[0].dot(emb.weight[2]))
    + F.logsigmoid(-emb.weight[0].dot(emb.weight[1]))
).mean()
print("loss for man similar to king and dissimilar to woman", loss)
loss.backward(retain_graph=None)

print(emb.weight.grad.clone())

print(
    "Cosine similarity man and woman ",
    nn.functional.cosine_similarity(man3.unsqueeze(0), woman3.unsqueeze(0)),
)
print(
    "Cosine similarity man and king",
    # king3 (this section's vector) rather than the unrelated `king` left over from an earlier cell.
    nn.functional.cosine_similarity(man3.unsqueeze(0), king3.unsqueeze(0)),
)
print(
    "Cosine similarity woman and king",
    nn.functional.cosine_similarity(woman3.unsqueeze(0), king3.unsqueeze(0)),
)

# Plot the descent direction (minus the gradient), i.e. where an SGD step moves each vector.
gradient = -emb.weight.grad.detach().numpy()
visualize_normalized_gradients(
    emb.weight.detach().numpy(), gradients=gradient, labels=labels, visual_length=1
)

In [None]:
# One optimizer step from the same weights and the same gradient, once with
# SGD and once with Adam.
embAdam = deepcopy(emb)
embSGD = deepcopy(emb)

# Each copy gets its own gradient tensor so that nothing is shared between
# the two optimizers and the original embedding.
embAdam.weight.grad = emb.weight.grad.clone()
embSGD.weight.grad = emb.weight.grad.clone()

opti = torch.optim.SGD(embSGD.parameters(), lr=1.6)
opti.step()

opti = torch.optim.Adam(embAdam.parameters(), lr=1)
opti.step()

visualize_normalized_gradients(
    embeddings=init_emb,
    embeddings_after=[embSGD.weight.detach().numpy(), embAdam.weight.detach().numpy()],
    gradients=gradient,
    labels=labels,
    labels_after=["SGD", "ADAM"],
    visual_length=0.2,
)


In [None]:
# Display the current weights of `emb` as a NumPy array.
emb.weight.detach().numpy()

# Initialisation et raffinage
On va essayer de placer intelligemment un nouveau mot (Queen) par rapport à des mots déjà placés : Man, Woman, King, Royalty.

## Prémisse : avoir des vecteurs stables ?
La première question est : quels vecteurs sont stables par rapport à la loss et à l'optimiseur ?  
- Pour savoir si nos vecteurs sont stables, il faut définir un objectif.  
L'objectif que je propose est donc d'avoir les paires similaires suivantes : roi et homme | roi et royalty  
Les paires différentes suivantes : homme et femme | roi et femme  
Et les paires sans rapport : homme et royalty | femme et royalty

Deuxième question (on peut l'ignorer pour l'instant si c'est gênant) : est-ce que le modèle peut avoir des dimensions non utilisées ?


In [None]:
# Global scale applied to the hand-placed vectors.
facteur = 0.1

man = torch.tensor([5, 0, -1], dtype=float, requires_grad=True) * facteur
woman = torch.tensor([-5, 0, -1], dtype=float, requires_grad=True) * facteur
king = torch.tensor([5, 5, 1], dtype=float, requires_grad=True) * facteur
royalty = torch.tensor([1, 5, -5], dtype=float, requires_grad=True) * facteur

# Word -> row index. "queen" and "queen2" are reserved here and placed in a later cell.
encoder = {
    "man": 0,
    "woman": 1,
    "king": 2,
    "royalty": 3,
    "queen": 4,
    "queen2": 5
}

# 10 rows: only rows 0-3 are overwritten below, the others keep their random initial values.
emb = nn.Embedding(10, 3)

labels = list(encoder.keys())

with torch.no_grad():
    emb.weight[0] = man
    emb.weight[1] = woman
    emb.weight[2] = king
    emb.weight[3] = royalty
    
init_emb = deepcopy(emb.weight.detach().numpy())

# Check the stability of the placement.
# Inner subscripts use single quotes so the f-strings also parse before Python 3.12.
print("Similar pair :")
print(f"Loss for man and king : {F.logsigmoid(emb.weight[encoder['man']].dot(emb.weight[encoder['king']]))} (dot product {emb.weight[encoder['man']].dot(emb.weight[encoder['king']])})")
print(f"Loss for king and royalty : {F.logsigmoid(emb.weight[encoder['king']].dot(emb.weight[encoder['royalty']]))} (dot product {emb.weight[encoder['king']].dot(emb.weight[encoder['royalty']])})")
print("Dissimilar Pair -v * u")
print(f"Loss for man and woman : {F.logsigmoid(emb.weight[encoder['woman']].dot(-emb.weight[encoder['man']]))} (dot product {emb.weight[encoder['woman']].dot(-emb.weight[encoder['man']])})")
print(f"Loss for king and woman : {F.logsigmoid(emb.weight[encoder['woman']].dot(-emb.weight[encoder['king']]))} (dot product {emb.weight[encoder['woman']].dot(-emb.weight[encoder['king']])})")
print("Neutral pair objectif no loss but dot product :")
# Neutral pairs: print the plain dot product (it was previously printed with its sign flipped).
print(f"Loss for man and royalty : dot product {emb.weight[encoder['man']].dot(emb.weight[encoder['royalty']])}")
print(f"Loss for woman and royalty : dot product {emb.weight[encoder['woman']].dot(emb.weight[encoder['royalty']])}")

In [None]:
print(labels)
# Plot the current table. Rows without a label are named by their index and,
# since word_to_display only contains the labels, start hidden ('legendonly').
# embeddings_after is empty, so labels_after has no effect here.
visualize_normalized_gradients(
    emb.weight.detach().numpy(),
    labels=labels, gradients=None, 
    embeddings_after=[],
    labels_after= ['SGD', 'ADAM'],
    word_to_display=labels,
    visual_length=2.6
)

## Première idée
Le but est de voir comment, après un premier placement, déplacer correctement un nouveau mot.  
Premier scénario, on ajoute queen :  
On place queen proche de woman, puis on rend ce mot similaire à royalty.

In [None]:
# Place the new word: "queen" is a shorter vector along woman's direction,
# "queen2" has exactly woman's coordinates.
queen = torch.tensor([-2, 0, -0.4], dtype=float, requires_grad=True) * facteur
queen2 = torch.tensor([-5, 0, -1], dtype=float, requires_grad=True) * facteur

with torch.no_grad():
    emb.weight[encoder["queen"]] = queen
    emb.weight[encoder["queen2"]] = queen2

# Positive-pair loss making "queen" similar to "royalty".
loss = -(F.logsigmoid(emb.weight[encoder['queen']].dot(emb.weight[encoder["royalty"]])))

print(loss)
loss.backward(retain_graph=None)
grad1 = emb.weight.grad.clone().numpy()

# Two copies of the table, each given its own copy of the gradient just computed.
emb_SGD = deepcopy(emb)
emb_SGD.weight.grad = deepcopy(emb.weight.grad)

emb_ADAM = deepcopy(emb)
emb_ADAM.weight.grad = deepcopy(emb.weight.grad)

opti_SGD = torch.optim.SGD(emb_SGD.parameters(), lr=1)
opti_SGD.step()

# NOTE(review): no Adam optimizer is created or stepped in this cell, so the
# "ADAM" group plotted below still holds the same weights as `emb` — confirm intent.
visualize_normalized_gradients(
    emb.weight.detach().numpy(),
    labels=labels, gradients=-grad1, 
    embeddings_after=[emb_SGD.weight.detach().numpy(), emb_ADAM.weight.detach().numpy()],
    labels_after= ['SGD', 'ADAM'],
    word_to_display=labels,
    visual_length=1
)

In [None]:
# Clear the gradients left on both models by the previous step. backward()
# accumulates, so without this grad_SGD / grad_ADAM would be the sum of the
# old and the new gradient, and the SGD step below would use that sum.
opti_SGD.zero_grad()
emb_ADAM.zero_grad()

loss_SGD = -(F.logsigmoid(emb_SGD.weight[encoder['queen']].dot(emb_SGD.weight[encoder["royalty"]])))
loss_ADAM = -(F.logsigmoid(emb_ADAM.weight[encoder['queen']].dot(emb_ADAM.weight[encoder["royalty"]])))

# Snapshots of the weights before this update, used as the "before" state in the plots.
emb_SGD1 = deepcopy(emb_SGD)
emb_ADAM1 = deepcopy(emb_ADAM)

loss_SGD.backward(retain_graph=None)
grad_SGD = emb_SGD.weight.grad.clone().numpy()
loss_ADAM.backward(retain_graph=None)
grad_ADAM = emb_ADAM.weight.grad.clone().numpy()

opti_SGD.step()

In [None]:
# SGD: weights before the second step (emb_SGD1), the descent direction and the weights after it.
visualize_normalized_gradients(
    emb_SGD1.weight.detach().numpy(),
    labels=labels, gradients=-grad_SGD, 
    embeddings_after=[emb_SGD.weight.detach().numpy()], 
    labels_after= ['SGD'], word_to_display=["queen", "royalty", "woman"],
    visual_length=2.6
)

# Same plot for the ADAM copy; no word filter, so every vector is shown.
visualize_normalized_gradients(
    emb_ADAM1.weight.detach().numpy(),
    labels=labels, gradients=-grad_ADAM, 
    embeddings_after=[emb_ADAM.weight.detach().numpy()], 
    labels_after= ['ADAM'],
    visual_length=2.6
)

In [None]:
# Stability check of the reference pairs on the SGD-updated table.
# Inner subscripts use single quotes so the f-strings also parse before Python 3.12.
print("Similar pair :")
print(f"Loss for man and king : {F.logsigmoid(emb_SGD.weight[encoder['man']].dot(emb_SGD.weight[encoder['king']]))} (dot product {emb_SGD.weight[encoder['man']].dot(emb_SGD.weight[encoder['king']])})")
print(f"Loss for king and royalty : {F.logsigmoid(emb_SGD.weight[encoder['king']].dot(emb_SGD.weight[encoder['royalty']]))} (dot product {emb_SGD.weight[encoder['king']].dot(emb_SGD.weight[encoder['royalty']])})")
print("Dissimilar Pair -v * u")
print(f"Loss for man and woman : {F.logsigmoid(emb_SGD.weight[encoder['woman']].dot(-emb_SGD.weight[encoder['man']]))} (dot product {emb_SGD.weight[encoder['woman']].dot(-emb_SGD.weight[encoder['man']])})")
print(f"Loss for king and woman : {F.logsigmoid(emb_SGD.weight[encoder['woman']].dot(-emb_SGD.weight[encoder['king']]))} (dot product {emb_SGD.weight[encoder['woman']].dot(-emb_SGD.weight[encoder['king']])})")
print("Neutral pair objectif no loss but dot product :")
# Neutral pairs: print the plain dot product (it was previously printed with its sign flipped).
print(f"Loss for man and royalty : dot product {emb_SGD.weight[encoder['man']].dot(emb_SGD.weight[encoder['royalty']])}")
print(f"Loss for woman and royalty : dot product {emb_SGD.weight[encoder['woman']].dot(emb_SGD.weight[encoder['royalty']])}")

In [None]:
# The minus sign applies to the queen vector. It was previously applied to the
# row index, which silently selected another row of the table.
print(f"Loss for queen and king (Dissimilar) : {F.logsigmoid(-emb_SGD.weight[encoder['queen']].dot(emb_SGD.weight[encoder['king']]))} (dot product {(-emb_SGD.weight[encoder['queen']]).dot(emb_SGD.weight[encoder['king']])})")
print(f"Loss for queen and woman (similar) : {F.logsigmoid(emb_SGD.weight[encoder['queen']].dot(emb_SGD.weight[encoder['woman']]))} (dot product {emb_SGD.weight[encoder['queen']].dot(emb_SGD.weight[encoder['woman']])})")

# How to use intonation in embeddings?
The main idea is that the caregiver emphasizes the words that are important to “understand.” Word2Vec relies on seeing a very large number of words in order to find the best vector space. With a very large corpus, it is possible to identify words that have a “deep” meaning using different statistical techniques, and we hope that this also carries over to W2V. But children don't need a lot of data to acquire language.
So there is (maybe) a way to know which words we should build a deep meaning for, and to guess which ones are only useful for syntax. I suppose that intonation in the voice can help build an interesting space more quickly, by boosting the error correction on the words that are important.

In [None]:
# Toy training samples: (target word, context word, list of negative words, weight).
# The weight is the per-sample factor applied to the target word's gradient in the cells below.
pairs = [
    ("veryImportantWord", "stopWord", ["veryImportantWord2"], 1.9),
    ("veryImportantWord2", "stopWord", ["veryImportantWord"], 1.9),
    ("stopWord", "stopWord2", ["veryImportantWord"], 0.1),
    ("veryImportantWord", "adjectifNiceToDefineOtherWord", ["veryImportantWord2"], 1),
    # NOTE(review): in this sample the negative word is the target word itself — confirm intent.
    ("veryImportantWord2", "adjectifNiceToDefineOtherWord2", ["veryImportantWord2"], 1),
    ("adjectifNiceToDefineOtherWord", "stopWord", ["veryImportantWord"], 1),
    ("adjectifNiceToDefineOtherWord", "stopWord2", ["veryImportantWord2"], 1),
    ("veryImportantWord", "stopWord", ["veryImportantWord2"], 1.9),
    ("adjectifNiceToDefineOtherWord", "stopWord2", ["veryImportantWord"], 1),
    # NOTE(review): same here, the negative word equals the target word.
    ("veryImportantWord2", "stopWord2", ["veryImportantWord2"], 1.9),
    ("adjectifNiceToDefineOtherWord", "stopWord", ["veryImportantWord2"], 1),
    ("veryImportantWord2", "stopWord", ["veryImportantWord"], 1.9),
]

In [None]:
# Six words placed on the positive and negative ends of the three axes.
embedding = nn.Embedding(6, 3)
# Word -> row index in `embedding`.
word_to_idx = {
    "veryImportantWord": 0,
    "stopWord": 1,
    "stopWord2": 2,
    "adjectifNiceToDefineOtherWord": 3,
    "adjectifNiceToDefineOtherWord2": 4,
    "veryImportantWord2": 5,
}
# Rows are overwritten in place, outside of autograd.
with torch.no_grad():
    embedding.weight[0] = torch.tensor([1, 0, 0], dtype=float, requires_grad=True)
    embedding.weight[1] = torch.tensor([0, 1, 0], dtype=float, requires_grad=True)
    embedding.weight[2] = torch.tensor([0, 0, 1], dtype=float, requires_grad=True)
    embedding.weight[3] = torch.tensor([-1, 0, 0], dtype=float, requires_grad=True)
    embedding.weight[4] = torch.tensor([0, -1, 0], dtype=float, requires_grad=True)
    embedding.weight[5] = torch.tensor([0, 0, -1], dtype=float, requires_grad=True)

# Alternative: random unit-norm initialisation.
# with torch.no_grad():
#     random_weights = torch.randn(6, 3)
#     normalized_weights = F.normalize(random_weights, p=2, dim=1)
#     embedding.weight.data.copy_(normalized_weights)
    
    
optimizer = optim.SGD(embedding.parameters(), lr=0.1)

# History of the weights; index 0 is the initial placement.
all_emb = [deepcopy(embedding.weight.detach().numpy())]

In [None]:
# Initial placement of the six words; word_to_display=None shows every vector.
visualize_normalized_gradients(
    embeddings=all_emb[0], 
    embeddings_after=[],
    labels=list(word_to_idx.keys()),
    word_to_display=None,
    gradients=None, labels_after=None
)

In [None]:
# Unweighted skip-gram negative-sampling loss for the first sample of `pairs`.
embedding.zero_grad()

wordTarget, wordContext, listOfNeg, weight = pairs[0]
# From here on the three names hold embedding vectors instead of words.
wordTarget = embedding.weight[word_to_idx[wordTarget]]
wordContext = embedding.weight[word_to_idx[wordContext]]
listOfNeg = embedding(torch.tensor([word_to_idx[word_con] for word_con in listOfNeg]))

print(f"Word target {wordTarget}")
print(f"word context {wordContext}")
print(f"list of neg {listOfNeg}")
print(f"Weight {weight}")

# J = -log sigmoid(u_c . v_w) - sum_i log sigmoid(-u_ni . v_w)
# The sign goes inside the sigmoid for the negatives: -logsigmoid(x) is not
# logsigmoid(-x), and only the latter matches the formula.
pos_score = F.logsigmoid(wordTarget.dot(wordContext))
neg_score = F.logsigmoid(-(listOfNeg @ wordTarget)).sum()

print(f"pos score : {pos_score}")
print(f"neg score : {neg_score}")

loss = -(pos_score + neg_score)
# loss = -(pos_score + neg_score) * weight

print(f"loss : {loss}")
loss.backward()
print(f"grad : {embedding.weight.grad}")
   

In [None]:
# Same loss for the first sample of `pairs`; afterwards the gradient of the
# target word's row is scaled by the sample weight.
embedding.zero_grad()

wordTarget, wordContext, listOfNeg, weight = pairs[0]
wordTargetTensor = embedding.weight[word_to_idx[wordTarget]]
wordContextTensor = embedding.weight[word_to_idx[wordContext]]
listOfNegTensor = embedding(torch.tensor([word_to_idx[word_con] for word_con in listOfNeg]))

print(f"Word target {wordTargetTensor}")
print(f"word context {wordContextTensor}")
print(f"list of neg {listOfNegTensor}")
print(f"Weight {weight}")

# J = -log sigmoid(u_c . v_w) - sum_i log sigmoid(-u_ni . v_w)
# The sign goes inside the sigmoid for the negatives: -logsigmoid(x) is not
# logsigmoid(-x), and only the latter matches the formula.
pos_score = F.logsigmoid(wordTargetTensor.dot(wordContextTensor))
neg_score = F.logsigmoid(-(listOfNegTensor @ wordTargetTensor)).sum()

print(f"pos score : {pos_score}")
print(f"neg score : {neg_score}")

loss = -(pos_score + neg_score)
# loss = -(pos_score + neg_score) * weight

print(f"loss : {loss}")
loss.backward()
print(f"grad : {embedding.weight.grad}")
# Only the row of the target word is rescaled; context and negative rows keep their gradient.
embedding.weight.grad[word_to_idx[wordTarget]] *= weight

print(f"grad : {embedding.weight.grad}")

In [None]:
class OnlyOneEmbWeighted(nn.Module):
    def __init__(self, emb_size:int, embedding_dimension:int=15, 
                init_range:float|None=None, sparse:bool=True, device="cpu"):
        super().__init__()
        self.emb_size:int = emb_size
        self.emb_dim:int = embedding_dimension
        self.word_emb:nn.Embedding = nn.Embedding(num_embeddings=self.emb_size, 
                                                embedding_dim=self.emb_dim, device=device, sparse=sparse)

        if init_range is None:
            init_range = 0.5 / self.emb_dim
        self.word_emb.weight.data.uniform_(-init_range, init_range)

    def forward(self, centrals_words:torch.Tensor, pos_context:torch.Tensor,
                neg_context:torch.Tensor, weights:torch.Tensor):
        words_emb:torch.Tensor = self.word_emb(centrals_words) # [B, D]
        context_emb:torch.Tensor = self.word_emb(pos_context) # [B, D]
        neg_emb:torch.Tensor = self.word_emb(neg_context) # [B, K, D]
        
        weights = weights.view(-1, 1)
        
        def weight_hook(grad):
            return grad * weights
            
        words_emb.register_hook(weight_hook)

        pos_score = torch.sum(words_emb * context_emb, dim=1)
        pos_loss = F.logsigmoid(pos_score)

        neg_score = torch.bmm(neg_emb, words_emb.unsqueeze(-1)).squeeze(2)
        neg_loss = F.logsigmoid(-neg_score).sum(1)

        loss = -(pos_loss + neg_loss).mean()
        return loss

In [None]:
# The class defined above is OnlyOneEmbWeighted; `OnlyOneEmb` does not exist in this notebook.
model_test = OnlyOneEmbWeighted(emb_size=len(word_to_idx), embedding_dimension=3)
optimizer = optim.SGD(model_test.parameters(), lr=0.1)

# Same axis-aligned placement as the `embedding` table: one row per word of word_to_idx.
with torch.no_grad():
    model_test.word_emb.weight.copy_(torch.tensor([
        [1.0, 0.0, 0.0],
        [0.0, 1.0, 0.0],
        [0.0, 0.0, 1.0],
        [-1.0, 0.0, 0.0],
        [0.0, -1.0, 0.0],
        [0.0, 0.0, -1.0],
    ]))

In [None]:
# Shape of a lookup of four indices in the embedding table.
print(model_test.word_emb(torch.tensor([0, 1,2,3])).size())

In [None]:
# Simulation of batch
# Turn every sample of `pairs` into index lists, then into one tensor per model input.
wordTargetIdx = []
wordContextIdx = []
listOfNegTensor = []
weightsTensor = []

for wT, wC, wN, wW in pairs:
    wordTargetIdx.append(word_to_idx[wT])
    wordContextIdx.append(word_to_idx[wC])
    listOfNegTensor.append([word_to_idx[word_con] for word_con in wN])
    weightsTensor.append(wW)

batchWordTarget = torch.tensor(wordTargetIdx)
batchWordContext = torch.tensor(wordContextIdx)
# Every sample has the same number of negatives, so the nested list forms a rectangular tensor.
batchWordNeg = torch.tensor(listOfNegTensor)
batchWeights = torch.tensor(weightsTensor)


# print(f"Word target {batchWordTarget}, {model_test.word_emb(batchWordTarget)}")
# print(f"word context {batchWordContext}")
# print(f"list of neg {batchWordNeg.size()}")
# print(f"Weight {batchWeights}")

# One forward/backward pass over the whole batch; no optimizer step is taken here.
optimizer.zero_grad()
loss = model_test(batchWordTarget, batchWordContext, batchWordNeg, batchWeights)
loss.backward()

In [None]:
# Same computation one sample at a time (batch of size 1).
model_test.word_emb.zero_grad()
for wordTarget, wordContext, listOfNeg, weights in pairs :
    wordTargetIdx = torch.tensor([word_to_idx[wordTarget]])
    wordContextIdx = torch.tensor([word_to_idx[wordContext]])
    # Extra pair of brackets: the model expects one row of negatives per sample.
    listOfNegTensor = torch.tensor([[word_to_idx[word_con] for word_con in listOfNeg]])
    weightsTensor = torch.tensor([weights])
    
    print(f"Word target {wordTargetIdx}, {model_test.word_emb(wordTargetIdx)}")
    print(f"word context {wordContextIdx}")
    print(f"list of neg {listOfNegTensor}")
    print(f"Weight {weightsTensor}")
    
    optimizer.zero_grad()
    loss = model_test(wordTargetIdx, wordContextIdx, listOfNegTensor, weightsTensor)
    loss.backward()
    # Only the first sample is processed; the loop stops here.
    break
    
    # loss.backward()
    # optimizer.step()