<a href="https://colab.research.google.com/github/Wahiba275/CBOW-Skip-gram-from-scratch/blob/main/CBOW_%26_Skip_gram_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Small French sample corpus; it is passed to tokenize_corpus in a later cell.
# NOTE(review): "'hier" in the first sentence looks like a typo for "d'hier" — confirm
# before changing it, because the recorded tokenizer output reflects the current spelling.
corpus1 =["Le match 'hier était incroyable" , "C'était un match mémorable" , "La récente allocution du premier ministre"]

# ***CBOW***

In [None]:
import numpy as np
def tokenize_corpus(corpus):
    """Split every sentence on whitespace and build the vocabulary lookups.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A tuple ``(tokenized_corpus, word_to_index, index_to_word)``:
        ``tokenized_corpus`` is a list with one token list per sentence, and
        the two dicts map each distinct token to / from an integer id that is
        assigned in order of first appearance.
    """
    tokenized_corpus = []
    word_to_index = {}
    index_to_word = {}
    for sentence in corpus:
        words = sentence.split()
        tokenized_corpus.append(words)
        for token in words:
            if token in word_to_index:
                continue
            # The next free id is simply the current vocabulary size.
            next_id = len(word_to_index)
            word_to_index[token] = next_id
            index_to_word[next_id] = token
    return tokenized_corpus, word_to_index, index_to_word

In [None]:
# Tokenize the sample corpus and print the id -> word lookup that was built.
tokenized_corpus, word_to_index, index_to_word=tokenize_corpus(corpus1)
print(index_to_word)

{0: 'Le', 1: 'match', 2: "'hier", 3: 'était', 4: 'incroyable', 5: "C'était", 6: 'un', 7: 'mémorable', 8: 'La', 9: 'récente', 10: 'allocution', 11: 'du', 12: 'premier', 13: 'ministre'}


In [None]:
def generate_training_data(corpus, window_size, V, word_to_index):
    """Build CBOW samples: the neighbouring words and a one-hot target.

    Args:
        corpus: Tokenized sentences (each one an indexable sequence of words).
        window_size: Number of neighbours taken on each side of the target.
        V: Length of the one-hot target vectors.
        word_to_index: Mapping from a word to its position in the one-hot vector.

    Returns:
        A list of ``(context, target)`` tuples, one per word occurrence.
        ``context`` is the list of neighbouring words (empty when the sentence
        has a single word) and ``target`` is a length-``V`` one-hot array.
    """
    samples = []
    for words in corpus:
        n_words = len(words)
        for position, center in enumerate(words):
            # Clamp the window to the sentence boundaries.
            left = max(0, position - window_size)
            right = min(n_words, position + window_size + 1)

            neighbours = []
            for j in range(left, right):
                if j != position:
                    neighbours.append(words[j])

            one_hot = np.zeros(V)
            one_hot[word_to_index[center]] = 1
            samples.append((neighbours, one_hot))
    return samples

In [None]:
def initialize_weights(V, N):
    """Draw the two weight matrices uniformly from [0, 1).

    Args:
        V: First dimension of the context matrix / second of the target matrix.
        N: Second dimension of the context matrix / first of the target matrix.

    Returns:
        ``(context_weights, target_weights)`` with shapes ``(V, N)`` and ``(N, V)``.
    """
    # The generator is consumed left to right, so the (V, N) matrix is drawn first.
    shapes = ((V, N), (N, V))
    context_weights, target_weights = (np.random.rand(*shape) for shape in shapes)
    return context_weights, target_weights

def softmax(x):
    """Exponentiate ``x`` and normalise along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that
    large inputs do not overflow.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    normaliser = exponentials.sum(axis=0)
    return exponentials / normaliser


In [None]:
import matplotlib.pyplot as plt

def train_CBOW(corpus, V, N, epochs, learning_rate, window_size=1):
    """Train CBOW embeddings with a full softmax and plain SGD.

    For every word occurrence the embeddings of its neighbours are averaged,
    projected through ``target_weights`` and passed to ``softmax`` to predict
    the centre word. Both weight matrices are updated after each sample.

    Args:
        corpus: List of sentence strings (tokenized with ``tokenize_corpus``).
        V: Vocabulary size. It must be at least the number of distinct words
            in ``corpus``, otherwise indexing the weight matrices or the
            one-hot targets raises ``IndexError``.
        N: Dimension of the word vectors.
        epochs: Number of passes over the training samples.
        learning_rate: SGD step size.
        window_size: Number of context words taken on each side of the
            target (defaults to 1, the value that used to be hard-coded).

    Returns:
        ``(context_weights, word_to_index, losses)`` where ``context_weights``
        is the ``(V, N)`` embedding matrix, ``word_to_index`` maps words to
        their row, and ``losses`` holds the summed cross-entropy of each epoch.
    """
    tokenized_corpus, word_to_index, _ = tokenize_corpus(corpus)
    context_weights, target_weights = initialize_weights(V, N)

    # The samples depend only on the corpus, so build them once instead of per epoch.
    training_data = generate_training_data(
        tokenized_corpus, window_size=window_size, V=V, word_to_index=word_to_index
    )

    losses = []  # List to store loss values for each epoch

    for epoch in range(epochs):
        loss = 0

        for context, target in training_data:
            if not context:
                # Single-word sentence: there is nothing to average, and
                # np.mean of an empty list would inject nan into the weights.
                continue

            context_indices = [word_to_index[word] for word in context]
            context_vector = np.mean(context_weights[context_indices], axis=0)
            predicted_target = softmax(np.dot(context_vector, target_weights))

            # Loss: cross-entropy against the one-hot target. Reading the target
            # entry directly avoids 0 * log(0) = nan for non-target entries.
            loss += -np.log(predicted_target[np.argmax(target)])

            # Weight update
            error = predicted_target - target
            # Gradient w.r.t. the averaged context vector, computed with the
            # output weights as they were during the forward pass.
            hidden_gradient = np.dot(target_weights, error)
            target_weights -= learning_rate * np.outer(context_vector, error)
            # The context vector is a mean, so each context word receives an
            # equal share of the hidden-layer gradient.
            step = learning_rate * hidden_gradient / len(context_indices)
            for index in context_indices:
                context_weights[index] -= step

        print(f'Epoch: {epoch+1}, Loss: {loss}')
        losses.append(loss)
    return context_weights, word_to_index, losses

# Example usage with a small corpus
corpus = [
    "trouver bonne assurance",
    "contrat satisfaisant",
    "changement contrat assurance"
]
# Vocabulary size, derived from the corpus with the same whitespace split that
# tokenize_corpus applies, so it cannot drift out of sync when the corpus changes.
V = len({word for sentence in corpus for word in sentence.split()})
N = 6  # Dimension of the word vectors
epochs = 1
learning_rate = 0.01

context_weights, word_to_index , losses  = train_CBOW(corpus, V, N, epochs, learning_rate)

Epoch: 1, Loss: 14.959915613863211


In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        v1: First vector.
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``, or ``0.0`` when either vector has
        zero norm.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # A zero vector has no direction; dividing would yield nan and a
        # RuntimeWarning, so report "no similarity" instead.
        return 0.0
    return np.dot(v1, v2) / norm_product

# Example words whose CBOW embeddings are compared below
word1 = "trouver"
word2 = "assurance"
word3 = "contrat"

# Look up each word's row in the context (input) weight matrix returned by train_CBOW
vector1 = context_weights[word_to_index[word1]]
vector2 = context_weights[word_to_index[word2]]
vector3 = context_weights[word_to_index[word3]]

# Compute cosine similarities of word1 against the two other words
similarity_1_2 = cosine_similarity(vector1, vector2)
similarity_1_3 = cosine_similarity(vector1, vector3)

print(f"Similarity between '{word1}' and '{word2}': {similarity_1_2}")
print(f"Similarity between '{word1}' and '{word3}': {similarity_1_3}")


Similarity between 'trouver' and 'assurance': 0.4084681655856083
Similarity between 'trouver' and 'contrat': 0.7995828034394701


In [None]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
# Sample corpus
corpus2 = [
    "trouver bonne assurance",
    "contrat satisfaisant",
    "changement contrat assurance"
]

# Tokenize each sentence of the corpus into words
tokenized_corpus = [word_tokenize(sentence) for sentence in corpus2]

# Build and train the CBOW model (sg=0). Passing `sentences` to the constructor
# already builds the vocabulary and trains the model, so the epoch count is set
# here rather than calling train() a second time on an already-trained model.
model_cbow = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=0, min_count=1, epochs=100)

# The trained model can now be queried, e.g. for the similarity between words.
similarity_score = model_cbow.wv.similarity('trouver', 'assurance')
print(f"Cosine similarity between 'trouver' and 'assurance' - CBOW : {similarity_score}")
similarity_score2 = model_cbow.wv.similarity('trouver', 'contrat')
print(f"Cosine similarity between 'trouver' and 'contrat' - CBOW : {similarity_score2}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Cosine similarity between 'trouver' and 'assurance' - CBOW : 0.009510169737040997
Cosine similarity between 'trouver' and 'contrat' - CBOW : -0.059844862669706345


# ***Skip-gram***

In [None]:
import numpy as np
from collections import defaultdict
import random

def preprocess_text(corpus):
    """Lower-case each sentence and split it on whitespace.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list with one list of lower-cased tokens per sentence.
    """
    tokens = []
    for sentence in corpus:
        lowered = sentence.lower()
        tokens.append(lowered.split())
    return tokens

def generate_skipgram_pairs(tokens, window_size=2):
    """Create the (target word, context word) pairs used for skip-gram training.

    Args:
        tokens: List of tokenized sentences (lists of words).
        window_size: Maximum number of neighbours taken on each side of the target.

    Returns:
        A list of ``(target_word, context_word)`` tuples, in sentence order,
        with left neighbours listed before right neighbours.
    """
    pairs = []
    for sentence in tokens:
        for i, target_word in enumerate(sentence):
            left = sentence[max(0, i - window_size):i]
            right = sentence[i + 1:i + window_size + 1]
            pairs.extend((target_word, context_word) for context_word in left + right)
    return pairs

def word_to_onehot(word, word2index):
    """Return the one-hot encoding of ``word``.

    Args:
        word: Word to encode; must be a key of ``word2index``.
        word2index: Mapping from word to its position in the vector.

    Returns:
        A float array of length ``len(word2index)`` that is 1 at the word's
        position and 0 everywhere else.
    """
    vector = np.zeros(len(word2index))
    position = word2index[word]
    vector[position] = 1
    return vector

def train_skipgram(tokens, embedding_size=100, window_size=2, epochs=100, learning_rate=0.01):
    """Train skip-gram embeddings with a sigmoid output layer and plain SGD.

    For every (target, context) pair the target word's embedding is projected
    through ``W_output``, squashed with a sigmoid and compared with the one-hot
    encoding of the *context* word; both weight matrices are then updated.

    Args:
        tokens: List of tokenized sentences (lists of words).
        embedding_size: Dimension of the word vectors.
        window_size: Context window passed to ``generate_skipgram_pairs``.
        epochs: Number of passes over the training pairs.
        learning_rate: SGD step size.

    Returns:
        ``(W_input, word2index, index2word)`` where ``W_input`` is the
        ``(vocab_size, embedding_size)`` embedding matrix and the two dicts map
        words to / from their row index.
    """
    # dict.fromkeys keeps first-occurrence order, so word ids are reproducible
    # from run to run (the iteration order of a set of strings is not).
    vocabulary = list(dict.fromkeys(word for sentence in tokens for word in sentence))
    word2index = {word: i for i, word in enumerate(vocabulary)}
    index2word = {i: word for word, i in word2index.items()}
    vocab_size = len(vocabulary)

    W_input = np.random.uniform(-1, 1, (vocab_size, embedding_size))
    W_output = np.random.uniform(-1, 1, (embedding_size, vocab_size))

    # The pairs depend only on the corpus, so build them once instead of per epoch.
    pairs = generate_skipgram_pairs(tokens, window_size)

    for epoch in range(epochs):
        total_loss = 0
        for target, context in pairs:
            target_index = word2index[target]
            # Skip-gram predicts the context word from the target word, so the
            # label is the one-hot of the context word, not of the target itself.
            context_onehot = word_to_onehot(context, word2index)
            hidden = W_input[target_index]
            y = np.dot(hidden, W_output)
            y_pred = 1 / (1 + np.exp(-y))

            # (y_pred - label) is the gradient of the sigmoid cross-entropy
            # w.r.t. the logits; the squared error is only reported for monitoring.
            error = y_pred - context_onehot
            total_loss += np.sum(error**2)
            # Back-propagate through the output weights used in the forward
            # pass, i.e. before they are modified below.
            hidden_gradient = np.dot(W_output, error)
            W_output -= learning_rate * np.outer(hidden, error)
            W_input[target_index] -= learning_rate * hidden_gradient
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss}")
    return W_input, word2index, index2word

# Test the skip-gram model
# Note: this rebinds `corpus1`, which an earlier cell assigned to a different sample corpus.
corpus1 = [
    "trouver bonne assurance",
    "contrat satisfaisant",
    "changement contrat assurance"
]

# Lower-case / whitespace-split the sentences, then train for 200 epochs
# (train_skipgram prints one loss line per epoch).
tokens = preprocess_text(corpus1)
embedding_size = 100
trained_embeddings, word2index, index2word = train_skipgram(tokens, embedding_size=embedding_size, epochs=200)

# Print the learned word vectors (row i of the embedding matrix belongs to index2word[i])
for i, word in index2word.items():
    print(f"Word: {word}, Vector: {trained_embeddings[i]}")


Epoch 1/200, Loss: 28.961256288313976
Epoch 2/200, Loss: 20.234556776496856
Epoch 3/200, Loss: 12.497496299224213
Epoch 4/200, Loss: 7.114988215133363
Epoch 5/200, Loss: 3.9837856054884337
Epoch 6/200, Loss: 2.2946752518503475
Epoch 7/200, Loss: 1.4220351329607692
Epoch 8/200, Loss: 0.9646900977593477
Epoch 9/200, Loss: 0.7014407926272845
Epoch 10/200, Loss: 0.5354332254296016
Epoch 11/200, Loss: 0.4233248216117466
Epoch 12/200, Loss: 0.3436963481348283
Epoch 13/200, Loss: 0.284928468973719
Epoch 14/200, Loss: 0.2402311410976525
Epoch 15/200, Loss: 0.2053981679592222
Epoch 16/200, Loss: 0.17770027766221516
Epoch 17/200, Loss: 0.15529856123962976
Epoch 18/200, Loss: 0.13691473027864964
Epoch 19/200, Loss: 0.12163649470832828
Epoch 20/200, Loss: 0.10879792000779016
Epoch 21/200, Loss: 0.09790329660578118
Epoch 22/200, Loss: 0.0885772407853157
Epoch 23/200, Loss: 0.08053113644348169
Epoch 24/200, Loss: 0.07354005095323607
Epoch 25/200, Loss: 0.06742653512043838
Epoch 26/200, Loss: 0.06204

In [None]:
# Lower-case and whitespace-split the sentences in `corpus`; the bare name on the
# last line makes the notebook display the result.
tokens=preprocess_text(corpus)
tokens

[['trouver', 'bonne', 'assurance'],
 ['contrat', 'satisfaisant'],
 ['changement', 'contrat', 'assurance']]

In [None]:
# Build the (target, context) pairs for a window of 2 words on each side and display them.
pairs= generate_skipgram_pairs(tokens,2)
pairs

[('trouver', 'bonne'),
 ('trouver', 'assurance'),
 ('bonne', 'trouver'),
 ('bonne', 'assurance'),
 ('assurance', 'trouver'),
 ('assurance', 'bonne'),
 ('contrat', 'satisfaisant'),
 ('satisfaisant', 'contrat'),
 ('changement', 'contrat'),
 ('changement', 'assurance'),
 ('contrat', 'changement'),
 ('contrat', 'assurance'),
 ('assurance', 'changement'),
 ('assurance', 'contrat')]

In [None]:
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two 1-D vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or ``0.0`` when either
        vector has zero norm.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero vector has no direction; dividing would yield nan and a
        # RuntimeWarning, so report "no similarity" instead.
        return 0.0
    return np.dot(vec1, vec2) / norm_product
def test_word_similarity(word1, word2, word_vectors, word2index):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        word1: First word; must be a key of ``word2index``.
        word2: Second word; must be a key of ``word2index``.
        word_vectors: Indexable collection of embeddings, addressed by word id.
        word2index: Mapping from word to its id in ``word_vectors``.

    Returns:
        Whatever ``cosine_similarity`` returns for the two looked-up vectors.
    """
    # The generator is consumed left to right, so word1 is looked up before word2.
    first, second = (word_vectors[word2index[w]] for w in (word1, word2))
    return cosine_similarity(first, second)
# Test the similarity of three word pairs using the embeddings returned by train_skipgram.
# `similarity_result` is overwritten after each print.
similarity_result = test_word_similarity("trouver", "assurance", trained_embeddings, word2index)
print(f"Similarity between 'trouver' and 'assurance': {similarity_result}")

similarity_result = test_word_similarity("contrat", "assurance", trained_embeddings, word2index)
print(f"Similarity between 'contrat' and 'assurance': {similarity_result}")

similarity_result = test_word_similarity("trouver", "contrat", trained_embeddings, word2index)
print(f"Similarity between 'trouver' and 'contrat': {similarity_result}")


Similarity between 'trouver' and 'assurance': -0.1157287408436277
Similarity between 'contrat' and 'assurance': -0.050230775882811815
Similarity between 'trouver' and 'contrat': -0.04861483230841686


In [None]:
!pip install gensim



In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Your corpus
corpus1 = [
    "trouver bonne assurance",
    "contrat satisfaisant",
    "changement contrat assurance"
]
# `nltk` itself is not imported in this cell; the call relies on the `import nltk`
# executed in an earlier cell.
nltk.download('stopwords')
# Tokenize and preprocess the corpus (lower-case, then NLTK word tokenization)
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus1]
# Remove French stopwords
stop_words = set(stopwords.words('french'))
tokenized_corpus = [[word for word in sentence if word not in stop_words] for sentence in tokenized_corpus]

# Set up and train the Skip-gram model (sg=1); min_count=1 keeps every word of this tiny corpus
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1)

# Save the trained model to a file (path is relative to the current working directory)
model.save("skip_gram_model")
# Print the learned word vectors
word_vectors = model.wv
for word in word_vectors.key_to_index:
    print(f"Word: {word}, Vector: {word_vectors[word]}")

# Calculate and print cosine similarity between all pairs of words in the vocabulary
vocab_words = list(word_vectors.key_to_index.keys())

# j starts at i + 1, so each unordered pair is visited exactly once.
for i in range(len(vocab_words)):
    for j in range(i + 1, len(vocab_words)):
        word1 = vocab_words[i]
        word2 = vocab_words[j]
        # `cosine_similarity` here is the scikit-learn function imported at the top of this
        # cell, which shadows the notebook's own helper of the same name. Each vector is
        # wrapped in a list and the result indexed with [0][0] — presumably because the
        # function works on 2-D inputs and returns a matrix; confirm against its docs.
        similarity_result = cosine_similarity([word_vectors[word1]], [word_vectors[word2]])[0][0]
        print(f"Cosine Similarity between '{word1}' and '{word2}': {similarity_result}")



Word: contrat, Vector: [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
