### Implementation of an RNN

In [4]:
import numpy as np

class RNN:
    """Minimal vanilla recurrent network implemented with NumPy.

    At every time step the hidden state is
    ``h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h)`` and the output is
    ``y_t = sigmoid(W_hy h_t + b_y)``. Inputs, hidden states and outputs are
    handled as column vectors of shape ``(size, 1)``.
    """

    def __init__(self, taille_entree, taille_cachee, taille_sortie):
        """Create the parameters.

        Args:
            taille_entree: Size of each input vector.
            taille_cachee: Size of the hidden state.
            taille_sortie: Size of each output vector.
        """
        self.taille_entree = taille_entree
        self.taille_cachee = taille_cachee
        self.taille_sortie = taille_sortie

        # Weight initialisation: small Gaussian values
        self.W_xh = np.random.randn(taille_cachee, taille_entree) * 0.01
        self.W_hh = np.random.randn(taille_cachee, taille_cachee) * 0.01
        self.W_hy = np.random.randn(taille_sortie, taille_cachee) * 0.01

        # Bias initialisation
        self.b_h = np.zeros((taille_cachee, 1))
        self.b_y = np.zeros((taille_sortie, 1))

    def sigmoid(self, x):
        """Return the element-wise logistic function of ``x``."""
        # tanh form of 1 / (1 + exp(-x)): mathematically identical, but it
        # never evaluates exp() on a large positive argument, so it cannot
        # overflow for very negative inputs.
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def etape_rnn(self, x, h_prev):
        """Run one time step and return ``(h, y)``.

        Args:
            x: Input column vector of shape ``(taille_entree, 1)``.
            h_prev: Previous hidden state of shape ``(taille_cachee, 1)``.
        """
        # Hidden state
        h = np.tanh(np.dot(self.W_xh, x) + np.dot(self.W_hh, h_prev) + self.b_h)
        # Output
        y = self.sigmoid(np.dot(self.W_hy, h) + self.b_y)
        return h, y

    def _derouler(self, X):
        """Unroll the network over ``X``, keeping every hidden state.

        Returns:
            ``(etats, sorties)`` where ``etats[0]`` is the initial zero state,
            ``etats[t + 1]`` is the hidden state after step ``t``, and
            ``sorties[t]`` is the output of step ``t``.
        """
        etats = [np.zeros((self.taille_cachee, 1))]
        sorties = []
        for x in X:
            h, y = self.etape_rnn(x, etats[-1])
            etats.append(h)
            sorties.append(y)
        return etats, sorties

    def propagation_avant(self, X):
        """Run the forward pass over the sequence ``X``.

        Args:
            X: Sequence of input column vectors, one per time step.

        Returns:
            Array stacking the outputs of all time steps.
        """
        _, sorties = self._derouler(X)
        return np.array(sorties)

    def propagation_arriere(self, X, Y, sorties, taux_apprentissage):
        """Back-propagate through time and apply one gradient-descent step.

        The output error is ``sorties[t] - Y[t]``, which is the gradient of
        the summed binary cross-entropy with respect to the pre-sigmoid
        activations.

        Args:
            X: Sequence of input column vectors.
            Y: Sequence of target column vectors, same layout as ``sorties``.
            sorties: Outputs returned by ``propagation_avant(X)`` with the
                current parameters.
            taux_apprentissage: Learning rate of the update.
        """
        dW_xh = np.zeros_like(self.W_xh)
        dW_hh = np.zeros_like(self.W_hh)
        dW_hy = np.zeros_like(self.W_hy)
        db_h = np.zeros_like(self.b_h)
        db_y = np.zeros_like(self.b_y)

        # The gradients need the hidden state of every time step. The public
        # signature does not carry them, so the forward pass is replayed here.
        etats, _ = self._derouler(X)
        dh_next = np.zeros((self.taille_cachee, 1))

        for t in reversed(range(len(X))):
            h = etats[t + 1]   # state produced at step t
            h_prev = etats[t]  # state fed into step t

            dy = sorties[t] - Y[t]
            dW_hy += np.dot(dy, h.T)
            db_y += dy

            # Gradient reaching h_t: from the output at t and from step t + 1
            dh = np.dot(self.W_hy.T, dy) + dh_next
            dh_raw = (1 - h ** 2) * dh  # through the tanh non-linearity
            db_h += dh_raw
            dW_xh += np.dot(dh_raw, X[t].T)
            dW_hh += np.dot(dh_raw, h_prev.T)

            dh_next = np.dot(self.W_hh.T, dh_raw)

        # Update weights and biases
        self.W_xh -= taux_apprentissage * dW_xh
        self.W_hh -= taux_apprentissage * dW_hh
        self.W_hy -= taux_apprentissage * dW_hy
        self.b_h -= taux_apprentissage * db_h
        self.b_y -= taux_apprentissage * db_y

    def entrainer(self, X, Y, epochs, taux_apprentissage):
        """Train on a single sequence for ``epochs`` iterations.

        Every 100 epochs the mean squared error between the outputs (computed
        before that epoch's update) and ``Y`` is printed as a progress metric.
        """
        for epoch in range(epochs):
            sorties = self.propagation_avant(X)
            self.propagation_arriere(X, Y, sorties, taux_apprentissage)

            if epoch % 100 == 0:
                perte = np.mean(np.square(sorties - Y))
                print(f"Époque {epoch}, Perte: {perte}")

# Usage example: build a small RNN and run a single forward pass
taille_entree, taille_cachee, taille_sortie = 3, 5, 2
rnn = RNN(
    taille_entree=taille_entree,
    taille_cachee=taille_cachee,
    taille_sortie=taille_sortie,
)

# Input data: a sequence of 4 column vectors of size taille_entree
X = np.random.randn(4, taille_entree, 1)

# Forward pass over the whole sequence, then show the outputs
sorties = rnn.propagation_avant(X)
print("Sorties:", sorties)

Sorties: [[[0.49991064]
  [0.49997851]]

 [[0.50008478]
  [0.50000366]]

 [[0.50001331]
  [0.50002984]]

 [[0.4998113 ]
  [0.50012126]]]


### Implementation of a Transformer

In [15]:
import numpy as np

class Transformer:
    """Stack of Transformer layers between a token embedding and a projection.

    Token indices are embedded, positional encodings are added, the result
    goes through ``nb_couches`` layers and is finally projected onto the
    vocabulary. With ``is_decodeur=True`` a causal mask is handed to every
    layer.
    """

    def __init__(self, taille_vocab, dim_modele, nb_tetes, dim_ff, nb_couches, is_decodeur=False):
        """Create the randomly initialised model.

        Args:
            taille_vocab: Number of rows of the embedding table and size of
                the last output dimension.
            dim_modele: Width of the embeddings and of every layer.
            nb_tetes: Number of attention heads per layer.
            dim_ff: Hidden width of the feed-forward sub-layers.
            nb_couches: Number of stacked layers.
            is_decodeur: When true, ``forward`` builds a causal mask and
                passes it to each layer.
        """
        self.taille_vocab = taille_vocab
        self.dim_modele = dim_modele
        self.nb_tetes = nb_tetes
        self.dim_ff = dim_ff
        self.nb_couches = nb_couches
        self.is_decodeur = is_decodeur

        # Weight initialisation
        self.embedding = np.random.randn(taille_vocab, dim_modele)
        self.couches = [TransformerCouche(dim_modele, nb_tetes, dim_ff) for _ in range(nb_couches)]
        self.fc = np.random.randn(dim_modele, taille_vocab)
        self.encodage_position = EncodagePosition(dim_modele)

    def creer_masque_causal(self, size):
        """Return a ``(size, size)`` boolean mask, True strictly above the diagonal.

        True marks a (query, key) pair that must not be attended to, i.e. a
        key located after the query position.
        """
        return np.triu(np.ones((size, size), dtype=bool), k=1)

    def forward(self, x):
        """Compute the output scores for the token indices ``x``.

        Args:
            x: Integer array of token indices; the usage example passes shape
                ``(batch, seq_len)``.

        Returns:
            Array whose last dimension has size ``taille_vocab``.
        """
        x = self.embedding[x]
        x = self.encodage_position.forward(x)
        masque = None
        if self.is_decodeur:
            # Axis 1 of the embedded input is the sequence axis
            masque = self.creer_masque_causal(x.shape[1])
        for couche in self.couches:
            x = couche.forward(x, masque)
        return np.dot(x, self.fc)
    
    
    

class TransformerCouche:
    """One Transformer layer: attention then feed-forward, each followed by a
    residual addition and a layer normalisation."""

    def __init__(self, dim_modele, nb_tetes, dim_ff):
        """Build the two sub-layers and their normalisations."""
        # Sub-layers are created in this order so the random draws used for
        # their weights happen in the same sequence.
        self.attention = AttentionMultiTetes(dim_modele, nb_tetes)
        self.ff = FeedForward(dim_modele, dim_ff)
        self.norm1, self.norm2 = LayerNorm(dim_modele), LayerNorm(dim_modele)

    def forward(self, x, masque=None):
        """Apply the layer to ``x``; ``masque`` is forwarded to the attention."""
        # Attention sub-layer: residual connection, then normalisation
        apres_attention = self.norm1.forward(x + self.attention.forward(x, masque))
        # Feed-forward sub-layer: residual connection, then normalisation
        somme_residuelle = apres_attention + self.ff.forward(apres_attention)
        return self.norm2.forward(somme_residuelle)

class AttentionMultiTetes:
    """Multi-head scaled dot-product self-attention with random weights."""

    def __init__(self, dim_modele, nb_tetes):
        """Create the four ``(dim_modele, dim_modele)`` projection matrices.

        Args:
            dim_modele: Width of the input and output vectors.
            nb_tetes: Number of heads; must divide ``dim_modele``.

        Raises:
            ValueError: If ``dim_modele`` is not a multiple of ``nb_tetes``.
        """
        if dim_modele % nb_tetes != 0:
            # Otherwise the per-head reshape in forward() fails later with a
            # much less explicit message.
            raise ValueError(
                f"dim_modele ({dim_modele}) must be divisible by nb_tetes ({nb_tetes})"
            )
        self.nb_tetes = nb_tetes
        self.dim_tete = dim_modele // nb_tetes
        self.dim_modele = dim_modele
        self.wq = np.random.randn(dim_modele, dim_modele)
        self.wk = np.random.randn(dim_modele, dim_modele)
        self.wv = np.random.randn(dim_modele, dim_modele)
        self.wo = np.random.randn(dim_modele, dim_modele)

    def creer_masque_causal(self, size):
        """Return a ``(size, size)`` boolean mask, True strictly above the diagonal."""
        return np.triu(np.ones((size, size)), k=1).astype(bool)

    def forward(self, x, masque=None):
        """Apply self-attention to ``x``.

        Args:
            x: Array of shape ``(batch_size, seq_len, dim_modele)``.
            masque: Optional boolean array broadcastable to
                ``(batch_size, nb_tetes, seq_len, seq_len)``; True marks the
                (query, key) pairs that must be ignored.

        Returns:
            Array of shape ``(batch_size, seq_len, dim_modele)``.
        """
        batch_size, seq_len, _ = x.shape

        q = np.dot(x, self.wq).reshape(batch_size, seq_len, self.nb_tetes, self.dim_tete)
        k = np.dot(x, self.wk).reshape(batch_size, seq_len, self.nb_tetes, self.dim_tete)
        v = np.dot(x, self.wv).reshape(batch_size, seq_len, self.nb_tetes, self.dim_tete)

        q = q.transpose(0, 2, 1, 3)  # (batch_size, nb_tetes, seq_len, dim_tete)
        k = k.transpose(0, 2, 1, 3)
        v = v.transpose(0, 2, 1, 3)

        scores = np.matmul(q, k.transpose(0, 1, 3, 2)) / np.sqrt(self.dim_tete)
        if masque is not None:
            # A very negative score gets a softmax weight of exactly 0.
            # A finite value (rather than -inf) keeps a fully masked row free
            # of NaN. np.where broadcasts a (seq_len, seq_len) mask over the
            # batch and head axes.
            scores = np.where(np.asarray(masque, dtype=bool), -1e9, scores)

        attention = self.softmax(scores)
        out = np.matmul(attention, v)
        # Put the heads back side by side: (batch_size, seq_len, dim_modele)
        out = out.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, self.dim_modele)
        return np.dot(out, self.wo)

    def softmax(self, x):
        """Return the softmax of ``x`` along its last axis."""
        # Subtracting the row maximum avoids overflow in exp()
        exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=-1, keepdims=True)

class FeedForward:
    """Two-matrix feed-forward block with a ReLU in between (no biases)."""

    def __init__(self, dim_modele, dim_ff):
        """Draw the expansion matrix ``w1`` and the projection matrix ``w2``."""
        forme_expansion = (dim_modele, dim_ff)
        self.w1 = np.random.randn(*forme_expansion)
        self.w2 = np.random.randn(*forme_expansion[::-1])

    def relu(self, x):
        """Return ``x`` with its negative entries replaced by zero."""
        return np.maximum(0, x)

    def forward(self, x):
        """Expand with ``w1``, apply ReLU, then project back with ``w2``."""
        pre_activation = np.dot(x, self.w1)
        activation = self.relu(pre_activation)
        return np.dot(activation, self.w2)

class LayerNorm:
    """Normalisation over the last axis with a per-feature scale and shift."""

    def __init__(self, dim):
        """Start with scale ``gamma`` at ones and shift ``beta`` at zeros."""
        self.gamma, self.beta = np.ones(dim), np.zeros(dim)

    def forward(self, x):
        """Return ``x`` standardised along its last axis, then scaled and shifted."""
        moyenne = np.mean(x, axis=-1, keepdims=True)
        variance = np.var(x, axis=-1, keepdims=True)
        # The small constant keeps the division defined when the variance is 0
        ecart_type = np.sqrt(variance + 1e-8)
        mis_a_echelle = self.gamma * (x - moyenne) / ecart_type
        return mis_a_echelle + self.beta
    
class EncodagePosition:
    """Sinusoidal positional encoding added to a batch of embedded sequences."""

    def __init__(self, dim_modele, max_len=5000):
        """Precompute the encoding table for positions ``0 .. max_len - 1``.

        Args:
            dim_modele: Width of the vectors the encoding is added to.
            max_len: Longest sequence length supported by ``forward``.
        """
        self.encodage = self.creer_encodage_position(max_len, dim_modele)

    def creer_encodage_position(self, max_len, dim_modele):
        """Build the table, shape ``(1, max_len, dim_modele)``.

        Even columns hold sines and odd columns hold cosines of the position
        multiplied by a per-column frequency.
        """
        encodage = np.zeros((max_len, dim_modele))
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, dim_modele, 2) * -(np.log(10000.0) / dim_modele))
        encodage[:, 0::2] = np.sin(position * div_term)
        # With an odd dim_modele there is one fewer odd column than even
        # columns, so the frequency vector is trimmed to match.
        encodage[:, 1::2] = np.cos(position * div_term[: dim_modele // 2])
        return encodage[np.newaxis, :, :]

    def forward(self, x):
        """Return ``x`` plus the encoding of its positions.

        Args:
            x: Array of shape ``(batch, seq_len, dim_modele)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the precomputed ``max_len``.
        """
        seq_len = x.shape[1]
        max_len = self.encodage.shape[1]
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        return x + self.encodage[:, :seq_len, :]

# Usage example: hyper-parameters of a small encoder-style Transformer
taille_vocab, dim_modele = 1000, 256
nb_tetes, dim_ff, nb_couches = 8, 512, 6

transformer = Transformer(
    taille_vocab=taille_vocab,
    dim_modele=dim_modele,
    nb_tetes=nb_tetes,
    dim_ff=dim_ff,
    nb_couches=nb_couches,
)

# Input: a batch of 2 sequences of 10 random token indices
x = np.random.randint(low=0, high=taille_vocab, size=(2, 10))

# Forward pass, then report the shape of the result
sortie = transformer.forward(x)
print("Forme de la sortie:", sortie.shape)

Forme de la sortie: (2, 10, 1000)
