In [1]:
# -*- coding: utf-8 -*-
"""
Réseau de neurones récurrent, modèle de langue par mot, pour paroles de chansons
Version avec couche vectorisation de mots et RNN 
"""

import torch
torch.manual_seed(0) # Pour résultats reproductibles
from torch import nn
import pandas as pd
from collections import Counter

class DatasetParoles(torch.utils.data.Dataset):
    """ Créer un Dataset avec les paroles de la colonne Lyric du fichier 
    taille_sequence : taille d'une séquence de mots pour le modèle de langue
    Le texte est découpé en séquences de la taille taille_sequence
    """
    def __init__(self, nom_fichier, taille_sequence=4):
        self.nom_fichier = nom_fichier
        self.taille_sequence = taille_sequence
        self.mots = self.charger_mots()
        self.mots_uniques = self.chercher_mots_uniques()

        self.index_a_mot = {index: mot for index, mot in enumerate(self.mots_uniques)}
        self.mot_a_index = {mot: index for index, mot in enumerate(self.mots_uniques)}

        self.mots_indexes = [self.mot_a_index[w] for w in self.mots]

    def charger_mots(self):
        dataframe_entrainement = pd.read_csv(self.nom_fichier)
        texte_concatene = dataframe_entrainement.iloc[0:100]['Lyric'].str.cat(sep=' ')
        return texte_concatene.split(' ')

    def chercher_mots_uniques(self):
        frequence_mot = Counter(self.mots)
        return sorted(frequence_mot, key=frequence_mot.get, reverse=True)

    def __len__(self):
        return len(self.mots_indexes) - self.taille_sequence

    def __getitem__(self, index):
        return (
            torch.tensor(self.mots_indexes[index:index+self.taille_sequence]),
            torch.tensor(self.mots_indexes[index+1:index+self.taille_sequence+1]),
        )



In [3]:
ds_paroles = DatasetParoles("lyrics-data.csv",taille_sequence=6)
print(ds_paroles[0])

(tensor([1588,    3,    1,    7,   26,  225]), tensor([   3,    1,    7,   26,  225, 1589]))


In [28]:
class Modele(nn.Module):
    """Modèle de RNR avec une couche vectorisation, suivie d'une couche RNN et d'une couche linéaire"""
    def __init__(self, ds_paroles):
        super(Modele, self).__init__()
        self.taille_H_RNN = 128
        self.taille_vectorisation_mots = 64
        self.nombre_couches_RNR = 2

        taille_vocabulaire = len(ds_paroles.mots_uniques)
        self.vectorisation_mots = nn.Embedding(num_embeddings=taille_vocabulaire,
            embedding_dim=self.taille_vectorisation_mots)
        self.rnn = nn.RNN(input_size=self.taille_vectorisation_mots,hidden_size=self.taille_H_RNN,
            num_layers=self.nombre_couches_RNR,dropout=0.2,batch_first=True)
        self.fc = nn.Linear(self.taille_H_RNN, taille_vocabulaire)

    def forward(self, lot_X, etat_0):
        vectorisation = self.vectorisation_mots(lot_X)
        lot_Ht, etat = self.rnn(vectorisation, etat_0)
        lot_Yt = self.fc(lot_Ht)

        return lot_Yt, etat

    def initializer_etat(self, taille_sequence):
        return (torch.zeros(self.nombre_couches_RNR, taille_sequence, self.taille_H_RNN))
    
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader

def entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=10, taille_sequence=6):
    modele.train()
    dl_paroles = DataLoader(ds_paroles,batch_size=taille_lot)

    fonction_cout = nn.CrossEntropyLoss()
    optimizeur = optim.Adam(modele.parameters(), lr=0.001)

    for epoch in range(epochs):

        for lot, (lot_X, lot_Y) in enumerate(dl_paroles):

            optimizeur.zero_grad()
            etat = modele.initializer_etat(lot_X.shape[0])
            lot_Y_predictions, etat = modele(lot_X, etat)
            cout = fonction_cout(lot_Y_predictions.transpose(1, 2), lot_Y)
            
            cout.backward()
            optimizeur.step()
            if lot%10 == 0:
                print(f'-------- > epoch {epoch} lot {lot} :  coût = {cout.item()}')

def predire(ds, modele, debut_texte, nb_mots=20):
    """ Prédire une suite de nb_mots à partir de debut_texte selon le modele"""
    mots = debut_texte.split(' ')
    modele.eval()
    etat = modele.initializer_etat(1)

    for i in range(0, nb_mots):
        lot_X = torch.tensor([[ds.mot_a_index[m] for m in mots[i:]]])
        lot_Y_predictions, etat = modele(lot_X, etat)
        dernier_mot_Yt = lot_Y_predictions[0][-1]
        probs_dernier_mot = torch.nn.functional.softmax(dernier_mot_Yt, dim=0).detach().numpy()
        index_mot_choisi = np.random.choice(len(dernier_mot_Yt), p=probs_dernier_mot)
        mots.append(ds.index_a_mot[index_mot_choisi])

    return mots

ds_paroles = DatasetParoles("ColdPlay.csv", taille_sequence=6)
modele = Modele(ds_paroles)

entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=5, taille_sequence=6)
print(predire(ds_paroles, modele, debut_texte='you know'))

-------- > epoch 0 lot 0 :  coût = 7.6551690101623535
-------- > epoch 0 lot 10 :  coût = 7.510778427124023
-------- > epoch 0 lot 20 :  coût = 7.000609874725342
-------- > epoch 0 lot 30 :  coût = 6.249145030975342
-------- > epoch 0 lot 40 :  coût = 6.797878265380859
-------- > epoch 0 lot 50 :  coût = 6.162596225738525
-------- > epoch 0 lot 60 :  coût = 6.240902423858643
-------- > epoch 0 lot 70 :  coût = 5.520714282989502
-------- > epoch 0 lot 80 :  coût = 6.088428020477295
-------- > epoch 0 lot 90 :  coût = 5.626760005950928
-------- > epoch 0 lot 100 :  coût = 6.3954644203186035
-------- > epoch 0 lot 110 :  coût = 6.560697078704834
-------- > epoch 0 lot 120 :  coût = 5.489595413208008
-------- > epoch 0 lot 130 :  coût = 5.921864986419678
-------- > epoch 0 lot 140 :  coût = 5.709108352661133
-------- > epoch 0 lot 150 :  coût = 8.206951141357422
-------- > epoch 0 lot 160 :  coût = 7.392282962799072
-------- > epoch 0 lot 170 :  coût = 6.257867336273193
-------- > epoch 0 

-------- > epoch 2 lot 260 :  coût = 4.519418239593506
-------- > epoch 2 lot 270 :  coût = 4.713341236114502
-------- > epoch 2 lot 280 :  coût = 4.937442302703857
-------- > epoch 2 lot 290 :  coût = 5.499549865722656
-------- > epoch 2 lot 300 :  coût = 4.506556034088135
-------- > epoch 2 lot 310 :  coût = 4.757735729217529
-------- > epoch 2 lot 320 :  coût = 5.512849807739258
-------- > epoch 2 lot 330 :  coût = 4.396300792694092
-------- > epoch 2 lot 340 :  coût = 4.324193000793457
-------- > epoch 2 lot 350 :  coût = 4.632694721221924
-------- > epoch 2 lot 360 :  coût = 5.568561553955078
-------- > epoch 2 lot 370 :  coût = 5.697877883911133
-------- > epoch 2 lot 380 :  coût = 5.20526647567749
-------- > epoch 2 lot 390 :  coût = 4.980088233947754
-------- > epoch 2 lot 400 :  coût = 5.121632099151611
-------- > epoch 2 lot 410 :  coût = 5.251952648162842
-------- > epoch 2 lot 420 :  coût = 3.957341194152832
-------- > epoch 2 lot 430 :  coût = 4.451236248016357
-------- > 

-------- > epoch 4 lot 520 :  coût = 3.8704893589019775
-------- > epoch 4 lot 530 :  coût = 3.3905506134033203
-------- > epoch 4 lot 540 :  coût = 3.04787278175354
-------- > epoch 4 lot 550 :  coût = 3.6801671981811523
-------- > epoch 4 lot 560 :  coût = 4.641909599304199
-------- > epoch 4 lot 570 :  coût = 4.579174995422363
-------- > epoch 4 lot 580 :  coût = 2.9572792053222656
-------- > epoch 4 lot 590 :  coût = 4.501807689666748
-------- > epoch 4 lot 600 :  coût = 4.818921089172363
-------- > epoch 4 lot 610 :  coût = 5.041574001312256
['you', 'know', 'the', 'promise', 'of', 'stars', "we've", 'no', 'myself', 'meet', 'the', 'bubble', 'and', 'written', 'poison', 'frey', 'the', 'fireplace', 'what', 'if', 'mmm', 'god']


In [9]:
import pandas as pd
dataframe_entrainement = pd.read_csv('ColdPlay.csv')
print(dataframe_entrainement.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  344 non-null    object 
 1   Artist      339 non-null    object 
 2   Title       339 non-null    object 
 3   Album       262 non-null    object 
 4   Year        240 non-null    float64
 5   Date        240 non-null    object 
 6   Lyric       328 non-null    object 
dtypes: float64(1), object(6)
memory usage: 18.9+ KB
None


In [20]:
texte_concatene = dataframe_entrainement.iloc[0:2]['Lyric'].str.cat(sep=' ')
print(texte_concatene)

come up to meet you tell you i'm sorry you don't know how lovely you are i had to find you tell you i need you tell you i set you apart tell me your secrets and ask me your questions oh let's go back to the start running in circles coming up tails heads on a science apart   nobody said it was easy it's such a shame for us to part nobody said it was easy no one ever said it would be this hard oh take me back to the start   i was just guessing at numbers and figures pulling the puzzles apart questions of science science and progress do not speak as loud as my heart tell me you love me come back and haunt me oh and i rush to the start running in circles chasing our tails coming back as we are   nobody said it was easy oh it's such a shame for us to part nobody said it was easy no one ever said it would be so hard i'm going back to the start   ohooh oohoohoohooh ahooh oohoohoohooh ohooh oohoohoohooh ohooh oohoohoohooh chris martin i used to rule the world seas would rise when i gave the wo

In [11]:
print(dataframe_entrainement.iloc[0:2]['Lyric'])

0    come up to meet you tell you i'm sorry you don...
1    chris martin i used to rule the world seas wou...
Name: Lyric, dtype: object


In [21]:
print(texte_concatene.split(' '))

['come', 'up', 'to', 'meet', 'you', 'tell', 'you', "i'm", 'sorry', 'you', "don't", 'know', 'how', 'lovely', 'you', 'are', 'i', 'had', 'to', 'find', 'you', 'tell', 'you', 'i', 'need', 'you', 'tell', 'you', 'i', 'set', 'you', 'apart', 'tell', 'me', 'your', 'secrets', 'and', 'ask', 'me', 'your', 'questions', 'oh', "let's", 'go', 'back', 'to', 'the', 'start', 'running', 'in', 'circles', 'coming', 'up', 'tails', 'heads', 'on', 'a', 'science', 'apart', '', '', 'nobody', 'said', 'it', 'was', 'easy', "it's", 'such', 'a', 'shame', 'for', 'us', 'to', 'part', 'nobody', 'said', 'it', 'was', 'easy', 'no', 'one', 'ever', 'said', 'it', 'would', 'be', 'this', 'hard', 'oh', 'take', 'me', 'back', 'to', 'the', 'start', '', '', 'i', 'was', 'just', 'guessing', 'at', 'numbers', 'and', 'figures', 'pulling', 'the', 'puzzles', 'apart', 'questions', 'of', 'science', 'science', 'and', 'progress', 'do', 'not', 'speak', 'as', 'loud', 'as', 'my', 'heart', 'tell', 'me', 'you', 'love', 'me', 'come', 'back', 'and', 'h