In [26]:
# -*- coding: utf-8 -*-
"""
Réseau de neurones récurrent, modèle de langue par mot, pour paroles de chansons
Version avec couche vectorisation de mots et RNN 
"""

import torch
torch.manual_seed(0) # Pour résultats reproductibles
from torch import nn
import pandas as pd
from collections import Counter

class DatasetParoles(torch.utils.data.Dataset):
    """Word-level language-model dataset built from song lyrics.

    The text comes from the ``Lyric`` column of the CSV file
    https://www.kaggle.com/neisse/scrapped-lyrics-from-6-genres?select=lyrics-data.csv
    The lyrics of the first ``nb_chansons`` rows are joined with single
    spaces, split on single spaces, and every distinct token is mapped to an
    integer index (the most frequent token gets index 0).

    Each item is a pair ``(X, Y)`` of 1-D index tensors of length
    ``taille_sequence``; ``Y`` is ``X`` shifted one word to the right, i.e.
    the next-word targets for a language model.

    Args:
        taille_sequence: number of words in each training window.
        chemin_csv: path of the CSV file to read (must contain a ``Lyric``
            column).
        nb_chansons: number of leading rows of the file to use; ``None``
            reads every row.

    Attributes (set by ``__init__``):
        mots: list of all tokens, in text order.
        mots_uniques: distinct tokens sorted by decreasing frequency.
        index_a_mot / mot_a_index: index <-> token lookup dictionaries.
        mots_indexes: ``mots`` translated to indices.
    """
    def __init__(self, taille_sequence=4, chemin_csv='lyrics-data.csv', nb_chansons=100):
        self.taille_sequence = taille_sequence
        self.chemin_csv = chemin_csv
        self.nb_chansons = nb_chansons
        self.mots = self.charger_mots()
        self.mots_uniques = self.chercher_mots_uniques()

        self.index_a_mot = dict(enumerate(self.mots_uniques))
        self.mot_a_index = {mot: index for index, mot in self.index_a_mot.items()}

        self.mots_indexes = [self.mot_a_index[w] for w in self.mots]

    def charger_mots(self):
        """Read the lyrics and return them as one flat list of tokens."""
        # nrows stops parsing after the rows that are actually used instead of
        # loading the whole file and slicing it afterwards.
        dataframe_entrainement = pd.read_csv(self.chemin_csv, nrows=self.nb_chansons)
        # str.cat skips missing lyrics (NaN) when no na_rep is given.
        texte_concatene = dataframe_entrainement['Lyric'].str.cat(sep=' ')
        # Splitting on a single space is kept on purpose so the vocabulary is
        # unchanged: consecutive spaces yield empty-string tokens and other
        # whitespace (e.g. newlines) stays inside the tokens.
        return texte_concatene.split(' ')

    def chercher_mots_uniques(self):
        """Return the distinct tokens, most frequent first."""
        frequence_mot = Counter(self.mots)
        return sorted(frequence_mot, key=frequence_mot.get, reverse=True)

    def __len__(self):
        # One window per start position that still has a following word to
        # predict. Clamped at 0: a text shorter than the window would give a
        # negative value, which makes len() raise.
        return max(0, len(self.mots_indexes) - self.taille_sequence)

    def __getitem__(self, index):
        """Return ``(X, Y)`` for the window starting at ``index``."""
        fin = index + self.taille_sequence
        return (
            torch.tensor(self.mots_indexes[index:fin]),
            torch.tensor(self.mots_indexes[index + 1:fin + 1]),
        )



In [27]:
# Build the dataset with 6-word windows and show its first (X, Y) pair:
# Y is X shifted one word to the right.
ds_paroles = DatasetParoles(taille_sequence=6)
premier_exemple = ds_paroles[0]
print(premier_exemple)

(tensor([  3,  43, 156,  37,   0, 282]), tensor([ 43, 156,  37,   0, 282, 169]))


In [28]:
class Modele(nn.Module):
    """Recurrent language model: embedding -> multi-layer RNN -> linear layer.

    Args:
        ds_paroles: object exposing ``mots_uniques``; its length is the
            vocabulary size (number of embeddings and of output logits).
        taille_H_RNN: size of the RNN hidden state.
        taille_vectorisation_mots: dimension of the word embeddings.
        nombre_couches_RNR: number of stacked RNN layers.
        dropout: dropout probability between RNN layers. It is forced to 0
            for a single layer, where nn.RNN has no place to apply it and
            would only emit a warning.
    """
    def __init__(self, ds_paroles, taille_H_RNN=128, taille_vectorisation_mots=64,
                 nombre_couches_RNR=2, dropout=0.2):
        super().__init__()
        self.taille_H_RNN = taille_H_RNN
        self.taille_vectorisation_mots = taille_vectorisation_mots
        self.nombre_couches_RNR = nombre_couches_RNR

        taille_vocabulaire = len(ds_paroles.mots_uniques)
        self.vectorisation_mots = nn.Embedding(num_embeddings=taille_vocabulaire,
            embedding_dim=self.taille_vectorisation_mots)
        self.rnn = nn.RNN(input_size=self.taille_vectorisation_mots,hidden_size=self.taille_H_RNN,
            num_layers=self.nombre_couches_RNR,
            dropout=dropout if self.nombre_couches_RNR > 1 else 0.0,
            batch_first=True)
        self.fc = nn.Linear(self.taille_H_RNN, taille_vocabulaire)

    def forward(self, lot_X, etat_0=None):
        """Compute next-word logits for a batch of index sequences.

        Args:
            lot_X: integer tensor of word indices, shape (batch, sequence)
                since the RNN is built with ``batch_first=True``.
            etat_0: initial hidden state, shape
                (nombre_couches_RNR, batch, taille_H_RNN); ``None`` lets
                nn.RNN start from zeros.

        Returns:
            ``(lot_Yt, etat)``: logits of shape (batch, sequence, vocabulary)
            and the final hidden state returned by the RNN.
        """
        vectorisation = self.vectorisation_mots(lot_X)
        lot_Ht, etat = self.rnn(vectorisation, etat_0)
        lot_Yt = self.fc(lot_Ht)

        return lot_Yt, etat

    def initializer_etat(self, taille_sequence):
        """Return an all-zero initial hidden state.

        Despite its name, ``taille_sequence`` is used as the second dimension
        of the state, which nn.RNN interprets as the batch size. The name is
        kept so existing keyword calls keep working.
        """
        # Follow the parameters' device and dtype so the state stays usable
        # after the model has been moved (.to/.cuda) or cast (.double/.half).
        reference = self.fc.weight
        return torch.zeros(self.nombre_couches_RNR, taille_sequence, self.taille_H_RNN,
                           device=reference.device, dtype=reference.dtype)
    
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader

def entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=10, taille_sequence=6):
    """Train ``modele`` on ``ds_paroles`` with Adam and a cross-entropy loss.

    The dataset is read in order (no shuffling) in batches of ``taille_lot``.
    A fresh state is requested from ``modele.initializer_etat`` for every
    batch, and the loss of every 10th batch is printed.

    Args:
        ds_paroles: dataset yielding ``(X, Y)`` index-tensor pairs.
        modele: module called as ``modele(lot_X, etat)`` and returning
            ``(predictions, etat)``; it must provide ``initializer_etat``.
        taille_lot: batch size given to the DataLoader.
        epochs: number of passes over the dataset.
        taille_sequence: accepted but not used by this function.

    Returns:
        None. The model is updated in place and left in training mode.
    """
    modele.train()
    chargeur = DataLoader(ds_paroles, batch_size=taille_lot)

    critere = nn.CrossEntropyLoss()
    adam = optim.Adam(modele.parameters(), lr=0.001)

    for epoch in range(epochs):
        for numero_lot, paire in enumerate(chargeur):
            entrees, cibles = paire

            adam.zero_grad()
            # The last batch may be smaller, so size the state from the batch itself.
            etat_initial = modele.initializer_etat(entrees.shape[0])
            sorties, _ = modele(entrees, etat_initial)
            # CrossEntropyLoss wants the class dimension second, so swap the
            # last two axes of the predictions before comparing with targets.
            cout = critere(torch.transpose(sorties, 1, 2), cibles)

            cout.backward()
            adam.step()
            if not numero_lot % 10:
                print(f'-------- > epoch {epoch} lot {numero_lot} :  coût = {cout.item()}')

def predire(ds, modele, debut_texte, nb_mots=20):
    """Generate ``nb_mots`` words continuing ``debut_texte`` with ``modele``.

    The prompt is split on single spaces. Each new word is sampled from the
    softmax distribution of the logits predicted for the last position.

    Args:
        ds: object exposing ``mot_a_index`` (word -> index) and
            ``index_a_mot`` (index -> word).
        modele: model called as ``modele(lot_X, etat)`` and returning
            ``(logits, etat)``; it must provide ``eval`` and
            ``initializer_etat``. It is left in evaluation mode.
        debut_texte: prompt; every word must belong to the vocabulary.
        nb_mots: number of words to generate.

    Returns:
        The list of prompt words followed by the generated words.

    Raises:
        KeyError: if a prompt word is not in ``ds.mot_a_index``.
    """
    mots = debut_texte.split(' ')
    inconnus = [m for m in mots if m not in ds.mot_a_index]
    if inconnus:
        raise KeyError(f'Mots absents du vocabulaire : {inconnus}')

    modele.eval()
    etat = modele.initializer_etat(1)

    # The state is carried from one step to the next, so each word must go
    # through the model exactly once: the whole prompt on the first step, then
    # only the newly generated word. Re-feeding a sliding window on top of the
    # carried state would make the model see the same words twice.
    entree = list(mots)
    with torch.no_grad():  # inference only, no autograd graph needed
        for _ in range(nb_mots):
            lot_X = torch.tensor([[ds.mot_a_index[m] for m in entree]])
            lot_Y_predictions, etat = modele(lot_X, etat)
            dernier_mot_Yt = lot_Y_predictions[0][-1]
            # float64 + renormalisation: float32 probabilities can miss the
            # "sum to 1" tolerance that np.random.choice enforces.
            probs_dernier_mot = torch.nn.functional.softmax(
                dernier_mot_Yt.detach().double(), dim=0).numpy()
            probs_dernier_mot = probs_dernier_mot / probs_dernier_mot.sum()
            index_mot_choisi = int(np.random.choice(len(probs_dernier_mot), p=probs_dernier_mot))
            mot_choisi = ds.index_a_mot[index_mot_choisi]
            mots.append(mot_choisi)
            entree = [mot_choisi]

    return mots

# End-to-end run: build the dataset and the model, train for 5 epochs, then
# sample a continuation of the prompt 'There was'.
TAILLE_SEQUENCE = 6

ds_paroles = DatasetParoles(taille_sequence=TAILLE_SEQUENCE)
modele = Modele(ds_paroles)

entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=5, taille_sequence=TAILLE_SEQUENCE)

texte_genere = predire(ds_paroles, modele, debut_texte='There was')
print(texte_genere)

-------- > epoch 0 lot 0 :  coût = 8.468780517578125
-------- > epoch 0 lot 10 :  coût = 8.410581588745117
-------- > epoch 0 lot 20 :  coût = 8.255254745483398
-------- > epoch 0 lot 30 :  coût = 7.961673259735107
-------- > epoch 0 lot 40 :  coût = 6.762079238891602
-------- > epoch 0 lot 50 :  coût = 8.112771987915039
-------- > epoch 0 lot 60 :  coût = 7.3384881019592285
-------- > epoch 0 lot 70 :  coût = 7.892124652862549
-------- > epoch 0 lot 80 :  coût = 7.5355753898620605
-------- > epoch 0 lot 90 :  coût = 7.656606197357178
-------- > epoch 0 lot 100 :  coût = 7.703867435455322
-------- > epoch 0 lot 110 :  coût = 7.927034854888916
-------- > epoch 0 lot 120 :  coût = 7.2673468589782715
-------- > epoch 0 lot 130 :  coût = 7.287990093231201
-------- > epoch 0 lot 140 :  coût = 7.883811950683594
-------- > epoch 0 lot 150 :  coût = 6.367599964141846
-------- > epoch 0 lot 160 :  coût = 5.854033946990967
-------- > epoch 0 lot 170 :  coût = 8.066438674926758
-------- > epoch 0

-------- > epoch 2 lot 280 :  coût = 6.680131435394287
-------- > epoch 2 lot 290 :  coût = 4.900870323181152
-------- > epoch 2 lot 300 :  coût = 5.959470272064209
-------- > epoch 2 lot 310 :  coût = 5.931654453277588
-------- > epoch 2 lot 320 :  coût = 5.499532699584961
-------- > epoch 2 lot 330 :  coût = 5.5902791023254395
-------- > epoch 2 lot 340 :  coût = 4.9119791984558105
-------- > epoch 2 lot 350 :  coût = 5.8251051902771
-------- > epoch 2 lot 360 :  coût = 6.189371585845947
-------- > epoch 2 lot 370 :  coût = 4.9279093742370605
-------- > epoch 2 lot 380 :  coût = 5.948924541473389
-------- > epoch 2 lot 390 :  coût = 4.328136920928955
-------- > epoch 2 lot 400 :  coût = 5.568192958831787
-------- > epoch 2 lot 410 :  coût = 4.754176616668701
-------- > epoch 2 lot 420 :  coût = 4.141364097595215
-------- > epoch 2 lot 430 :  coût = 5.4257073402404785
-------- > epoch 2 lot 440 :  coût = 4.927351474761963
-------- > epoch 2 lot 450 :  coût = 6.1730122566223145
-------

-------- > epoch 4 lot 550 :  coût = 5.481354236602783
-------- > epoch 4 lot 560 :  coût = 4.946446418762207
-------- > epoch 4 lot 570 :  coût = 5.284590244293213
-------- > epoch 4 lot 580 :  coût = 3.502293825149536
-------- > epoch 4 lot 590 :  coût = 4.459530353546143
-------- > epoch 4 lot 600 :  coût = 2.636174440383911
['There', 'was', 'no', 'stranger..', 'of', 'watching.', 'are', 'series', 'faces', 'harmony', 'my', 'mind', 'on', 'the', 'corps.', 'strive.', 'cross.', 'try.', 'want..', 'No', 'nearer,', 'half']


In [13]:
import pandas as pd
# Load the full lyrics file and summarise its columns.
dataframe_entrainement = pd.read_csv('lyrics-data.csv')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dataframe_entrainement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209522 entries, 0 to 209521
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   ALink   209522 non-null  object
 1   SName   209522 non-null  object
 2   SLink   209522 non-null  object
 3   Lyric   209484 non-null  object
 4   Idiom   206375 non-null  object
dtypes: object(5)
memory usage: 8.0+ MB
None


In [23]:
# Join the lyrics of the first two songs with a single space and display them.
# The former bare `texte_concatene.split(' ')` statement was dropped: its
# result was neither stored nor displayed, so it had no effect.
texte_concatene = dataframe_entrainement.iloc[0:2]['Lyric'].str.cat(sep=' ')
print(texte_concatene)

I could feel at the time. There was no way of knowing. Fallen leaves in the night. Who can say where they're blowing. As free as the wind. Hopefully learning. Why the sea on the tide. Has no way of turning. More than this. You know there's nothing. More than this. Tell me one thing. More than this. You know there's nothing. It was fun for a while. There was no way of knowing. Like a dream in the night. Who can say where we're going. No care in the world. Maybe I'm learning. Why the sea on the tide. Has no way of turning. More than this. You know there's nothing. More than this. Tell me one thing. More than this. You know there's nothing. More than this. You know there's nothing. More than this. Tell me one thing. More than this. There's nothing. Take me now, baby, here as I am. Hold me close, and try and understand. Desire is hunger is the fire I breathe. Love is a banquet on which we feed. Come on now, try and understand. The way I feel under your command. Take my hand, as the sun des

In [24]:
# Show the 'Lyric' entries of the first two rows (pandas truncates long text).
print(dataframe_entrainement['Lyric'].iloc[0:2])

0    I could feel at the time. There was no way of ...
1    Take me now, baby, here as I am. Hold me close...
Name: Lyric, dtype: object


In [20]:
# Same two lyrics, joined into one string with a single space between songs.
deux_premieres_paroles = dataframe_entrainement['Lyric'].iloc[0:2]
print(deux_premieres_paroles.str.cat(sep=' '))

I could feel at the time. There was no way of knowing. Fallen leaves in the night. Who can say where they're blowing. As free as the wind. Hopefully learning. Why the sea on the tide. Has no way of turning. More than this. You know there's nothing. More than this. Tell me one thing. More than this. You know there's nothing. It was fun for a while. There was no way of knowing. Like a dream in the night. Who can say where we're going. No care in the world. Maybe I'm learning. Why the sea on the tide. Has no way of turning. More than this. You know there's nothing. More than this. Tell me one thing. More than this. You know there's nothing. More than this. You know there's nothing. More than this. Tell me one thing. More than this. There's nothing. Take me now, baby, here as I am. Hold me close, and try and understand. Desire is hunger is the fire I breathe. Love is a banquet on which we feed. Come on now, try and understand. The way I feel under your command. Take my hand, as the sun des