In [49]:
# -*- coding: utf-8 -*-
"""
Réseau de neurones récurrent, modèle de langue par mot, pour paroles de chansons
Fichier lyrics-data.csv de https://www.kaggle.com/datasets/deepshah16/song-lyrics-dataset
Version avec couche vectorisation de mots et RNN
"""

import re
from collections import Counter

import pandas as pd
import torch
from torch import nn

torch.manual_seed(0) # Pour résultats reproductibles

# Hyperparameters: number of words per training sequence and number of
# English lyrics kept from the CSV file.
taille_sequence = 8
nombre_textes = 100

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print('Entrainement sur ',device)

class DatasetParolesLyricsDataEn(torch.utils.data.Dataset):
    """Word-level language-model dataset built from English song lyrics.

    The 'Lyric' column of the rows whose 'language' is 'en' is read from a CSV
    file, the first `nombre_textes` lyrics are concatenated, punctuation is
    removed, the text is lowercased and split on whitespace. Each item is a
    pair (input, target) of index tensors of length `taille_sequence`, the
    target being the input shifted one word to the right.

    Parameters
    ----------
    nombre_textes : int
        Number of English lyrics kept (taken in file order).
    taille_sequence : int
        Number of words in each input/target sequence.
    nom_fichier : str
        Path of the CSV file; it must contain 'Lyric' and 'language' columns.

    Attributes
    ----------
    mots : list of str, the whole corpus as a flat word list.
    mots_uniques : list of str, vocabulary sorted by decreasing frequency.
    index_a_mot / mot_a_index : dicts mapping index -> word and word -> index.
    mots_indexes : list of int, the corpus encoded with mot_a_index.
    """
    def __init__(self, nombre_textes=100, taille_sequence=4, nom_fichier='lyrics-data.csv'):
        self.nombre_textes = nombre_textes
        self.taille_sequence = taille_sequence
        self.nom_fichier = nom_fichier
        self.mots = self.charger_mots()
        self.mots_uniques = self.chercher_mots_uniques()

        self.index_a_mot = dict(enumerate(self.mots_uniques))
        self.mot_a_index = {mot: index for index, mot in self.index_a_mot.items()}

        self.mots_indexes = [self.mot_a_index[mot] for mot in self.mots]

    def charger_mots(self):
        """Read the CSV file and return the cleaned corpus as a list of words."""
        dataframe_entrainement = pd.read_csv(self.nom_fichier)
        # Single .loc selection (rows + column) instead of chained indexing.
        paroles_en = dataframe_entrainement.loc[
            dataframe_entrainement['language'] == 'en', 'Lyric'
        ]
        texte_concatene = paroles_en.iloc[0:self.nombre_textes].str.cat(sep=' ')
        # Remove everything that is neither a word character nor whitespace,
        # so "I'm" becomes "im" and "dies," becomes "dies".
        return re.sub(r'[^\w\s]', '', texte_concatene).lower().split()

    def chercher_mots_uniques(self):
        """Return the distinct words of the corpus, most frequent first."""
        frequence_mot = Counter(self.mots)
        return sorted(frequence_mot, key=frequence_mot.get, reverse=True)

    def __len__(self):
        """Number of (input, target) pairs that fit entirely in the corpus."""
        # Clamp to 0: a corpus shorter than taille_sequence would otherwise
        # give a negative length, which makes len() raise ValueError.
        return max(0, len(self.mots_indexes) - self.taille_sequence)

    def __getitem__(self, index):
        """Return the pair (words[index:index+T], words[index+1:index+T+1]) as tensors."""
        fin = index + self.taille_sequence
        return (
            torch.tensor(self.mots_indexes[index:fin]),
            torch.tensor(self.mots_indexes[index + 1:fin + 1]),
        )
    
class Modele(nn.Module):
    """Word-level recurrent language model: embedding -> RNN -> linear layer.

    The vocabulary size is taken from len(ds_paroles.mots_uniques). The
    embedding has 64 dimensions and the single-layer RNN has a hidden state of
    size 128; the linear layer maps each hidden state to one score per
    vocabulary word.
    """
    def __init__(self, ds_paroles):
        super().__init__()
        self.taille_H_RNN = 128
        self.taille_vectorisation_mots = 64
        self.nombre_couches_RNR = 1

        taille_vocabulaire = len(ds_paroles.mots_uniques)
        self.vectorisation_mots = nn.Embedding(num_embeddings=taille_vocabulaire,
            embedding_dim=self.taille_vectorisation_mots)
        self.rnn = nn.RNN(input_size=self.taille_vectorisation_mots,hidden_size=self.taille_H_RNN,
            num_layers=self.nombre_couches_RNR,batch_first=True)
        self.dense_linaire = nn.Linear(self.taille_H_RNN, taille_vocabulaire)

    def forward(self, lot_X, etat_0):
        """Run one batch through the network.

        lot_X  : integer tensor of word indexes, (batch, sequence) since the
                 RNN is built with batch_first=True.
        etat_0 : initial hidden state, (layers, batch, hidden).

        Returns (lot_Yt, etat): the per-position vocabulary scores
        (batch, sequence, vocabulary) and the final hidden state.
        """
        vectorisation = self.vectorisation_mots(lot_X)
        lot_Ht, etat = self.rnn(vectorisation, etat_0)
        lot_Yt = self.dense_linaire(lot_Ht)
        return lot_Yt, etat

    def initializer_etat(self, taille_sequence):
        """Return an all-zero initial hidden state.

        NOTE: despite its name, `taille_sequence` is the size of the *batch*
        dimension of the state (callers pass the batch size); the name is kept
        so existing keyword calls keep working.

        The state is allocated with the device and dtype of the model's own
        weights, so it is directly usable after `.to(device)` or `.double()`.
        """
        reference = self.dense_linaire.weight
        return torch.zeros(
            self.nombre_couches_RNR, taille_sequence, self.taille_H_RNN,
            device=reference.device, dtype=reference.dtype,
        )

# Build the dataset from the configured number of lyrics and sequence length.
ds_paroles = DatasetParolesLyricsDataEn(
    nombre_textes=nombre_textes,
    taille_sequence=taille_sequence,
)
# Create the model and move it to the selected device (GPU when available).
modele = Modele(ds_paroles).to(device)
    
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
import time
def entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=5, taille_sequence=6):
    """Train `modele` on `ds_paroles` with Adam (lr=0.001) and cross-entropy.

    ds_paroles      : dataset yielding (input, target) index tensors.
    modele          : module exposing forward(lot_X, etat) -> (scores, etat)
                      and initializer_etat(batch_size).
    taille_lot      : batch size given to the DataLoader.
    epochs          : number of passes over the dataset.
    taille_sequence : unused; kept so existing calls remain valid.

    The hidden state is reset to zero for every batch. Progress (loss and
    elapsed time) is printed every 100 batches. Returns None.
    """
    debut = time.time()
    modele.train()
    # Use the device the model actually lives on rather than a module-level
    # global, so the function works for any model it is given.
    device_modele = next(modele.parameters()).device
    dl_paroles = DataLoader(ds_paroles, batch_size=taille_lot)

    fonction_cout = nn.CrossEntropyLoss()
    optimizeur = optim.Adam(modele.parameters(), lr=0.001)

    for epoch in range(epochs):
        for lot, (lot_X, lot_Y) in enumerate(dl_paroles):
            lot_X = lot_X.to(device_modele)
            lot_Y = lot_Y.to(device_modele)
            # The last batch may be smaller, hence the size read from lot_X.
            etat = modele.initializer_etat(lot_X.shape[0]).to(device_modele)
            optimizeur.zero_grad()

            lot_Y_predictions, _ = modele(lot_X, etat)
            # CrossEntropyLoss expects the class dimension second:
            # (batch, sequence, vocab) -> (batch, vocab, sequence).
            cout = fonction_cout(lot_Y_predictions.transpose(1, 2), lot_Y)

            cout.backward()
            optimizeur.step()
            if lot % 100 == 0:
                print(f'-------- > epoch {epoch} lot {lot} :  coût = {cout.item()}')
                temps_ecoule = time.time() - debut
                print('Temps écoulé : {:.0f}m {:.0f}s'.format(temps_ecoule // 60, temps_ecoule % 60))


def predire(ds, modele, debut_texte, nb_mots=20, mode =0):
    """Generate `nb_mots` words continuing `debut_texte` with `modele`.

    ds          : object exposing the mot_a_index and index_a_mot mappings.
    modele      : module exposing forward(lot_X, etat) -> (scores, etat)
                  and initializer_etat(batch_size).
    debut_texte : prompt; words are separated by single spaces and must all
                  belong to ds.mot_a_index.
    nb_mots     : number of words to generate.
    mode        : 0 picks the most probable word at each step; any other
                  value samples the word from the softmax distribution.

    Returns the list of prompt words followed by the generated words.
    Raises KeyError when a prompt word is not in the vocabulary.
    """
    mots = debut_texte.split(' ')
    modele.eval()
    if nb_mots <= 0:
        return mots

    inconnus = [mot for mot in mots if mot not in ds.mot_a_index]
    if inconnus:
        raise KeyError(f"Words missing from the vocabulary: {inconnus}")

    # Run on the device the model lives on instead of a module-level global.
    device_modele = next(modele.parameters()).device
    etat = modele.initializer_etat(1).to(device_modele)

    # The first pass consumes the whole prompt. Afterwards only the newly
    # generated word is fed: the carried hidden state already summarises every
    # earlier word, so feeding a window of past words again would process
    # them more than once.
    lot_X = torch.tensor([[ds.mot_a_index[mot] for mot in mots]], device=device_modele)
    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(nb_mots):
            lot_Y_predictions, etat = modele(lot_X, etat)
            dernier_mot_Yt = lot_Y_predictions[0][-1]
            probs_dernier_mot = torch.nn.functional.softmax(dernier_mot_Yt, dim=0)
            if mode == 0:
                index_mot_choisi = torch.max(probs_dernier_mot, dim=0)[1].item()
            else:
                index_mot_choisi = torch.multinomial(probs_dernier_mot, 1)[0].item()
            mots.append(ds.index_a_mot[index_mot_choisi])
            lot_X = torch.tensor([[index_mot_choisi]], device=device_modele)
    return mots

# Train the model, then generate text from the same prompt: three sampled
# continuations (mode=1) followed by the greedy one (mode=0).
entrainer_RNR(ds_paroles, modele, taille_lot=32, epochs=5, taille_sequence=taille_sequence)
for numero in (1, 2, 3):
    print(f"Texte stochastique {numero} :",predire(ds_paroles, modele, debut_texte='we could', mode = 1))
print("Texte max :",predire(ds_paroles, modele, debut_texte='we could', mode = 0))

Entrainement sur  cpu
-------- > epoch 0 lot 0 :  coût = 7.972325801849365
Temps écoulé : 0m 0s
-------- > epoch 0 lot 100 :  coût = 5.107701778411865
Temps écoulé : 0m 2s
-------- > epoch 0 lot 200 :  coût = 6.186596870422363
Temps écoulé : 0m 5s
-------- > epoch 0 lot 300 :  coût = 6.162123680114746
Temps écoulé : 0m 7s
-------- > epoch 0 lot 400 :  coût = 5.290503025054932
Temps écoulé : 0m 10s
-------- > epoch 0 lot 500 :  coût = 3.3843472003936768
Temps écoulé : 0m 12s
-------- > epoch 0 lot 600 :  coût = 6.347045421600342
Temps écoulé : 0m 14s
-------- > epoch 0 lot 700 :  coût = 5.083999156951904
Temps écoulé : 0m 17s
-------- > epoch 0 lot 800 :  coût = 5.900958061218262
Temps écoulé : 0m 19s
-------- > epoch 0 lot 900 :  coût = 6.766435623168945
Temps écoulé : 0m 21s
-------- > epoch 0 lot 1000 :  coût = 5.8724517822265625
Temps écoulé : 0m 24s
-------- > epoch 1 lot 0 :  coût = 5.9766106605529785
Temps écoulé : 0m 25s
-------- > epoch 1 lot 100 :  coût = 2.891873836517334
Tem

In [11]:
import pandas as pd
dataframe_entrainement = pd.read_csv('lyrics-data.csv')
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
dataframe_entrainement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379931 entries, 0 to 379930
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ALink     379930 non-null  object
 1   SName     379928 non-null  object
 2   SLink     379930 non-null  object
 3   Lyric     379854 non-null  object
 4   language  365296 non-null  object
dtypes: object(5)
memory usage: 14.5+ MB
None


In [7]:
# Preview the first ten rows whose language is English.
(
    dataframe_entrainement
    .loc[dataframe_entrainement['language'] == 'en']
    .head(10)
)

Unnamed: 0,ALink,SName,SLink,Lyric,language
69,/ivete-sangalo/,Careless Whisper,/ivete-sangalo/careless-whisper.html,I feel so unsure\nAs I take your hand and lead...,en
86,/ivete-sangalo/,Could You Be Loved / Citação Musical do Rap: S...,/ivete-sangalo/could-you-be-loved-citacao-musi...,"Don't let them fool, ya\nOr even try to school...",en
88,/ivete-sangalo/,Cruisin' (Part. Saulo),/ivete-sangalo/cruisin-part-saulo.html,"Baby, let's cruise, away from here\nDon't be c...",en
111,/ivete-sangalo/,Easy,/ivete-sangalo/easy.html,"Know it sounds funny\nBut, I just can't stand ...",en
140,/ivete-sangalo/,For Your Babies (The Voice cover),/ivete-sangalo/for-your-babies-the-voice-cover...,You've got that look again\nThe one I hoped I ...,en
147,/ivete-sangalo/,Human Nature,/ivete-sangalo/human-nature.html,Looking out\nAcross the night time\nThe city w...,en
159,/ivete-sangalo/,Losing Control (Miss Cady feat. Ivete Sangalo),/ivete-sangalo/losing-control-miss-cady-feat-i...,"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",en
168,/ivete-sangalo/,Master Blaster (Jammin'),/ivete-sangalo/master-blaster-jammin.html,Everyone's feeling pretty\nIt's hotter than Ju...,en
187,/ivete-sangalo/,More Than Words,/ivete-sangalo/more-than-words.html,Saying 'I Love you'\nIs not the words I want t...,en
207,/ivete-sangalo/,Natural Collie,/ivete-sangalo/natural-collie.html,Been down in the valley\nSmoking natural colli...,en


In [41]:
# Join the lyrics of the first two English songs into one space-separated string.
texte_concatene = (
    dataframe_entrainement
    .loc[dataframe_entrainement['language'] == 'en', 'Lyric']
    .iloc[:2]
    .str.cat(sep=' ')
)
print(texte_concatene)

I feel so unsure
As I take your hand and lead you to the dance floor
As the music dies, something in your eyes
Calls to mind a silver screen
And all those sad goodbyes

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste the chance that I've been given
So I'm never gonna dance again
The way I danced with you

Time can never mend
The careless whispers of a good friend
To the heart and mind
Ignorance is kind
There's no comfort in the truth
Pain is all you'll find

I'm never gonna dance again
Guilty feet have got no rhythm
Though it's easy to pretend
I know you're not a fool

Should've known better than to cheat a friend
And waste this chance that I've been given
So I'm never gonna dance again
The way I danced with you

Never without your love

Tonight the music seems so loud
I wish that we could lose this crowd
Maybe it's better this way
We'd hurt each other with the thing

In [43]:
import re
# Drop every character that is neither a word character nor whitespace
# (punctuation, apostrophes, ...), then show the cleaned text.
texte_concatene = re.compile(r'[^\w\s]').sub('', texte_concatene)
print(texte_concatene)

I feel so unsure
As I take your hand and lead you to the dance floor
As the music dies something in your eyes
Calls to mind a silver screen
And all those sad goodbyes

Im never gonna dance again
Guilty feet have got no rhythm
Though its easy to pretend
I know youre not a fool

Shouldve known better than to cheat a friend
And waste the chance that Ive been given
So Im never gonna dance again
The way I danced with you

Time can never mend
The careless whispers of a good friend
To the heart and mind
Ignorance is kind
Theres no comfort in the truth
Pain is all youll find

Im never gonna dance again
Guilty feet have got no rhythm
Though its easy to pretend
I know youre not a fool

Shouldve known better than to cheat a friend
And waste this chance that Ive been given
So Im never gonna dance again
The way I danced with you

Never without your love

Tonight the music seems so loud
I wish that we could lose this crowd
Maybe its better this way
Wed hurt each other with the things wed want to say

In [44]:
# Lowercase the text and split it on whitespace to show the resulting word list.
print(texte_concatene.lower().split())

['i', 'feel', 'so', 'unsure', 'as', 'i', 'take', 'your', 'hand', 'and', 'lead', 'you', 'to', 'the', 'dance', 'floor', 'as', 'the', 'music', 'dies', 'something', 'in', 'your', 'eyes', 'calls', 'to', 'mind', 'a', 'silver', 'screen', 'and', 'all', 'those', 'sad', 'goodbyes', 'im', 'never', 'gonna', 'dance', 'again', 'guilty', 'feet', 'have', 'got', 'no', 'rhythm', 'though', 'its', 'easy', 'to', 'pretend', 'i', 'know', 'youre', 'not', 'a', 'fool', 'shouldve', 'known', 'better', 'than', 'to', 'cheat', 'a', 'friend', 'and', 'waste', 'the', 'chance', 'that', 'ive', 'been', 'given', 'so', 'im', 'never', 'gonna', 'dance', 'again', 'the', 'way', 'i', 'danced', 'with', 'you', 'time', 'can', 'never', 'mend', 'the', 'careless', 'whispers', 'of', 'a', 'good', 'friend', 'to', 'the', 'heart', 'and', 'mind', 'ignorance', 'is', 'kind', 'theres', 'no', 'comfort', 'in', 'the', 'truth', 'pain', 'is', 'all', 'youll', 'find', 'im', 'never', 'gonna', 'dance', 'again', 'guilty', 'feet', 'have', 'got', 'no', 'r

In [45]:
# Full preprocessing pipeline on the first two English lyrics:
# concatenate, strip punctuation, lowercase, split into words.
texte_concatene = (
    dataframe_entrainement
    .loc[dataframe_entrainement['language'] == 'en', 'Lyric']
    .iloc[:2]
    .str.cat(sep=' ')
)
re.sub(r'[^\w\s]', '', texte_concatene).lower().split()

['i',
 'feel',
 'so',
 'unsure',
 'as',
 'i',
 'take',
 'your',
 'hand',
 'and',
 'lead',
 'you',
 'to',
 'the',
 'dance',
 'floor',
 'as',
 'the',
 'music',
 'dies',
 'something',
 'in',
 'your',
 'eyes',
 'calls',
 'to',
 'mind',
 'a',
 'silver',
 'screen',
 'and',
 'all',
 'those',
 'sad',
 'goodbyes',
 'im',
 'never',
 'gonna',
 'dance',
 'again',
 'guilty',
 'feet',
 'have',
 'got',
 'no',
 'rhythm',
 'though',
 'its',
 'easy',
 'to',
 'pretend',
 'i',
 'know',
 'youre',
 'not',
 'a',
 'fool',
 'shouldve',
 'known',
 'better',
 'than',
 'to',
 'cheat',
 'a',
 'friend',
 'and',
 'waste',
 'the',
 'chance',
 'that',
 'ive',
 'been',
 'given',
 'so',
 'im',
 'never',
 'gonna',
 'dance',
 'again',
 'the',
 'way',
 'i',
 'danced',
 'with',
 'you',
 'time',
 'can',
 'never',
 'mend',
 'the',
 'careless',
 'whispers',
 'of',
 'a',
 'good',
 'friend',
 'to',
 'the',
 'heart',
 'and',
 'mind',
 'ignorance',
 'is',
 'kind',
 'theres',
 'no',
 'comfort',
 'in',
 'the',
 'truth',
 'pain',
 'i

In [50]:
# Build the dataset again with the configured number of lyrics and sequence length.
ds_paroles = DatasetParolesLyricsDataEn(nombre_textes = nombre_textes, taille_sequence=taille_sequence)

In [51]:
# Show the first (input, target) pair: the target is the input shifted by one word.
ds_paroles[0]

(tensor([   1,   67,   20, 1610,  184,    1,   74,   14]),
 tensor([  67,   20, 1610,  184,    1,   74,   14,  318]))