In [1]:
import os
import time
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
import string
from unidecode import unidecode
import pickle
import json

In [2]:
# List from: https://fr.wikipedia.org/wiki/Cat%C3%A9gorie:Rappeur_fran%C3%A7ais
rappers_list = [unidecode(l.strip().lower()) for l in open('rappers.txt', encoding="utf8")]

In [3]:
rappers_list = [''.join(c if c in string.ascii_lowercase or c.isnumeric() else '-' for c in name ) for name in rappers_list]

In [4]:
len(rappers_list)

381

In [5]:
def get_song_names(artist_name):
    response = requests.get(f'https://www.paroles.net/{artist_name}')
    soup = BeautifulSoup(response.text, 'html.parser')
    songs_list = []
    main_div = soup.find("div", {"id": "main"})
    
    if main_div:
        for table in main_div.find_all("table", {"class": "song-list"}):
            if 'tab-mobile' in table.attrs['class']:
                continue
            for song in table.find_all("td", {"class": "song-name"}):
                try:
                    song_name = song.find('a').text
                    song_link = song.find('a')['href']
                    songs_list.append((song_name, song_link))
                except:
                    print("Problem with", artist_name)
    return songs_list

pickle.dump(songs_lists, open('songs_lists.pickle', 'wb'))

In [6]:
songs_lists = pickle.load(open('songs_lists.pickle', 'rb'))

In [7]:
songs_lists['bigflo-oli']

[('Tandem', '/bigflo-oli/paroles-tandem'),
 ('FREESTYLE EN MODE NEW WAVE !',
  '/bigflo-oli/paroles-freestyle-en-mode-new-wave'),
 ('FREESTYLE TECHNIQUE 2023 !',
  '/bigflo-oli/paroles-freestyle-technique-2023'),
 ("FREESTYLE À L'ANCIENNE", '/bigflo-oli/paroles-freestyle-a-l-ancienne'),
 ('30 ans', '/bigflo-oli/paroles-30-ans'),
 ('OLI Freestyle Planète Rap',
  '/bigflo-oli/paroles-oli-freestyle-planete-rap'),
 ('Alors Alors', '/bigflo-oli/paroles-alors-alors'),
 ('Bienvenue Chez Moi', '/bigflo-oli/paroles-bienvenue-chez-moi'),
 ('Comme d’hab', '/bigflo-oli/paroles-comme-d-hab'),
 ('Coup De Blues/Soleil', '/bigflo-oli/paroles-coup-de-blues-soleil'),
 ('Coup De Vieux', '/bigflo-oli/paroles-coup-de-vieux'),
 ('Demain', '/bigflo-oli/paroles-demain'),
 ('Dernière', '/bigflo-oli/paroles-derniere'),
 ('Dommage', '/bigflo-oli/paroles-dommage'),
 ('Gangsta', '/bigflo-oli/paroles-gangsta'),
 ('Insolent #4', '/bigflo-oli/paroles-insolent-4'),
 ("J'étais Pas Là", '/bigflo-oli/paroles-j-etais-pas-

In [8]:
songs_lists['aya-nakamura']

[('Chérie', '/aya-nakamura/paroles-cherie'),
 ('Come back', '/aya-nakamura/paroles-come-back'),
 ('Bisous', '/aya-nakamura/paroles-bisous'),
 ('Fin', '/aya-nakamura/paroles-fin'),
 ('Chacun', '/aya-nakamura/paroles-chacun'),
 ('Coller', '/aya-nakamura/paroles-coller'),
 ('40%', '/aya-nakamura/paroles-40'),
 ('Baby', '/aya-nakamura/paroles-baby'),
 ('Belek', '/aya-nakamura/paroles-bellek'),
 ('Biff', '/aya-nakamura/paroles-biff'),
 ('Bloqué', '/aya-nakamura/paroles-bloque'),
 ('Bobo', '/aya-nakamura/paroles-bobo'),
 ('Ça Blesse', '/aya-nakamura/paroles-ca-blesse'),
 ('Cadeau', '/aya-nakamura/paroles-cadeau'),
 ('Chacun', '/aya-nakamura/paroles-chacun'),
 ('Chérie', '/aya-nakamura/paroles-cherie'),
 ('Claqué', '/aya-nakamura/paroles-claque'),
 ('Coller', '/aya-nakamura/paroles-coller'),
 ('Comportement', '/aya-nakamura/paroles-comportement'),
 ('Copines', '/aya-nakamura/paroles-copines'),
 ('Corazon', '/aya-nakamura/paroles-corazon'),
 ('Daddy', '/aya-nakamura/paroles-daddy'),
 ('Dégaine

In [9]:
len(_)

141

In [10]:
len(get_song_names('aya-nakamura'))

55

In [11]:
def get_song_lyrics(song_url):
    response = requests.get(f'https://www.paroles.net/{song_url}')
    soup = BeautifulSoup(response.text, 'html.parser')
    main_div = soup.find("div", {"class": "song-text"})
    
    song_text = []
    if main_div:
        for div in main_div.find_all("div"):
            line = div.text.strip()
            if line:
                lines = line.split('\r\n')
                song_text.extend(lines)
    return song_text

In [12]:
lyrics = {}
progress = tqdm(songs_lists)
for artist in progress:
    songs_list = songs_lists[artist]
    if songs_list == []:
        continue
    
    progress.set_description("Processing " + artist)
    lyrics[artist] = {}
    for song_title, song_url in songs_list:
        song_lyrics = get_song_lyrics(song_url)
        if lyrics:
            lyrics[artist][song_title] = song_lyrics
        time.sleep(0.5)

  0%|          | 0/380 [00:00<?, ?it/s]

In [18]:
len(lyrics)

277

In [19]:
import json
with open('all_lyrics_v2.json', 'w') as fp:
    json.dump(lyrics, fp)

In [22]:
pickle.dump(lyrics, open('songs_lyrics_v2.pickle', 'wb'))

In [25]:
from lingua import Language, LanguageDetectorBuilder
languages = [Language.ENGLISH, Language.FRENCH]
detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [26]:
only_french_lyrics = {}
for artist in tqdm(lyrics):
    for title, song_lyrics in lyrics[artist].items():
        if detector.detect_language_of(' '.join(song_lyrics)) == Language.FRENCH:
            if artist not in only_french_lyrics:
                only_french_lyrics[artist] = {}
            only_french_lyrics[artist][title] = song_lyrics

  0%|          | 0/277 [00:00<?, ?it/s]

In [70]:
len(only_french_lyrics)

271

In [71]:
sum([len(' '.join(song_lyrics).split()) for artist, song_lyrics in only_french_lyrics.items()])

35077

In [28]:
import json
with open('all_lyrics_fr_v2.json', 'w') as fp:
    json.dump(only_french_lyrics, fp)

In [29]:
pickle.dump(only_french_lyrics, open('french_lyrics_only_v2.pickle', 'wb'))

In [30]:
only_french_lyrics.keys()

dict_keys(['abd-al-malik', 'ademo', 'admiral-t', 'aketo', 'akhenaton', 'l-algerino', 'ali', 'alkpote', 'alonzo', 'alpha-5-20', 'ap', 'arm', 'ateyaba', 'axiom', 'aya-nakamura', 'pit-baccardi', 'bakar', 'gerard-baste', 'bekar', 'ben-j', 'benab', 'benash', 'bigflo-oli', 'big-red', 'black-kent', 'black-m', 'blacko', 'boef', 'bolemvn', 'booba', 'boostee', 'bosh', 'bouga', 'brasco', 'brav', 'lucio-bukowski', 'deen-burbigo', 'busta-flex', 'canardo', 'nick-conrad', 'cuizinier', 'da-uzi', 'daddy-lord-c', 'dadoo', 'dany-dan', 'abou-debeing', 'dee-nasty', 'def-bond', 'james-delleck', 'demi-portion', 'demon-one', 'diamond-deuklo', 'souleymane-diamanka', 'disiz', 'doc-gyneco', 'don-choa', 'doomams', 'dooz-kawa', 'dosseh', 'doums', 'driver', 'edge', 'ejm', 'eklips', 'elams', 'elh-kmer', 'benjamin-epps', 'eureka', 'f', 'fababy', 'fabe', 'faf-larage', 'fave', 'gael-faye', 'fefe', 'fisto', 'flynt', 'la-fouine', 'freeman', 'freeze-corleone', 'fuzati', 'gambi', 'gazo', 'georgio', 'gradur', 'grand-corps-m