In [5]:
from bs4 import BeautifulSoup  # parcourir la page
import requests as re  # Récup la page web
from typing import List, Set
import bs4
from driver import create_chrome_driver, create__tor_driver
from typing import List, Tuple
from selenium.webdriver.common.by import By
from time import sleep
from tqdm.notebook import tqdm
import pickle

# from classes import Artist, Song, Album


class Artist():
    def __init__(self, link, name):
        self.link = link
        self.name = name
        self.songs = dict()
        self.featurings = set()
    def __repr__(self):
        return f"Artist({self.name})"
    
    def __eq__(self, other: "Artist"):
        return (
            self.__class__ == other.__class__ and
            self.link == other.link and 
            self.name == other.name
        )
    def __hash__(self):
        return hash(self.link)
    def test(self):
        return self
    def update_songs(self):
        albums = artist_to_albums(self)
        for album in tqdm(albums):
            song_links, song_names = album_to_song_links(album)
            for i,link in enumerate(song_links):
                if song_names[i] not in self.songs:
                    song = Song(link,song_names[i], self, link_to_artists(link))
                    self.songs[song_names[i]] = song
                    if self in song.writers:
                        self.featurings.update(song.writers)

    def save(self):
        with open(f'./data/{self.name}.pickle', 'wb') as file:
            pickle.dump(self, file) 


class Song():
    def __init__(self, link: str, name: str, artist: Artist, writers: Set[Artist]):
        self.link = link
        self.artist = artist
        self.writers = writers
        self.name = name

    def __repr__(self):
        return f"Song(by = {self.artist}, name: {self.name})"
    def __eq__(self, other: "Song"):
        return (
            self.__class__ == other.__class__ and
            self.link == other.link
        )
    def __hash__(self):
        return hash(self.link)

class Album():
    def __init__(self, artist: Artist, name: str, link: str, year: int, song_list: List[str] = []):
        self.artist = artist
        self.name = name
        self.link = link
        self.songs = song_list
        self.year = year

    def __repr__(self):
        return f"Album(by {self.artist}, name: {self.name})"


def div_to_type(div: str) -> str:
    return div.find("div").text


def a_to_artist(a: bs4.element.Tag) -> Artist:
    artist_name = a.text
    artist_link = a["href"]
    return Artist(artist_link, artist_name)


def link_to_artists(lien: str) -> Set[Artist]:
    content = re.get(lien).text
    artists = set()
    filtered_content = BeautifulSoup(content).findAll(
        "div", class_="SongInfo__Credit-nekw6x-3")
    ecrit_par = [c for c in filtered_content if div_to_type(c) == "Written By"]
    if len(ecrit_par) <=0:
        return []

    for element in ecrit_par[0].findAll('a'):
        artists.add(a_to_artist(element))
    return artists


def artist_to_albums(artist: Artist) -> List[Album]:
    DRIVER.get(artist.link)
    elements = DRIVER.find_elements(By.CLASS_NAME, "full_width_button")
    button_list = [e for e in elements if "Show all albums" in e.text]
    if len(button_list) <= 0:
        print("no albums for artist", artist)
        return []
    button = button_list[0]
    button.click()
    albums = []
    i = 0 
    while len(albums) == 0:
        sleep(1)
        i += 1
        if i >15:
            raise TimeoutError("Too long")
        soup = BeautifulSoup(DRIVER.page_source, "lxml")
        albums = [(x.find('a')["title"], x.find('a')["href"], x.find("div", class_="mini_card-subtitle").text) for x in soup.findAll('mini-album-card')]
    if len(albums):
        sleep(1)

    return [Album(artist, name, link, year) for name, link, year in albums]


def album_to_song_links(album: Album) -> List[str]:
    content = re.get(album.link).text
    soup = BeautifulSoup(content)
    song_names = []
    song_links = []
    for div in soup.findAll("div", class_="chart_row-content"):
        song = div.find("a")
        name, link = (song.text.strip().split("\n")[0].replace(u'\xa0', u' '), song["href"])
        song_names.append(name)
        song_links.append(link)
    return song_links,song_names


DRIVER = create_chrome_driver()
# DRIVER.get("http://check.torproject.org")


In [10]:
from ast import literal_eval
from pathlib import Path


filepath = "./rapeurs1.txt"
filepath2 = './rapeurs2.txt'
def file_to_artists(path: str) -> List[Tuple[str,str]]:
    genius_path = "https://genius.com/artists/"
    artists = []
    with open(path, "r", encoding = "utf-8") as f:
        l = literal_eval(f.readlines()[0])
    for r in l:
        artists.append(Artist(genius_path+ r[0],r[1]))
    return artists
all_artists  = file_to_artists(filepath) + file_to_artists(filepath2)

In [11]:
for artist in tqdm(all_artists[:]):
    path = Path(f"./data/{artist.name}.pickle")
    if not path.is_file():
        print(artist)   
        artist.update_songs()
        artist.save()



HBox(children=(FloatProgress(value=0.0, max=318.0), HTML(value='')))

Artist(Tiakola)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Artist(Tiitof)


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


Artist(Timal)


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Artist(Titi Official)


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Artist(TLF)


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Artist(Martial Tricoche)
no albums for artist Artist(Martial Tricoche)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Artist(Tsew The Kid)


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Artist(Tunisiano)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Artist(Jewel Usain)


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Artist(Usky)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


Artist(Vald)


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


Artist(Vegedream)


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Artist(Vicelow)


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Artist(Vincha)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Artist(Vîrus)


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))


Artist(Wacko)


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Artist(Alpha Wann)


HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


Artist(Sulee B Wax)
no albums for artist Artist(Sulee B Wax)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Artist(Yannick)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Artist(Yosoji)
no albums for artist Artist(Yosoji)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Artist(Michaël Youn)
no albums for artist Artist(Michaël Youn)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Artist(Young Chang Mc)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Artist(Youssoupha)


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))


Artist(Youv Dee)


HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


Artist(Jean-Pascal Zadi)
no albums for artist Artist(Jean-Pascal Zadi)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Artist(Zamdane)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


Artist(Sofiane Zermani)


HBox(children=(FloatProgress(value=0.0, max=13.0), HTML(value='')))


Artist(Zesau)


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


Artist(Darryl Zeuja)


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


Artist(Zkr)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))


Artist(Zola)


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))


Artist(Zoxea)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))





In [None]:
import pickle 
with open("./data/Alkpote.pickle", "rb") as f:
    faye = pickle.load(f)
    print(faye.songs)

In [51]:
import pyspark 

sc = pyspark.SparkContext()
rdd_artists = sc.parallelize(all_artists)


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=pyspark-shell, master=local[*]) created by __init__ at C:\Users\bench\AppData\Local\Temp/ipykernel_51708/3530531434.py:3 

In [52]:
rdd_artists.foreach(lambda x: x.update_songs())

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 0.0 failed 1 times, most recent failure: Lost task 7.0 in stage 0.0 (TID 7) (192.168.1.104 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 619, in main
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 609, in process
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 417, in func
    return f(iterator)
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 916, in processPartition
    for x in iterator:
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 138, in load_stream
    yield self._read_with_length(stream)
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
AttributeError: Can't get attribute 'Artist' on <module 'pyspark.worker' from 'C:\\Spark\\spark-3.2.1-bin-hadoop2.7\\python\\lib\\pyspark.zip\\pyspark\\worker.py'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:555)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:713)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:695)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:508)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2454)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2402)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2402)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1160)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1160)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2642)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2584)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2573)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 619, in main
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 609, in process
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 2918, in pipeline_func
    return func(split, prev_func(split, iterator))
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 417, in func
    return f(iterator)
  File "c:\Users\bench\anaconda3\envs\p38\lib\site-packages\pyspark\rdd.py", line 916, in processPartition
    for x in iterator:
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 138, in load_stream
    yield self._read_with_length(stream)
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "C:\Spark\spark-3.2.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
AttributeError: Can't get attribute 'Artist' on <module 'pyspark.worker' from 'C:\\Spark\\spark-3.2.1-bin-hadoop2.7\\python\\lib\\pyspark.zip\\pyspark\\worker.py'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:555)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:713)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:695)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:508)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1030)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2254)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
