In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Inspection de la page Pitchfork

Choisissons les chansons des années 60.  
On remarque qu'il y a une liste de 200 morceaux répartis sur 10 pages, soit 20 morceaux par page. Il faudra naviguer un peu.

## Les morceaux

Chaque morceau est constitué d'un *classement* `class="rank"`, d'une liste `<ul><li>` de *groupe* ou *artiste* `class="artist-list list-blurb__artists"`, ainsi que d'un *titre* `<h2 class="list-blurb__work-title"></h2>`. Le tout est wrappé dans un container `class="list-blurb blurb-container container-fluid"`.  
Possibilité d'ajouter la production et l'année.

## La pagination
La pagination est assez simple: chaque page est accessible par une URL `url?page=n` avec `n` le numéro de la page.  
Cette pagination est contenue dans le tag `<ol class="fts-pagination__list">` et les liens sont répertoriés par la classe `fts-pagination__list-item__link`. Je n'aurai qu'à les dénombrer.


In [2]:
# Starting point: first page of the list
url = 'https://pitchfork.com/features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=1'
# Open the page and parse it with BeautifulSoup.
# The `with` block closes the HTTP response once parsing is done.
with urlopen(url) as page:
    bs = BeautifulSoup(page, 'html.parser')

In [3]:
# Quick check: pull the song-title tags out of the parsed page
titles_tags = bs.findAll('h2', {'class': 'list-blurb__work-title'})
for t in titles_tags:
    print(t.text)
# Not bad, but the curly quotes around each title should go

“Sunny Afternoon”
“Black Is the Color of My True Love’s Hair”
“Walk on By”
“Solo Dancer”
“Time Is on My Side”
“Night Train (Live at the Apollo)”
“Build Me Up Buttercup”
“Jackson”
“I’m Still in Love With You”
“Mercy, Mercy, Mercy”
“So Long, Marianne”
“Strychnine”
“Debora”
“The Sun Ain’t Gonna Shine Anymore”
“Bus Stop”
“Get Ready”
“Mother Popcorn (You Got to Have a Mother for Me)”
“Beyond the Sea”
“She’s Got You”
“Laisse Tomber les Filles”


In [4]:
import re

# Character class matching the curly quotes wrapped around each title
pattern = "[“”]"

# Strip the quotes from every title tag's text in one pass
titles = [re.sub(pattern, "", tag.text) for tag in titles_tags]

titles

# Works nicely!

['Sunny Afternoon',
 'Black Is the Color of My True Love’s Hair',
 'Walk on By',
 'Solo Dancer',
 'Time Is on My Side',
 'Night Train (Live at the Apollo)',
 'Build Me Up Buttercup',
 'Jackson',
 'I’m Still in Love With You',
 'Mercy, Mercy, Mercy',
 'So Long, Marianne',
 'Strychnine',
 'Debora',
 'The Sun Ain’t Gonna Shine Anymore',
 'Bus Stop',
 'Get Ready',
 'Mother Popcorn (You Got to Have a Mother for Me)',
 'Beyond the Sea',
 'She’s Got You',
 'Laisse Tomber les Filles']

Ajoutons une fonction qui permet d'aller chercher les titres et autres sans avoir à tout réécrire à chaque fois

In [5]:
def fetch(url, name, attrs, chars_to_remove=False, tag_attr=False):
    """Fetch elements from a web page with BeautifulSoup.

    Params:
    - url: str, URL of the page to download and parse
    - name: str, tag name to look for in the page, e.g. 'h1'
    - attrs: attribute filter passed to BeautifulSoup's ``find_all``
      (typically a dict such as ``{'class': 'rank'}``)
    - chars_to_remove: str, characters to strip from each tag's text;
      every character is removed literally (no regex meaning).
      Falsy (default) disables the clean-up.
    - tag_attr: str, name of a tag attribute (e.g. 'href') to return
      instead of the tag text. Falsy (default) returns the text.
      Ignored when ``chars_to_remove`` is set.

    Returns:
    List with one entry per matching tag: the cleaned text, the requested
    attribute value (None when the tag lacks it), or the raw text.
    """

    # Download and parse; the response is closed as soon as the parser
    # has consumed it.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')

    # Find every matching tag
    tags = soup.find_all(name=name, attrs=attrs)

    # If chars_to_remove is defined, delete those characters from the text.
    # A translation table is used instead of a regex character class so that
    # characters such as '^', ']', '-' or '\\' are removed literally.
    if chars_to_remove:
        removal_table = str.maketrans('', '', chars_to_remove)
        return [t.text.translate(removal_table) for t in tags]

    # If tag_attr is defined, return that attribute from each tag
    if tag_attr:
        return [t.get(tag_attr) for t in tags]

    # Otherwise, return the default, i.e. the text inside each tag
    return [t.text for t in tags]

In [6]:
# Same title extraction as before, now through the reusable helper
titles_attrs = {'class': 'list-blurb__work-title'}
titles = fetch(url, name='h2', attrs=titles_attrs, chars_to_remove="“”")
titles

['Sunny Afternoon',
 'Black Is the Color of My True Love’s Hair',
 'Walk on By',
 'Solo Dancer',
 'Time Is on My Side',
 'Night Train (Live at the Apollo)',
 'Build Me Up Buttercup',
 'Jackson',
 'I’m Still in Love With You',
 'Mercy, Mercy, Mercy',
 'So Long, Marianne',
 'Strychnine',
 'Debora',
 'The Sun Ain’t Gonna Shine Anymore',
 'Bus Stop',
 'Get Ready',
 'Mother Popcorn (You Got to Have a Mother for Me)',
 'Beyond the Sea',
 'She’s Got You',
 'Laisse Tomber les Filles']

Okay, tout a l'air de fonctionner.
Testons la navigation.

In [7]:
# Collect the pagination links (the href of every pagination anchor).
# attrs must be a dict mapping attribute name to value; the previous
# {'class', '...'} was a set literal (comma instead of colon).
anchors = fetch(url, name='a', attrs={'class': 'fts-pagination__list-item__link'}, tag_attr='href')


In [8]:
df = pd.DataFrame(columns=['rank', 'song', 'artist', 'label', 'year'])

# Per-field lookup settings: tag name, attribute filter and, for titles,
# the characters to strip from the text.
titles_conf = {
    'tag': 'h2',
    'attrs': {'class': 'list-blurb__work-title'},
    'skip': "“”"
}

rank_conf = {
    'tag': 'div',
    'attrs': {'class': 'rank'}
}

label_conf = {
    'tag': 'li',
    'attrs': {'class': 'labels-list__item'}
}

artists_conf = {
    'tag': 'ul',
    # Explicit class filter, consistent with the other configurations
    'attrs': {'class': 'artist-list list-blurb__artists'}
}

# Site root, prepended to the relative pagination links. Kept under its own
# name so that `url` (the first-page URL used by earlier cells) is not
# overwritten and those cells stay re-runnable.
base_url = 'https://pitchfork.com'

# For each pagination link, collect the rank, title, artist and label.
# The 'year' column is left empty here; the label text still holds both the
# label and the year and has to be split afterwards.
# NOTE: each fetch() call downloads the page again (four requests per page).
page_frames = []
for a in anchors:
    print('Fetching... {}'.format(a))
    uri = base_url + a
    temp = pd.DataFrame(columns=['rank', 'song', 'artist', 'label', 'year'])

    print('Looking for rank...')
    temp['rank'] = fetch(uri, name=rank_conf['tag'], attrs=rank_conf['attrs'])
    print('Done.')
    print('Looking for songs...')
    temp['song'] = fetch(uri, name=titles_conf['tag'], attrs=titles_conf['attrs'], chars_to_remove=titles_conf['skip'])
    print('Done.')
    print('Looking for artists...')
    temp['artist'] = fetch(uri, name=artists_conf['tag'], attrs=artists_conf['attrs'])
    print('Done.')
    print('Looking for labels...')
    temp['label'] = fetch(uri, name=label_conf['tag'], attrs=label_conf['attrs'])
    print('Done.')

    page_frames.append(temp)
    print('Saving... done.')

# Concatenate once instead of growing the frame inside the loop.
# The index is intentionally not reset: it restarts at 0 for every page.
df = pd.concat([df, *page_frames], axis=0)

    

Fetching... /features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=1
Looking for rank...
Done.
Looking for songs...
Done.
Looking for artists...
Done.
Looking for labels...
Done.
Saving... done.
Fetching... /features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=2
Looking for rank...
Done.
Looking for songs...
Done.
Looking for artists...
Done.
Looking for labels...
Done.
Saving... done.
Fetching... /features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=3
Looking for rank...
Done.
Looking for songs...
Done.
Looking for artists...
Done.
Looking for labels...
Done.
Saving... done.
Fetching... /features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=4
Looking for rank...
Done.
Looking for songs...
Done.
Looking for artists...
Done.
Looking for labels...
Done.
Saving... done.
Fetching... /features/lists-and-guides/6405-the-200-greatest-songs-of-the-1960s/?page=5
Looking for rank...
Done.
Looking for songs...
Done.
Loo

In [9]:
# Preview the first five scraped rows
df.head(n=5)

Unnamed: 0,rank,song,artist,label,year
0,200,Sunny Afternoon,The Kinks,"Marble Arch Records, 1966",
1,199,Black Is the Color of My True Love’s Hair,Nina Simone,"Colpix Records, 1964",
2,198,Walk on By,Dionne Warwick,"Scepter Records, 1964",
3,197,Solo Dancer,Charles Mingus,"Impulse!, 1963",
4,196,Time Is on My Side,Irma Thomas,"Imperial, 1964",


In [10]:
# Preview the last five scraped rows
df.tail(n=5)

Unnamed: 0,rank,song,artist,label,year
15,5,A Day in the Life,The Beatles,"Parlophone/Capitol, 1967",
16,4,Like a Rolling Stone,Bob Dylan,"CBS, 1965",
17,3,A Change Is Gonna Come,Sam Cooke,"RCA Victor, 1964",
18,2,I Want You Back,The Jackson 5,"Motown, 1969",
19,1,God Only Knows,The Beach Boys,"Capitol, 1966",


# Résultats
Quelques arrangements à faire avec les labels et les années avant de faire un peu d'analyse !
