In [None]:
import os
import readtop2000
import tqdm
import pandas as pd

In [None]:
# Dutch Wikipedia page listing the Radio 2 Top 2000 editions; read() passes it to the table extractor.
URL_TOP2000_HISTORY = 'https://nl.wikipedia.org/wiki/Lijst_van_Radio_2-Top_2000%27s'

In [None]:
# NOTE(review): bare expression — it only displays the imported module's repr.
# Looks like a leftover sanity check; consider removing this cell.
readtop2000

## Base tables

In [None]:
def read():
    """Fetch the Top 2000 history table and split it into the four base tables.

    The page at ``URL_TOP2000_HISTORY`` is handed to
    ``readtop2000.WikipediaTableExtractor`` (the ``[1]`` / ``[0]`` arguments are
    presumably table and header selectors — TODO confirm in readtop2000), and the
    extracted frame is passed through ``readtop2000.Top2000Cleaner``.

    Returns:
        A 4-tuple ``(notering, song, songartist, artist)`` as produced by
        ``Top2000Cleaner.clean()``.
    """
    raw_table = readtop2000.WikipediaTableExtractor(
        URL_TOP2000_HISTORY, [1], [0]
    ).extract_table_as_dataframe()

    # Unpack explicitly so a cleaner that does not yield exactly four tables fails here.
    notering_tbl, song_tbl, songartist_tbl, artist_tbl = readtop2000.Top2000Cleaner(raw_table).clean()
    return notering_tbl, song_tbl, songartist_tbl, artist_tbl
# Build the four base tables once; the cells below use these module-level names.
notering, song, songartist, artist = read()

In [None]:
# Sanity check: show the artist rows whose Name is associated with more than one distinct Link.
artist[
    artist.groupby('Name')['Link'].transform('nunique') > 1
]

## Detailed information for artists from infoboxes
Drop Space Monkey: its Wikipedia page does have an infobox, but that infobox describes the song, not the artist.

Drop Anita Garbo because the Wikipedia link redirects to a song instead of to a page about her.


In [None]:
def download_infobox_details(links):
    """Read the infobox behind every link and stack the results into one frame.

    Each link is handed to ``readtop2000.InfoboxReader`` with ``allow_errors=True``
    (presumably so a single bad page does not abort the whole run — confirm in
    readtop2000), while tqdm shows progress over the iteration.

    Args:
        links: Iterable of links accepted by ``readtop2000.InfoboxReader``.

    Returns:
        The per-link results concatenated with ``pd.concat`` in input order;
        the individual indexes are kept as they are.
    """
    per_link_frames = [
        readtop2000.InfoboxReader(link, allow_errors=True).read()
        for link in tqdm.tqdm(links)
    ]
    return pd.concat(per_link_frames)

# Leave out the two artists whose Wikipedia link is known to lead to a song rather than
# to the artist, then read the infobox for every remaining link.
links = artist.loc[~artist['Name'].isin(['Anita Garbo', 'Space Monkey']), 'Link']
artist_details = download_infobox_details(links)

In [None]:
# Band membership is a many-to-many relation between members and bands (members can have
# several functions, and member pages list the bands they were active in), which is much
# harder to handle. Those infobox sections are therefore ignored: the headers
# 'Leden' / 'Oud-leden' / 'Bezetting' and every header starting with 'Actief'.
#
# na=False treats a missing Header as "not a membership section". Without it,
# str.startswith yields NaN for such rows on object-dtype columns and the `~` negation
# raises a TypeError.
#
# The remaining rows are pivoted to one row per OriginalLink and one column per Variable.
extra_artist_details = (
    artist_details[
        ~artist_details['Header'].isin(['Leden', 'Oud-leden', 'Bezetting'])
        & ~artist_details['Header'].str.startswith('Actief', na=False)
    ]
    .set_index(['OriginalLink', 'Variable'])['Value']
    .unstack()
)

In [None]:
# Left-join the pivoted infobox attributes onto the artist table: artist['Link'] is
# matched against the index of extra_artist_details.
artist_full = artist.merge(
    extra_artist_details,
    how='left',
    left_on='Link',
    right_index=True,
)

## Save all tables

In [None]:
# Every table is written to Data/<name>.parquet.
tables = {'notering': notering,
          'song': song,
          'songartist': songartist,
          'artist_small': artist,
          'artist': artist_full,
          'artist_details': artist_details
         }
# Create the output directory up front so the notebook also runs top to bottom on a
# fresh checkout; writing the parquet files fails when the directory is missing.
os.makedirs('Data', exist_ok=True)
for name, table in tables.items():
    table.to_parquet(os.path.join('Data', f'{name}.parquet'))