In [1]:
from tqdm import tqdm
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import re
import sys

## Querying Wikidata
These functions query the Wikidata database to extract information about each artist, such as:
- name of the artist
- website of the artist
- start of working period (only year)
- end of working period (only year)
- the year in which the artist dissolved (if ever; only meaningful for groups)
- citizenship(s) for humans and country of origin for groups
- the awards the artist won

In [2]:
def get_info_by_spotify_id(spotify_artist_id):
    """Look up an artist on Wikidata through its Spotify artist ID.

    The query selects the entity whose ``wdt:P1902`` value equals
    ``spotify_artist_id`` and, where available, its website label and the
    years of its ``startPeriod``, ``endPeriod`` and ``dissolvedTime`` dates.

    Returns:
        The first result binding (a dict keyed by the selected variable
        names, each holding a ``{'value': ...}`` entry), or ``None`` when
        the query produces no rows.
    """
    query = """
        SELECT ?artist ?artistLabel (YEAR(?startPeriod) AS ?start) (YEAR(?endPeriod) AS ?end) (YEAR(?dissolvedTime) AS ?dissolved) ?websiteLabel
        WHERE {
            ?artist wdt:P1902 "%s" .
            OPTIONAL { ?artist wdt:P856 ?website . }
            OPTIONAL { ?artist wdt:P2031 ?startPeriod . }
            OPTIONAL { ?artist wdt:P2032 ?endPeriod . }
            OPTIONAL { ?artist wdt:P576 ?dissolvedTime . }
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
        }
    """ % spotify_artist_id

    # identify the client to the endpoint, including the running Python version
    user_agent = "database2_project Python/%s.%s" % sys.version_info[:2]

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']

    # only the first row is used; an empty list means no entity carries this ID
    return bindings[0] if bindings else None

In [3]:
def get_info_by_label(artist_label):
    """Look up an artist on Wikidata through its exact English label.

    The query selects an entity whose English ``rdfs:label`` equals
    ``artist_label`` and, where available, its website label and the years
    of its ``startPeriod``, ``endPeriod`` and ``dissolvedTime`` dates.

    NOTE(review): the label is the only mandatory pattern, so any entity
    with that label can match (not necessarily a musician), and ``LIMIT 1``
    keeps whichever row the endpoint returns first.

    Args:
        artist_label: the artist's name; converted to ``str`` and escaped
            before being embedded in the query.

    Returns:
        The single result binding (a dict keyed by the selected variable
        names), or ``None`` when nothing matches.
    """
    url = "https://query.wikidata.org/sparql"

    # The label is embedded in a double-quoted SPARQL string literal, so
    # backslashes, double quotes and line breaks must be escaped; otherwise a
    # name such as '"Weird Al" Yankovic' produces a malformed query.
    safe_label = (
        str(artist_label)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    query = """
        SELECT ?artist ?artistLabel (YEAR(?startPeriod) AS ?start) (YEAR(?endPeriod) AS ?end) (YEAR(?dissolvedTime) AS ?dissolved) ?websiteLabel
        WHERE {
            ?artist rdfs:label "%s"@en .
            OPTIONAL { ?artist wdt:P856 ?website . }
            OPTIONAL { ?artist wdt:P2031 ?startPeriod . }
            OPTIONAL { ?artist wdt:P2032 ?endPeriod . }
            OPTIONAL { ?artist wdt:P576 ?dissolvedTime . }
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
        }
        LIMIT 1
    """ % safe_label

    user_agent = "database2_project Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

    sparql = SPARQLWrapper(url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # there can be at most 1 row (LIMIT 1) -> return it, or None if absent
    bindings = results['results']['bindings']
    if len(bindings):
        return bindings[0]
    return None

In [4]:
def get_countrycode_by_wd_entity(entity_id):
    """Collect the country codes of the countries a Wikidata entity points to.

    The query follows every property of ``wd:<entity_id>`` whose value is an
    instance of ``wd:Q6256`` (or of one of its subclasses) and reads that
    value's ``wdt:P297`` claim, bound to ``?iso31661alpha2code``.

    NOTE(review): the property is not restricted (``?prop``), so any
    statement of the entity that targets such a country contributes a code.

    Args:
        entity_id: Wikidata entity ID (the last segment of the entity URI).

    Returns:
        The list of result bindings, each holding an
        ``'iso31661alpha2codeLabel'`` entry, or ``None`` when there are no
        rows.
    """
    url = "https://query.wikidata.org/sparql"

    # The original query also declared `VALUES ?types {...}`, but ?types was
    # never referenced by any pattern; it only multiplied the rows before
    # DISTINCT collapsed them again, so it has been dropped.
    query = """
        SELECT DISTINCT ?iso31661alpha2codeLabel
        WHERE {
            wd:%s ?prop ?country .
            ?country p:P31/ps:P31/wdt:P279* wd:Q6256 ;
                     wdt:P297 ?iso31661alpha2code .
            SERVICE wikibase:label {bd:serviceParam wikibase:language "en" . }
        }
    """ % entity_id

    user_agent = "database2_project Python/%s.%s" % (sys.version_info[0], sys.version_info[1])

    sparql = SPARQLWrapper(url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # there can be more than one country, so return everything
    bindings = results['results']['bindings']
    if len(bindings):
        return bindings
    return None

In [5]:
def get_awards_by_wd_entity(entity_id):
    """List the awards recorded on Wikidata for an entity.

    The query walks the entity's ``p:P166`` statements, keeps the awards
    that are instances of ``wd:Q1364556`` (or of one of its subclasses) and
    reads the optional ``pq:P585`` qualifier as the award year.

    Args:
        entity_id: Wikidata entity ID (the last segment of the entity URI).

    Returns:
        The list of result bindings; each has an ``'awardLabel'`` entry and,
        when the statement carries a date, an ``'awardYear'`` entry.
        ``None`` is returned when there are no rows.
    """
    query = """
        SELECT DISTINCT ?awardLabel (YEAR(?awardDate) as ?awardYear)
        WHERE {
            wd:%s p:P166 ?awardStatement .
            ?awardStatement ps:P166 ?award .
            OPTIONAL { ?awardStatement pq:P585 ?awardDate . }
            ?award p:P31/ps:P31/wdt:P279* wd:Q1364556 .
            SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . } 
        }
    """ % entity_id

    # identify the client to the endpoint, including the running Python version
    user_agent = "database2_project Python/%s.%s" % sys.version_info[:2]

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql", agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()['results']['bindings']

    # an empty result list is reported as None
    return bindings or None

## Preprocessing
- read the .csv file (only the `Artist` and `Url_spotify` columns)
- drop duplicated artists: the data we ask for would be the same every time
- extract Spotify artist ID from the URL

In [6]:
# Load only the two columns needed for the lookups and keep one row per artist.
artists = (
    pd.read_csv('../data/Spotify_Youtube.csv', usecols=['Artist', 'Url_spotify'])
    .drop_duplicates(subset='Artist')
    .reset_index(drop=True)
)
# The Spotify artist ID is the last path segment of the URL; it replaces the
# URL in the same column, so 'Url_spotify' holds the bare ID from here on.
artists['Url_spotify'] = artists['Url_spotify'].map(lambda url: url.rsplit('/', 1)[-1])

In [7]:
# Display the deduplicated artist table ('Url_spotify' now holds the bare Spotify artist ID).
artists

Unnamed: 0,Artist,Url_spotify
0,Gorillaz,3AA28KZvwAUcZuOKwyblJQ
1,Red Hot Chili Peppers,0L8ExT028jH3ddEcZwqJJ5
2,50 Cent,3q7HBObVc0L8jNeTe5Gofh
3,Metallica,2ye2Wgw4gimLv2eAKyk1NB
4,Coldplay,4gzpq5DPGxSnKTe4SA8HAU
...,...,...
2074,Grupo Frontera,6XkjpgcEsYab502Vr1bBeW
2075,Jung Kook,6HaGTQPmzraVmaVxvz6EUc
2076,LE SSERAFIM,4SpbR6yFEvexJuaBpgAU5p
2077,ThxSoMch,4MvZhE1iuzttcoyepkpfdF


## Save the data
This cell executes the queries and collects all the available information; the cells that follow assemble it into pandas DataFrames and save them as CSV files.

In [8]:
# Column accumulators for the final artist table: every list receives exactly
# one entry per row of `artists`, with '_' standing in for a missing value.
artistLabels, websiteLabels = [], []
starts, ends, dissolveds = [], [], []
country_codes = []

# Award catalogue (a name's position in the list is its award_id) and the
# artist/award/year statements that refer to it.
award_names = []
award_statements = {
    'artist_spotify_id': [],
    'award_id': [],
    'award_year': []
}

# result variable -> list that stores it
info_columns = {
    'artistLabel': artistLabels,
    'websiteLabel': websiteLabels,
    'start': starts,
    'end': ends,
    'dissolved': dissolveds,
}

for row in tqdm(artists.itertuples(index=False), total=len(artists)):
    # try the Spotify ID first and fall back to the artist's name
    q_res = get_info_by_spotify_id(row.Url_spotify)
    if q_res is None:
        q_res = get_info_by_label(row.Artist)

    if q_res is None:
        # nothing found on Wikidata: fill every column with the placeholder
        for column in info_columns.values():
            column.append('_')
        country_codes.append('_')
        continue

    # the entity URI looks like http://www.wikidata.org/entity/ID
    artist_wd_entity_id = q_res['artist']['value'].rsplit('/', 1)[-1]

    # country codes: all codes of the artist joined into one '+'-separated string
    q_cc = get_countrycode_by_wd_entity(artist_wd_entity_id)
    if q_cc is None:
        country_codes.append('_')
    else:
        country_codes.append(
            '+'.join(binding['iso31661alpha2codeLabel']['value'] for binding in q_cc)
        )

    # awards received
    for award in get_awards_by_wd_entity(artist_wd_entity_id) or []:
        award_label = award['awardLabel']['value']

        # an award without a label shows up as its bare Q-identifier: skip it
        if re.fullmatch(r'Q[0-9]+', award_label):
            continue

        # register the award name the first time it is seen
        if award_label not in award_names:
            award_names.append(award_label)

        # record that this artist won the award, with the year when known
        award_statements['artist_spotify_id'].append(row.Url_spotify)
        award_statements['award_id'].append(award_names.index(award_label))
        award_statements['award_year'].append(
            award['awardYear']['value'] if 'awardYear' in award else '_'
        )

    # other artist info: the JSON response only contains the variables that
    # are bound, so every absent key is stored as '_'
    for key, column in info_columns.items():
        column.append(q_res[key]['value'] if key in q_res else '_')

100%|██████████| 2079/2079 [32:01<00:00,  1.08it/s]


In [9]:
# Attach the collected Wikidata columns to a copy of the artist table
# (the original `artists` frame is left untouched) and write it to disk.
artist_info = artists.assign(
    artistLabel=artistLabels,
    websiteLabel=websiteLabels,
    start=starts,
    end=ends,
    dissolved=dissolveds,
    country_codes=country_codes,
)
artist_info.to_csv('../data/wikidata_artists.csv', index=False)

In [10]:
# One row per distinct award; award_id is the position of the name in
# `award_names`, which is the value stored in the award statements.
awards_info = (
    pd.DataFrame({'award_name': award_names})
    .rename_axis('award_id')
    .reset_index()
)
awards_info

Unnamed: 0,award_id,award_name
0,0,MOJO Awards
1,1,Grammy Award for Best Rock Performance by a Du...
2,2,MTV Europe Music Award for Best Album
3,3,Grammy Award for Best Rock Album
4,4,Grammy Award for Best Dance/Electronic Album
...,...,...
188,188,Latin Grammy Award for Best Recording Package
189,189,First prize of the Eurovision Song Contest
190,190,MTV Europe Music Award for Best Korean Act
191,191,P3 Gull for Artist of the Year


In [11]:
# Split each award name at the first ' for ' into a type and a category,
# e.g. 'Grammy Award for Best Rock Album' -> 'Grammy Award' / 'Best Rock Album'.
# A name without ' for ' keeps the whole name as its type and gets a missing
# category (written as an empty field in the CSV).
# The vectorised .str.split(..., expand=True) builds the columns directly
# instead of creating one pd.Series per row through .apply(pd.Series).
split_names = (
    awards_info['award_name']
    .str.split(' for ', n=1, expand=True)
    .rename(columns={0: 'award_type', 1: 'award_category'})
)
split_names

Unnamed: 0,award_type,award_category
0,MOJO Awards,
1,Grammy Award,Best Rock Performance by a Duo or Group with V...
2,MTV Europe Music Award,Best Album
3,Grammy Award,Best Rock Album
4,Grammy Award,Best Dance/Electronic Album
...,...,...
188,Latin Grammy Award,Best Recording Package
189,First prize of the Eurovision Song Contest,
190,MTV Europe Music Award,Best Korean Act
191,P3 Gull,Artist of the Year


In [12]:
# Put the id/name columns and the type/category columns side by side;
# both frames share the same row index, so rows line up one to one.
awards_final = awards_info.join(split_names)
awards_final

Unnamed: 0,award_id,award_name,award_type,award_category
0,0,MOJO Awards,MOJO Awards,
1,1,Grammy Award for Best Rock Performance by a Du...,Grammy Award,Best Rock Performance by a Duo or Group with V...
2,2,MTV Europe Music Award for Best Album,MTV Europe Music Award,Best Album
3,3,Grammy Award for Best Rock Album,Grammy Award,Best Rock Album
4,4,Grammy Award for Best Dance/Electronic Album,Grammy Award,Best Dance/Electronic Album
...,...,...,...,...
188,188,Latin Grammy Award for Best Recording Package,Latin Grammy Award,Best Recording Package
189,189,First prize of the Eurovision Song Contest,First prize of the Eurovision Song Contest,
190,190,MTV Europe Music Award for Best Korean Act,MTV Europe Music Award,Best Korean Act
191,191,P3 Gull for Artist of the Year,P3 Gull,Artist of the Year


In [13]:
# Write the award catalogue (id, full name, type, category) to disk without the row index.
awards_final.to_csv('../data/wikidata_awards.csv', index=False)

In [14]:
# Turn the collected artist/award/year statements into a table (one column
# per key of `award_statements`) and write it to disk without the row index.
award_statements_df = pd.DataFrame.from_dict(award_statements)
award_statements_df.to_csv('../data/wikidata_award_statements.csv', index=False)