In [7]:
import requests
import json
import os

def getArtistsDocumentedStartingFrom(X, timeout=30):
    """
    Fetch a range of artists from the wasabi API starting at index X and save it to disk.

    The decoded JSON response is written to 'documented_artists.json' in the
    current working directory (no sub-directory is created).

    Parameters:
        X: start index that is interpolated into the API URL.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests may block forever on an unresponsive server.

    Returns:
        The path of the written file on success. On any failure (HTTP error
        status, connection problem, timeout, or a body that is not valid JSON)
        a human-readable error message string is returned instead, and no
        file is written.
    """
    # API endpoint with the specified index X
    url = f"http://wasabi.i3s.unice.fr/api/v1/artist_all/{X}"

    try:
        response = requests.get(url, timeout=timeout)
        # Raises requests.HTTPError for 4xx/5xx status codes
        response.raise_for_status()
        # Decode inside the try block so a malformed body is reported the same
        # way as every other failure instead of raising to the caller
        data = response.json()
    except requests.HTTPError as http_err:
        return f"HTTP error occurred: {http_err}"
    except Exception as err:
        return f"An error occurred: {err}"

    file_path = 'documented_artists.json'

    # Writing the JSON data to a file
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=4)

    return file_path

In [8]:
# Download the artist batch that starts at index 40000. On success the call
# returns the path of the JSON file it wrote; on failure it returns an error
# message string instead, so check the displayed value before moving on.
getArtistsDocumentedStartingFrom(40000)

'documented_artists.json'

In [9]:
def clean_artist_data(json_file):
    """
    Strip every artist, album and song record in `json_file` down to a fixed
    whitelist of attributes and write the result to 'cleaned_artists.json'.

    Parameters:
        json_file: path of a JSON file holding a list of artist records; each
            artist may carry an 'albums' list, and each album a 'songs' list.

    Returns:
        The path of the written file ('cleaned_artists.json').
    """
    # Whitelisted attributes at each nesting level
    artist_fields = {
        '_id', 'name', 'labels', 'genres', 'locationInfo', 'deezerFans', 'location', 'type', 'lifeSpan', 'albums'
    }
    album_fields = {
        '_id', 'name', 'title', 'dateRelease', 'deezerFans', 'language', 'country', 'songs'
    }
    song_fields = {
        '_id', 'title', 'rank', 'position', 'language_detect', 'language', 'publicationDate'
    }

    def _keep_only(record, allowed):
        # Copy of `record` restricted to the keys found in `allowed`
        return {field: record[field] for field in record if field in allowed}

    with open(json_file, 'r') as source:
        raw_artists = json.load(source)

    slim_artists = []
    for raw_artist in raw_artists:
        slim_artist = _keep_only(raw_artist, artist_fields)

        if 'albums' in slim_artist:
            slim_albums = []
            for raw_album in slim_artist['albums']:
                slim_album = _keep_only(raw_album, album_fields)
                if 'songs' in slim_album:
                    slim_album['songs'] = [
                        _keep_only(raw_song, song_fields)
                        for raw_song in slim_album['songs']
                    ]
                slim_albums.append(slim_album)
            slim_artist['albums'] = slim_albums

        slim_artists.append(slim_artist)

    # Persist the filtered records
    cleaned_file_path = 'cleaned_artists.json'
    with open(cleaned_file_path, 'w') as target:
        json.dump(slim_artists, target, indent=4)

    return cleaned_file_path

In [10]:
# Filter the downloaded artist file down to the whitelisted attributes; the
# call returns the path of the cleaned file it writes.
clean_artist_data('documented_artists.json')

'cleaned_artists.json'

In [11]:
# pycountry_convert is imported below for the country -> continent lookup.
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install pycountry_convert




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [12]:
import json
from datetime import datetime
import pycountry_convert as pc

def calculate_life_span_in_years(begin, end, ended):
    """
    Compute the number of years between an artist's begin and end dates.

    Parameters:
        begin: date string whose first '-'-separated component is the year
            (e.g. '1990' or '1990-05-01'). Empty/None means "unknown" and is
            replaced by the current year.
        end: date string in the same format. It is only used when `ended` is
            truthy and `end` is non-empty; otherwise the current year is used.
        ended: truthy when the life span is over.

    Returns:
        end_year - begin_year as an int, or None when either year is the
        '????' placeholder or cannot be parsed as an integer (e.g. '19??').
    """
    current_year = datetime.now().year

    # Normalise missing values so None does not crash the string handling
    begin_year_text = str(begin).split('-')[0] if begin else ''
    end_year_text = str(end).split('-')[0] if end else ''

    # '????' marks a year that is explicitly unknown in the data
    if begin_year_text == '????' or end_year_text == '????':
        return None

    try:
        begin_year = int(begin_year_text) if begin else current_year
        end_year = int(end_year_text) if ended and end else current_year
    except ValueError:
        # Partially unknown or otherwise malformed year
        return None

    return end_year - begin_year

def calculate_average_rank(albums):
    """
    Average the 'rank' of every song across the given albums.

    Only strictly positive numeric ranks count. Songs whose rank is missing,
    0, None, or not a number (e.g. an empty string) are ignored rather than
    raising a TypeError on comparison.

    Parameters:
        albums: iterable of album dicts, each optionally holding a 'songs'
            list of song dicts. None is treated as "no albums".

    Returns:
        The mean of the counted ranks, or 0 when no song has a usable rank.
    """
    total_rank = 0
    count = 0
    for album in albums or []:
        # 'songs' may be absent or explicitly null
        for song in album.get('songs') or []:
            rank = song.get('rank', 0)
            # bool is a subclass of int; exclude it so True is not read as rank 1
            if isinstance(rank, bool) or not isinstance(rank, (int, float)):
                continue
            if rank > 0:
                total_rank += rank
                count += 1
    return total_rank / count if count > 0 else 0

def country_to_continent(country_name):
    """
    Translate a country name into its continent name using pycountry_convert.

    Parameters:
        country_name: country name to look up.

    Returns:
        The continent name, or None when any step of the lookup fails.
    """
    try:
        country_alpha2 = pc.country_name_to_country_alpha2(country_name)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        return pc.convert_continent_code_to_continent_name(country_continent_code)
    except Exception:
        # Best-effort lookup: any lookup failure maps to None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt and SystemExit
        # propagate, so the kernel can still be interrupted.
        return None

def add_additional_attributes(json_file):
    """
    Enrich every artist record in `json_file` with derived attributes and
    write the result to 'bubble_plot_data.json'.

    Attributes added to each artist:
        lifeSpanInYears: result of calculate_life_span_in_years, or None when
            the artist has no 'lifeSpan' data.
        numberOfAlbums:  number of entries in 'albums'.
        numberOfSongs:   total number of songs over all albums.
        artistRank:      result of calculate_average_rank over the albums.
        isActive:        True unless lifeSpan['ended'] is truthy.
        artistContinent: continent of the first 'locationInfo' entry, falling
            back to location['country']; None when no country is available
            or the lookup fails.

    Parameters:
        json_file: path of a JSON file holding a list of artist records.

    Returns:
        The path of the written file ('bubble_plot_data.json').
    """
    with open(json_file, 'r') as file:
        artists_data = json.load(file)

    for artist in artists_data:
        # Missing or null fields are treated as empty so one incomplete record
        # does not abort the whole run
        life_span = artist.get('lifeSpan') or {}
        albums = artist.get('albums') or []
        ended = life_span.get('ended', False)

        if life_span:
            artist['lifeSpanInYears'] = calculate_life_span_in_years(
                life_span.get('begin', ''),
                life_span.get('end', ''),
                ended
            )
        else:
            artist['lifeSpanInYears'] = None

        artist['numberOfAlbums'] = len(albums)
        artist['numberOfSongs'] = sum(len(album.get('songs') or []) for album in albums)
        artist['artistRank'] = calculate_average_rank(albums)
        # An artist whose life span has ended is no longer active
        artist['isActive'] = not ended

        # Get the country from locationInfo or location
        location = artist.get('location')
        fallback_country = location.get('country') if isinstance(location, dict) else None
        country = (artist.get('locationInfo') or [None])[0] or fallback_country
        artist['artistContinent'] = country_to_continent(country) if country else None

    output_file = 'bubble_plot_data.json'
    with open(output_file, 'w') as file:
        json.dump(artists_data, file, indent=4)

    return output_file

In [13]:
# Add the derived attributes to the cleaned artist file; the call returns the
# path of the enriched file it writes.
add_additional_attributes('cleaned_artists.json')

'bubble_plot_data.json'