## Spotify and YouTube dataset population

This notebook outlines the steps to create an RDF dataset based on the SoundGraph ontology, from the data import to RDF triple export in Turtle format.

In [1]:
# required library
import pandas as pd
import ast

In [2]:
# input CSV files, all located in the same data folder
data_dir = '../data/'
dataset_path = data_dir + 'Spotify_Youtube.csv'
spotify_artist_path = data_dir + 'Artists.csv'
spotify_artist_info_path = data_dir + 'Artist_info.csv'
spotify_album_path = data_dir + 'Album_info.csv'
wikidata_artists_path = data_dir + 'wikidata_artists.csv'
wikidata_award_statements_path = data_dir + 'wikidata_award_statements.csv'
wikidata_awards_path = data_dir + 'wikidata_awards.csv'
youtube_api_channels_path = data_dir + 'youtubeapi_channels_complete.csv'

# target folder where the serialized RDF files are saved
targetFolder = '../rdf/'

In [3]:
# Load the songs dataset.
# URLs/URIs are kept as read from the CSV; the id part is extracted
# later, in the cells that build the RDF nodes.
dataset = pd.read_csv(dataset_path)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness          20716 non-null  float64
 15  Valence           20716 non-null  float64
 16  Tempo             20716 non-null  float6

In [5]:
# Load the list of artists present in the dataset
spotify_artist = pd.read_csv(spotify_artist_path)
spotify_artist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2079 non-null   int64 
 1   Artist      2079 non-null   object
dtypes: int64(1), object(1)
memory usage: 32.6+ KB


In [6]:
# Load the Spotify artist information (followers, genres, popularity)
spotify_artist_info = pd.read_csv(spotify_artist_info_path)
spotify_artist_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Artist      2079 non-null   object
 1   Followers   2079 non-null   int64 
 2   Genres      2079 non-null   object
 3   Popularity  2079 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 65.1+ KB


In [7]:
# Load the Spotify album information
spotify_album = pd.read_csv(spotify_album_path)
spotify_album.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                20718 non-null  object
 1   Album             20641 non-null  object
 2   Total_tracks      20718 non-null  int64 
 3   Release_date      20718 non-null  object
 4   Available_market  20718 non-null  object
dtypes: int64(1), object(4)
memory usage: 809.4+ KB


In [8]:
# Load the artist information collected from Wikidata
wikidata_artists = pd.read_csv(wikidata_artists_path)
wikidata_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Artist         2079 non-null   object
 1   Url_spotify    2079 non-null   object
 2   artistLabel    2079 non-null   object
 3   websiteLabel   2079 non-null   object
 4   start          2079 non-null   object
 5   end            2079 non-null   object
 6   dissolved      2079 non-null   object
 7   country_codes  2079 non-null   object
dtypes: object(8)
memory usage: 130.1+ KB


In [9]:
# Load the award statements collected from Wikidata (artist, award, year)
wikidata_award_statements = pd.read_csv(wikidata_award_statements_path)
wikidata_award_statements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1002 entries, 0 to 1001
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   artist_spotify_id  1002 non-null   object
 1   award_id           1002 non-null   int64 
 2   award_year         1002 non-null   object
dtypes: int64(1), object(2)
memory usage: 23.6+ KB


In [10]:
# Load the award descriptions collected from Wikidata
wikidata_awards = pd.read_csv(wikidata_awards_path)
wikidata_awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   award_id        193 non-null    int64 
 1   award_name      193 non-null    object
 2   award_type      193 non-null    object
 3   award_category  118 non-null    object
dtypes: int64(1), object(3)
memory usage: 6.2+ KB


In [11]:
# Load the YouTube channel information
youtube_api_channels = pd.read_csv(youtube_api_channels_path)
youtube_api_channels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6715 entries, 0 to 6714
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   channelId           6715 non-null   object
 1   title               6715 non-null   object
 2   channelDescription  4241 non-null   object
 3   viewCount           6715 non-null   object
 4   subscriberCount     6715 non-null   object
 5   videoCount          6715 non-null   object
 6   error               6715 non-null   int64 
 7   originalChannel     6715 non-null   object
dtypes: int64(1), object(7)
memory usage: 419.8+ KB


### DataFrames for object properties

In [12]:
# composes - isComposedBy
# The artist id is the last path segment of the Spotify URL,
# the album id is the last ':'-separated part of the album URI.
composes_df = pd.DataFrame({
    'artist_id': dataset['Url_spotify'].str.split('/').str[-1],
    'album_id': spotify_album['Id'].str.split(':').str[-1],
})
composes_df

Unnamed: 0,artist_id,album_id
0,3AA28KZvwAUcZuOKwyblJQ,0bUTHlWbkSQysoM3VsWldT
1,3AA28KZvwAUcZuOKwyblJQ,2dIGnmEIy1WZIcZCFSj6i8
2,3AA28KZvwAUcZuOKwyblJQ,4V9YFKLqZ5h8nQFTvDQscC
3,3AA28KZvwAUcZuOKwyblJQ,2dIGnmEIy1WZIcZCFSj6i8
4,3AA28KZvwAUcZuOKwyblJQ,0YvYmLBFFwYxgI4U9KKgUm
...,...,...
20713,3EYY5FwDkHEYLw5V86SAtl,7o5cIYGdDmDqa9gGNsU60e
20714,3EYY5FwDkHEYLw5V86SAtl,2JR4Wct66k7JOEh6y5yy0L
20715,3EYY5FwDkHEYLw5V86SAtl,7v9knMsQE5CkN2HE4yhIQu
20716,3EYY5FwDkHEYLw5V86SAtl,3xynK8Rwi02G6VKcb15rFJ


In [13]:
# writes - isWrittenBy
# The track id is the last ':'-separated part of the Spotify track URI.
track_ids = dataset['Uri'].str.split(':').str[-1]
writes_df = pd.DataFrame({
    'artist_id': composes_df['artist_id'],
    'track_id': track_ids,
})
writes_df

Unnamed: 0,artist_id,track_id
0,3AA28KZvwAUcZuOKwyblJQ,0d28khcov6AiegSCpG5TuT
1,3AA28KZvwAUcZuOKwyblJQ,1foMv2HQwfQ2vntFf9HFeG
2,3AA28KZvwAUcZuOKwyblJQ,64dLd6rVqDLtkXFYrEUHIU
3,3AA28KZvwAUcZuOKwyblJQ,0q6LuUqGLUiCPP1cbdwFs3
4,3AA28KZvwAUcZuOKwyblJQ,7yMiX7n9SBvadzox8T5jzT
...,...,...
20713,3EYY5FwDkHEYLw5V86SAtl,0RtcKQGyI4hr8FgFH1TuYG
20714,3EYY5FwDkHEYLw5V86SAtl,3rHvPA8lUnPBkaLyPOc0VV
20715,3EYY5FwDkHEYLw5V86SAtl,4jk00YxPtPbhvHJE9N4ddv
20716,3EYY5FwDkHEYLw5V86SAtl,5EyErbpsugWliX006eTDex


In [14]:
# containsTrack - isInAlbum
# Pair the album id and the track id found on the same row index.
containsTrack_df = pd.concat(
    [composes_df['album_id'], writes_df['track_id']],
    axis=1,
)
containsTrack_df

Unnamed: 0,album_id,track_id
0,0bUTHlWbkSQysoM3VsWldT,0d28khcov6AiegSCpG5TuT
1,2dIGnmEIy1WZIcZCFSj6i8,1foMv2HQwfQ2vntFf9HFeG
2,4V9YFKLqZ5h8nQFTvDQscC,64dLd6rVqDLtkXFYrEUHIU
3,2dIGnmEIy1WZIcZCFSj6i8,0q6LuUqGLUiCPP1cbdwFs3
4,0YvYmLBFFwYxgI4U9KKgUm,7yMiX7n9SBvadzox8T5jzT
...,...,...
20713,7o5cIYGdDmDqa9gGNsU60e,0RtcKQGyI4hr8FgFH1TuYG
20714,2JR4Wct66k7JOEh6y5yy0L,3rHvPA8lUnPBkaLyPOc0VV
20715,7v9knMsQE5CkN2HE4yhIQu,4jk00YxPtPbhvHJE9N4ddv
20716,3xynK8Rwi02G6VKcb15rFJ,5EyErbpsugWliX006eTDex


### RDFLib import

To use RDFLib, the following installation is required:

<code>pip3 install rdflib</code>

In [15]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [21]:
# Namespaces not known by RDFLib: the countries vocabulary and the SoundGraph ontology
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
SG = Namespace("https://www.dei.unipd.it/db2/ontology/soundgraph#")

# Working graph; each namespace is bound to a prefix for more readable output
g = Graph()
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("sg", SG)):
    g.bind(prefix, namespace)

In [22]:
def write_and_empty_graph(graph, filename):
    """Serialize ``graph`` to a Turtle file and return a new, empty graph.

    The file is written to ``targetFolder + filename``. The returned graph has
    the same prefix bindings (foaf, xsd, countries, sg) as the working graph,
    so callers can continue with ``g = write_and_empty_graph(g, ...)``.

    Parameters
    ----------
    graph : rdflib.Graph
        Graph whose triples are written out.
    filename : str
        Name of the output file inside ``targetFolder``.

    Returns
    -------
    rdflib.Graph
        A fresh graph with the namespace prefixes already bound.
    """
    # Turtle documents are UTF-8. Without an explicit encoding, open() falls
    # back to the platform default, which can fail on (or corrupt) non-ASCII
    # names and descriptions.
    with open(targetFolder + filename, 'w', encoding='utf-8') as file:
        file.write(graph.serialize(format='turtle'))

    graph = Graph()
    graph.bind("foaf", FOAF)
    graph.bind("xsd", XSD)
    graph.bind("countries", CNS)
    graph.bind("sg", SG)
    return graph

#### Spotify Artist  and  hasNationality

In [23]:
%%time 
#measure execution time

#iterate over the movies dataframe
for index, row in wikidata_artists.iterrows():
    row2 = spotify_artist_info.iloc[index]
    
    # create node
    artist_uri = "artist_" + row['Url_spotify']
    Artist = URIRef(SG[artist_uri])
    g.add((Artist, RDF.type, SG.SpotifyArtist))
    # data properties
    g.add((Artist, SG['artistFollowersNum'], Literal(row2['Followers'], datatype=XSD.integer)))
    g.add((Artist, SG['artistName'], Literal(row['Artist'], datatype=XSD.string)))
    g.add((Artist, SG['artistPopularity'], Literal(row2['Popularity'], datatype=XSD.integer)))
    if row['websiteLabel'] != '_':
        g.add((Artist, SG['artistWebsite'], Literal(row['websiteLabel'], datatype=XSD.string)))
    if row['start'] != '_':
        g.add((Artist, SG['startWorkingPeriod'], Literal(row['start'], datatype=XSD.gYear)))
    if row['end'] != '_':
        g.add((Artist, SG['endWorkingPeriod'], Literal(row['end'], datatype=XSD.gYear)))
    if row['dissolved'] != '_':
        g.add((Artist, SG['dissolvedIn'], Literal(row['dissolved'], datatype=XSD.gYear)))
    # obj prop - hasNationality
    if row['country_codes'] != '_':
        cc_list = row['country_codes'].split('+')
        for cc in cc_list:
            Country = URIRef(CNS[cc.lower()])
            g.add((Artist, SG['hasNationality'], Country))
    

CPU times: user 1.97 s, sys: 17.5 ms, total: 1.98 s
Wall time: 1.98 s


In [24]:
%%time
g = write_and_empty_graph(g, 'spotify_artist.ttl')

CPU times: user 315 ms, sys: 0 ns, total: 315 ms
Wall time: 330 ms


#### Spotify Track

In [25]:
%%time
dataset['Stream'] = dataset['Stream'].fillna('_')
for index, row in dataset.iterrows():
    
    # Create the node to add to the Graph
    track_uri = "track_" + row['Uri'].split(':')[-1]
    Track = URIRef(SG[track_uri])
    # Add triples using store's add() method.
    g.add((Track, RDF.type, SG.SpotifyTrack))
    g.add((Track, SG['trackName'], Literal(row['Track'], datatype=XSD.string)))
    if not pd.isna(row['Duration_ms']):
        g.add((Track, SG['trackAcousticness'], Literal(row['Acousticness'], datatype=XSD.float)))
        g.add((Track, SG['trackDanceability'], Literal(row['Danceability'], datatype=XSD.float)))
        g.add((Track, SG['trackDuration'], Literal(row['Duration_ms'], datatype=XSD.integer)))
        g.add((Track, SG['trackEnergy'], Literal(row['Energy'], datatype=XSD.float)))
        g.add((Track, SG['trackInstrumentalness'], Literal(row['Instrumentalness'], datatype=XSD.float)))    
        g.add((Track, SG['trackKey'], Literal(row['Key'], datatype=XSD.integer)))
        g.add((Track, SG['trackLiveness'], Literal(row['Liveness'], datatype=XSD.float)))
        g.add((Track, SG['trackLoudness'], Literal(row['Loudness'], datatype=XSD.float)))
        g.add((Track, SG['trackSpeechiness'], Literal(row['Speechiness'], datatype=XSD.float)))
        g.add((Track, SG['trackTempo'], Literal(row['Tempo'], datatype=XSD.float)))
        g.add((Track, SG['trackValence'], Literal(row['Valence'], datatype=XSD.float)))
    if row['Stream'] != '_':
        g.add((Track, SG['trackStreams'], Literal(row['Stream'], datatype=XSD.integer)))   
    

CPU times: user 9.12 s, sys: 102 ms, total: 9.22 s
Wall time: 9.22 s


#### Genre

In [26]:
%%time
genres_set = set()
spotify_artist_info['Genres'] = spotify_artist_info['Genres'].apply(lambda x: x.replace('[', '')
                                                                            .replace(']', '')
                                                                            .replace('\'','')
                                                                            .replace('"',''))
for index, row in spotify_artist_info.iterrows():
    for genre in row['Genres'].split(', '):
        # some songs have no associated genres and the replace series results in an empty string
        if genre != '':
            genres_set.add(genre.replace(' ', '_').replace('-', '_'))

for genre in genres_set:
    genre_uri = "genre_" + genre
    Genre = URIRef(SG[genre_uri])
    g.add((Genre, RDF.type, SG.Genre))

CPU times: user 99.6 ms, sys: 0 ns, total: 99.6 ms
Wall time: 98.6 ms


#### Spotify Album

In [27]:
%%time
# add album type to CSV and remove duplicates in another var
spotify_album['album_type'] = dataset['Album_type']
spotify_album_2 = spotify_album.drop_duplicates().reset_index(drop=True)
spotify_album_2['Album'] = spotify_album_2['Album'].fillna('_')

for index, row in spotify_album_2.iterrows():
    album_uri = 'album_' + row['Id'].split(':')[-1]
    if row['Album'] != '_':
        Album = URIRef(SG[album_uri])
        g.add((Album, RDF.type, SG.SpotifyAlbum))
        g.add((Album, SG['albumName'], Literal(row['Album'], datatype=XSD.string)))
        g.add((Album, SG['albumReleaseDate'], Literal(row['Release_date'], datatype=XSD.date)))
        g.add((Album, SG['albumTotalTracksNum'], Literal(row['Total_tracks'], datatype=XSD.integer)))
        g.add((Album, SG['albumType'], Literal(row['album_type'], datatype=XSD.string))) # TODO check enum datatype

CPU times: user 2.75 s, sys: 28.5 ms, total: 2.77 s
Wall time: 2.77 s


In [28]:
%%time
g = write_and_empty_graph(g, 'nodes_spotify.ttl')

CPU times: user 8.16 s, sys: 26.6 ms, total: 8.19 s
Wall time: 8.21 s


#### YouTube Channel

In [29]:
%%time
youtube_api_channels['channelDescription'] = youtube_api_channels['channelDescription'].fillna('_')
for index, row in youtube_api_channels.iterrows():
    if row['channelId'] != '_':
        channel_id = 'channel_' + row['channelId']
        Channel = URIRef(SG[channel_id])
        g.add((Channel, RDF.type, SG.YouTubeChannel))
        g.add((Channel, SG['channelName'], Literal(row['originalChannel'], datatype=XSD.string)))
        if row['channelDescription'] != '_':
            g.add((Channel, SG['channelDescription'], Literal(row['channelDescription'], datatype=XSD.string)))
        g.add((Channel, SG['channelViewCount'], Literal(row['viewCount'], datatype=XSD.integer)))
        g.add((Channel, SG['channelSubscribersCount'], Literal(row['subscriberCount'], datatype=XSD.integer)))
        g.add((Channel, SG['channelVideoCount'], Literal(row['videoCount'], datatype=XSD.integer)))

CPU times: user 1.14 s, sys: 20 ms, total: 1.16 s
Wall time: 1.16 s


In [30]:
%%time
g = write_and_empty_graph(g, 'nodes_yt_channels.ttl')

CPU times: user 1.38 s, sys: 0 ns, total: 1.38 s
Wall time: 1.39 s


#### YouTube Video

In [31]:
%%time
dataset['Url_youtube'] = dataset['Url_youtube'].fillna('_')
dataset['Description'] = dataset['Description'].fillna('_')
dataset['Comments'] = dataset['Comments'].fillna(-1)
dataset['Likes'] = dataset['Likes'].fillna(-1)

for index, row in dataset.iterrows():
    if row['Url_youtube'] != '_':
        video_id = 'video_' + row['Url_youtube'].split('?v=')[-1]
        Video = URIRef(SG[video_id])
        g.add((Video, RDF.type, SG.YouTubeVideo))
        g.add((Video, SG['videoTitle'], Literal(row['Title'], datatype=XSD.string)))
        if row['Description'] != '_':
            g.add((Video, SG['videoDescription'], Literal(row['Description'], datatype=XSD.string)))
        if row['Comments'] != -1:
            g.add((Video, SG['videoComments'], Literal(row['Comments'], datatype=XSD.integer)))
        if row['Likes'] != -1:
            g.add((Video, SG['videoLikes'], Literal(row['Likes'], datatype=XSD.integer)))
        g.add((Video, SG['videoViews'], Literal(row['Views'], datatype=XSD.integer)))
        g.add((Video, SG['isOfficialVideo'], Literal(row['official_video'], datatype=XSD.boolean)))
        g.add((Video, SG['isLicensed'], Literal(row['Licensed'], datatype=XSD.boolean)))

CPU times: user 7.05 s, sys: 78.3 ms, total: 7.13 s
Wall time: 7.13 s


In [32]:
%%time
g = write_and_empty_graph(g, 'nodes_yt_video.ttl')

CPU times: user 3.4 s, sys: 54.8 ms, total: 3.46 s
Wall time: 3.48 s


### Object properties
#### hasNationality

In [33]:
# Already done in the "Spotify Artist and hasNationality" section above

#### containsTrack

In [34]:
%%time

# add album type to CSV and remove duplicates in another var
dataset['Album_Id'] = spotify_album['Id']

for index, row in dataset.iterrows():
    album_uri = 'album_' + row['Album_Id'].split(':')[-1]
    Album = URIRef(SG[album_uri])

    track_uri = "track_" + row['Uri'].split(':')[-1]
    Track = URIRef(SG[track_uri])
        
    g.add((Album, SG['containsTrack'], Track))


CPU times: user 1.66 s, sys: 29.7 ms, total: 1.69 s
Wall time: 1.69 s


#### isRelatedTo

In [35]:
%%time

for index, row in dataset.iterrows():
    if row['Url_youtube'] != '_' and row['Uri'] != '_':
        video_id = 'video_' + row['Url_youtube'].split('?v=')[-1]
        Video = URIRef(SG[video_id])

        track_uri = "track_" + row['Uri'].split(':')[-1]
        Track = URIRef(SG[track_uri])
        
        g.add((Video, SG['isRelatedTo'], Track))
        
        

CPU times: user 1.8 s, sys: 0 ns, total: 1.8 s
Wall time: 1.8 s


#### isPublishedBy

In [None]:
%%time

youtube_dataset = dataset[['Url_youtube', 'Channel']].copy()

 #Rimuovi i duplicati
youtube_dataset.drop_duplicates().reset_index(drop=True)
youtube_dataset['Url_youtube'] = youtube_dataset['Url_youtube'].fillna('')
youtube_dataset['Channel'] = youtube_dataset['Channel'].fillna('')

for index, row in youtube_dataset.iterrows():
    if row['Url_youtube'] != '':

        video_id = 'video' + row['Url_youtube'].split('?v=')[-1]
        Video = URIRef(SG[video_id])

        for idx, channel in youtube_api_channels.iterrows():
            if channel['title'] == row['Channel']:
                channel_id = 'channel' + channel['channelId']
                Channel = URIRef(SG[channel_id])

                g.add((Video, SG['isPublishedBy'], Channel))
                break

#### isAvailableIn

In [36]:
%%time
# add album type to CSV and remove duplicates in another var
spotify_album_2 = spotify_album.drop_duplicates().reset_index(drop=True)
spotify_album_2['Album'] = spotify_album_2['Album'].fillna('_')

for index, row in spotify_album_2.iterrows():
    album_uri = 'album_' + row['Id'].split(':')[-1]
    if row['Album'] != '_':
        Album = URIRef(SG[album_uri])
        
        if isinstance(row['Available_market'], list) and len(row['Available_market']) > 0:
            for market in row['Available_market']:
                if market != '_' and len(market) == 2:
                    Country = URIRef(CNS[market.lower()])
                    g.add((Album, SG['isAvailableIn'], Country))

CPU times: user 825 ms, sys: 0 ns, total: 825 ms
Wall time: 824 ms


#### isOfficialChannel 

In [272]:
#TODO 

#### isWrittenBy  and  performsIn

In [38]:
%%time

dataset['Url_youtube'] = dataset['Url_youtube'].fillna('_')
        
for index, row in dataset.iterrows():

    artist_uri = "artist_" + row['Url_spotify'].split('/')[-1]
    Artist = URIRef(SG[artist_uri])
    
    track_uri = "track_" + row['Uri'].split(':')[-1]
    Track = URIRef(SG[track_uri])
    
    #   isWrittenBy
    g.add((Track, SG['isWrittenBy'], Artist))

    #   performIn
    if row['Url_youtube'] != '_':
        video_id = 'video_' + row['Url_youtube'].split('?v=')[-1]
        Video = URIRef(SG[video_id])
        
        g.add((Artist, SG['performsIn'], Video))
    

CPU times: user 1.77 s, sys: 19.5 ms, total: 1.79 s
Wall time: 1.78 s


#### hasGenre

In [52]:
%%time

spotify_artist_info['artist_id'] = wikidata_artists['Url_spotify']
for index, row in spotify_artist_info.iterrows():
    artist_uri = "artist_" + row['artist_id']
    Artist = URIRef(SG[artist_uri])
    
    for genre in row['Genres'].split(', '):
        if genre != '':
            genre_uri = "genre_" + genre.replace(' ', '_').replace('-', '_')
            Genre = URIRef(SG[genre_uri])
            
            g.add((Artist, SG['hasGenre'], Genre))

CPU times: user 160 ms, sys: 0 ns, total: 160 ms
Wall time: 159 ms


#### isComposedBy

In [49]:
%%time

# add album type to CSV and remove duplicates in another var
spotify_album['artist_id'] = dataset['Url_spotify']
spotify_album_2 = spotify_album.drop_duplicates().reset_index(drop=True)
spotify_album_2['Album'] = spotify_album_2['Album'].fillna('_')

for index, row in spotify_album_2.iterrows():
    album_uri = 'album_' + row['Id'].split(':')[-1]
    Album = URIRef(SG[album_uri])
    
    artist_uri = "artist_" + row['artist_id'].split(':')[-1]
    Artist = URIRef(SG[artist_uri])
    
    g.add((Album, SG['isComposedBy'], Artist))

CPU times: user 866 ms, sys: 0 ns, total: 866 ms
Wall time: 865 ms
