## Spotify and YouTube dataset population

This notebook outlines the steps to create an RDF dataset based on the SoundGraph ontology, from the data import to RDF triple export in Turtle format.

In [23]:
# required library
import pandas as pd

In [24]:
# Input CSV files (paths are relative to this notebook)
dataset_path = '../data/Spotify_Youtube.csv'
spotify_artist_path = '../data/Artists.csv'
spotify_artist_info_path = '../data/Artist_info.csv'
spotify_album_path = '../data/Album_info.csv'
wikidata_artists_path = '../data/wikidata_artists.csv'
wikidata_award_statements_path = '../data/wikidata_award_statements.csv'
wikidata_awards_path = '../data/wikidata_awards.csv'
youtube_api_channels_path = '../data/youtubeapi_channels_complete.csv'

# Output folder where the generated RDF is saved
targetFolder = '../rdf/'

In [25]:
# Read the songs table (comma is read_csv's default separator)
# and print a summary of its columns.
dataset = pd.read_csv(dataset_path)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness          20716 non-null  float64
 15  Valence           20716 non-null  float64
 16  Tempo             20716 non-null  float6

In [26]:
# Read the list of artists (comma is read_csv's default separator)
# and print a summary of its columns.
spotify_artist = pd.read_csv(spotify_artist_path)
spotify_artist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2079 non-null   int64 
 1   Artist      2079 non-null   object
dtypes: int64(1), object(1)
memory usage: 32.6+ KB


In [27]:
# Read the per-artist Spotify information (comma is read_csv's default separator)
# and print a summary of its columns.
spotify_artist_info = pd.read_csv(spotify_artist_info_path)
spotify_artist_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Artist      2079 non-null   object
 1   Followers   2079 non-null   int64 
 2   Genres      2079 non-null   object
 3   Popularity  2079 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 65.1+ KB


In [28]:
# Read the Spotify album information (comma is read_csv's default separator)
# and print a summary of its columns.
spotify_album = pd.read_csv(spotify_album_path)
spotify_album.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                20718 non-null  object
 1   Album             20641 non-null  object
 2   Total_tracks      20718 non-null  int64 
 3   Release_date      20718 non-null  object
 4   Available_market  20718 non-null  object
dtypes: int64(1), object(4)
memory usage: 809.4+ KB


In [29]:
# Read the Wikidata artist information (comma is read_csv's default separator)
# and print a summary of its columns.
wikidata_artists = pd.read_csv(wikidata_artists_path)
wikidata_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2079 entries, 0 to 2078
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Artist         2079 non-null   object
 1   Url_spotify    2079 non-null   object
 2   artistLabel    2079 non-null   object
 3   websiteLabel   2079 non-null   object
 4   start          2079 non-null   object
 5   end            2079 non-null   object
 6   dissolved      2079 non-null   object
 7   country_codes  2079 non-null   object
dtypes: object(8)
memory usage: 130.1+ KB


In [30]:
# Read the Wikidata award statements (comma is read_csv's default separator)
# and print a summary of its columns.
wikidata_award_statements = pd.read_csv(wikidata_award_statements_path)
wikidata_award_statements.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1035 entries, 0 to 1034
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   artist_spotify_id  1035 non-null   object
 1   award_id           1035 non-null   int64 
 2   award_year         1035 non-null   object
dtypes: int64(1), object(2)
memory usage: 24.4+ KB


In [31]:
# Read the Wikidata awards table (comma is read_csv's default separator)
# and print a summary of its columns.
wikidata_awards = pd.read_csv(wikidata_awards_path)
wikidata_awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   award_id    205 non-null    int64 
 1   award_name  205 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.3+ KB


In [32]:
# Read the YouTube channel information (comma is read_csv's default separator)
# and print a summary of its columns.
youtube_api_channels = pd.read_csv(youtube_api_channels_path)
youtube_api_channels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6715 entries, 0 to 6714
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   channelId           6715 non-null   object
 1   title               6715 non-null   object
 2   channelDescription  4241 non-null   object
 3   viewCount           6715 non-null   object
 4   subscriberCount     6715 non-null   object
 5   videoCount          6715 non-null   object
 6   error               6715 non-null   int64 
 7   originalChannel     3358 non-null   object
dtypes: int64(1), object(7)
memory usage: 419.8+ KB


### RDFLib import

To use RDFLib, the following installation is required:

<code>pip3 install rdflib</code>

In [33]:
# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace
# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [34]:
# Construct the countries and the SoundGraph ontology namespaces not known by RDFLib.
# A Namespace simply concatenates the base IRI and the local name, so the base
# must end with a separator: without it SG['artistName'] would expand to
# '.../soundgraphartistName' instead of '.../soundgraph#artistName'.
# NOTE(review): '#' is assumed here (same style as the countries namespace);
# confirm it matches the separator used in the SoundGraph ontology file.
CNS = Namespace("https://eulersharp.sourceforge.net/2003/03swap/countries#")
SG = Namespace("https://www.dei.unipd.it/db2/ontology/soundgraph#")

# Create the graph that collects every triple generated in the cells below
g = Graph()

# Bind the namespaces to a prefix for more readable output
g.bind("foaf", FOAF)
g.bind("xsd", XSD)
g.bind("countries", CNS)
g.bind("sg", SG)



In [41]:
# Quick check: Followers value of the row at position 3
print(spotify_artist_info['Followers'].iloc[3])

26352973


##### Spotify Artist

In [None]:
%%time 
#measure execution time

#iterate over the movies dataframe
for index, row in wikidata_artists.iterrows():
    row2= spotify_artist_info.iloc[index]
    
    # Create the node to add to the Graph
    # the node has the namespace + the movie id as URI
    artist_uri="artist_"+row['Url_spotify']
    Artist= URIRef(SG[artist_uri])
    # Add triples using store's add() method.
    g.add((Artist, RDF.type, SG.SpotifyArtist))
    g.add((Artist, SG['artistFollowersNum'], Literal(row2['Followers'], datatype=XSD.integer)))
    g.add((Artist, SG['artistName'],Literal(row['Artist'], datatype=XSD.string)))
    g.add((Artist, SG['artistPopularity'],Literal(row2['Popularity'], datatype=XSD.integer)))
    if row['websiteLabel'] != '_':
        g.add((Artist, SG['artistWebsite'],Literal(row['websiteLabel'], datatype=XSD.string)))
    if row['start'] != '_':
        g.add((Artist, SG['startWorkingPeriod'],Literal(row['start'], datatype=XSD.gYear)))
    if row['end'] != '_':
        g.add((Artist, SG['endWorkingPeriod'],Literal(row['end'], datatype=XSD.gYear)))
    if row['dissolved'] != '_':
        g.add((Artist, SG['dissolvedIn'],Literal(row['dissolved'], datatype=XSD.gYear)))
    
    

In [42]:
# Print the songs dataframe summary again (column names, non-null counts, dtypes)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness          20716 non-null  float64
 15  Valence           20716 non-null  float64
 16  Tempo             20716 non-null  float6

##### Spotify Track

In [None]:
# Audio features stored as xsd:float: (CSV column, ontology property)
_TRACK_FLOAT_PROPERTIES = [
    ('Acousticness', 'trackAcousticness'),
    ('Danceability', 'trackDanceability'),
    ('Energy', 'trackEnergy'),
    ('Instrumentalness', 'trackInstrumentalness'),
    ('Liveness', 'trackLiveness'),
    ('Loudness', 'trackLoudness'),
    ('Speechiness', 'trackSpeechiness'),
    ('Tempo', 'trackTempo'),
    ('Valence', 'trackValence'),
]

# Integer-valued columns: (CSV column, ontology property, literal datatype).
# 'Key' is loaded as float64 (see dataset.info()), so without a cast the
# literal would read "6.0", which is not a valid xsd:integer lexical form.
# The dtypes of 'Duration_ms' and 'Stream' are not visible in the summary;
# int() is a no-op if they are already integers.
_TRACK_INTEGER_PROPERTIES = [
    ('Duration_ms', 'trackDuration', XSD.positiveInteger),
    ('Key', 'trackKey', XSD.integer),
    ('Stream', 'trackStreams', XSD.long),
]

# Iterate over the songs dataframe, one track node per row
for index, row in dataset.iterrows():

    # Create the node to add to the Graph:
    # the node URI is the namespace + 'track_' + the Spotify Uri value.
    track_uri = "track_" + row['Uri']
    Track = URIRef(SG[track_uri])

    # A track must not be typed as an artist (copy-paste from the artist cell).
    # NOTE(review): the class name 'Track' is assumed -- confirm the exact
    # class IRI in the SoundGraph ontology.
    g.add((Track, RDF.type, SG.Track))
    g.add((Track, SG['trackName'], Literal(row['Track'], datatype=XSD.string)))

    # The feature columns contain a few missing values (20716 non-null of
    # 20718 rows); skip them instead of emitting an invalid "nan" literal.
    for column, property_name in _TRACK_FLOAT_PROPERTIES:
        value = row[column]
        if pd.notna(value):
            g.add((Track, SG[property_name], Literal(value, datatype=XSD.float)))

    for column, property_name, datatype in _TRACK_INTEGER_PROPERTIES:
        value = row[column]
        if pd.notna(value):
            g.add((Track, SG[property_name], Literal(int(value), datatype=datatype)))
    


#### Genre

In [None]:

# The Genres column is cleaned by stripping brackets and single quotes, which
# leaves a plain ', '-separated string (presumably the CSV stores the string
# form of a Python list -- verify against the file).
# NOTE(review): items that were written with double quotes would keep them
# and end up inside the genre URI; check the CSV for such values.
spotify_artist_info['Genres'] = spotify_artist_info['Genres'].apply(
    lambda x: x.replace('[', '').replace(']', '').replace('\'', '')
)

# Collect the distinct genre names, using '_' instead of spaces so they can
# be used as the local part of a URI.
genres_set = set()
for genres in spotify_artist_info['Genres']:
    for genre in genres.split(', '):
        # ''.split(', ') returns [''], so an artist without genres would
        # otherwise create a bare 'genre_' node.
        if genre:
            genres_set.add(genre.replace(' ', '_'))

# One Genre node per distinct genre
for genre in genres_set:
    genre_uri = "genre_" + genre
    Genre = URIRef(SG[genre_uri])
    # Graph.add() takes a single (subject, predicate, object) tuple;
    # passing the three terms as separate arguments raises TypeError.
    g.add((Genre, RDF.type, SG.Genre))
