## Spotify and YouTube dataset population

This notebook outlines the steps to create an RDF dataset based on the SoundGraph ontology, from the data import to RDF triple export in Turtle format.

In [1]:
# required libraries
from pathlib import Path
from rdflib import Graph, Literal, RDF, URIRef, Namespace
from rdflib.namespace import FOAF, XSD
import os
import pandas as pd

### Load CSV files + preprocessing

In [2]:
# Input CSV files and output folder, all located relative to the project root,
# i.e. the parent of the directory the notebook is run from
base_path = str(Path.cwd().parent)

# main Spotify/YouTube table
dataset_path = f'{base_path}/data/Spotify_Youtube.csv'
# Spotify artist and album tables
spotify_artist_path = f'{base_path}/data/Artists.csv'
spotify_artist_info_path = f'{base_path}/data/Artist_info.csv'
spotify_album_path = f'{base_path}/data/Album_info.csv'
# Wikidata artist and award tables
wikidata_artists_path = f'{base_path}/data/wikidata_artists.csv'
wikidata_award_statements_path = f'{base_path}/data/wikidata_award_statements.csv'
wikidata_awards_path = f'{base_path}/data/wikidata_awards_processed.csv'
# YouTube API channel table
youtube_api_channels_path = f'{base_path}/data/youtubeapi_channels_complete.csv'
# target path where to save the serializations
rdf_path = f'{base_path}/rdf/'

In [11]:
# load the main Spotify/YouTube table from the CSV file at dataset_path
dataset = pd.read_csv(dataset_path)
# TODO custom .fillna() for each column
# TODO split()[-1] ids
# print the column summary (non-null counts and dtypes)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness          20716 non-null  float64
 15  Valence           20716 non-null  float64
 16  Tempo             20716 non-null  float6

#### Data cleaning
We observed that some videos and some tracks appear in several rows with different values for the same column, which makes no sense in our dataset (e.g., a video having two different ```views``` values: we do not track videos over time, so this should not happen).

Videos:
- views, likes, comments: we take the ```max``` value
- official_video: when the rows of the same video disagree, we set it to ```false```

Tracks:
- streams: we take the ```max``` value

In [8]:
%%time
# dataset cleaning
# some videos have more than one value for views, likes, comments and official_video
# we take the max for everything but official_video, for which we just set to false
yt_video_urls = dataset[['Url_youtube']].drop_duplicates().reset_index(drop=True)
for index, row in yt_video_urls.iterrows():
    # get rows with same youtube video URL
    videos = dataset[dataset['Url_youtube'] == row['Url_youtube']]
    # if there is more than one row
    if len(videos) > 1:
        # take max values
        max_views = videos['Views'].max()
        max_likes = videos['Likes'].max()
        max_comments = videos['Comments'].max()
        # check if official_video needs to be fixed 
        fix_official_video = False
        if len(videos['official_video'].drop_duplicates()) > 1:
            fix_official_video = True
        # fix values
        for row_idx in videos.index:
            dataset.at[row_idx, 'Views'] = max_views
            dataset.at[row_idx, 'Likes'] = max_likes
            dataset.at[row_idx, 'Comments'] = max_comments
            if fix_official_video:
                dataset.at[row_idx, 'official_video'] = False
                
# the same thing can happen with stream of spotify song: also here we take the max value
spotify_track_uris = dataset[['Uri']].drop_duplicates().reset_index(drop=True)
for index, row in spotify_track_uris.iterrows():
    # get rows with the same spotify track
    tracks = dataset[dataset['Uri'] == row['Uri']]
    # if there is more than one track
    if len(tracks) > 1:
        # take the max value and set it on all the rows
        max_streams = tracks['Stream'].max()
        for row_idx in tracks.index:
            dataset.at[row_idx, 'Stream'] = max_streams

CPU times: user 37.9 s, sys: 10.3 ms, total: 37.9 s
Wall time: 37.9 s


In [None]:
# TODO check if other files need some preprocessing (on ids, list to string, etc.); let's do it all here so that the steps below are easier

In [None]:
# TODO is this CSV actually needed?
# load the CSV file at spotify_artist_path and print its column summary
spotify_artist = pd.read_csv(spotify_artist_path)
spotify_artist.info()

In [None]:
# load Artist_info.csv and print its column summary
# (spotify_artist_info_path, not spotify_artist_path: the latter points to
# Artists.csv, which is already loaded as spotify_artist)
spotify_artist_info = pd.read_csv(spotify_artist_info_path)
spotify_artist_info.info()

In [None]:
# load the CSV file at wikidata_artists_path and print its column summary
wikidata_artist = pd.read_csv(wikidata_artists_path)
wikidata_artist.info()

In [None]:
# load the CSV file at spotify_album_path and print its column summary
spotify_album = pd.read_csv(spotify_album_path)
spotify_album.info()

In [None]:
# load the CSV file at youtube_api_channels_path and print its column summary
youtube_api_channels = pd.read_csv(youtube_api_channels_path)
youtube_api_channels.info()

In [None]:
# load the CSV file at wikidata_awards_path and print its column summary
wikidata_awards = pd.read_csv(wikidata_awards_path)
wikidata_awards.info()

In [None]:
# load the CSV file at wikidata_award_statements_path and print its column summary
wikidata_awards_statements = pd.read_csv(wikidata_award_statements_path)
wikidata_awards_statements.info()

### rdflib setup

In [16]:
# Namespaces that rdflib does not ship with: the countries vocabulary and the SoundGraph ontology
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")
SG = Namespace("https://www.dei.unipd.it/db2/ontology/soundgraph#")

# start from an empty graph and register, in order, every prefix used in the serializations
g = Graph()
for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("sg", SG)):
    g.bind(prefix, namespace)

In [17]:
# util function to dump the graph in a file and get a new empty graph with the bindings already set
def write_and_empty_graph(graph, filename):
    """Write `graph` to a Turtle file and return a new, empty graph.

    Args:
        graph: the rdflib Graph to serialize.
        filename: name of the output file; it is appended to the module-level
            `rdf_path`, so the file is written at `rdf_path + filename`.

    Returns:
        A new Graph with the foaf, xsd, countries and sg prefixes bound.
        The graph passed in is not modified: callers that want to start over
        must use the returned graph (e.g. `g = write_and_empty_graph(g, ...)`).
    """
    # write explicitly as UTF-8: without an encoding, open() falls back to the
    # platform locale, which can fail or mis-encode non-ASCII literals
    with open(rdf_path + filename, 'w', encoding='utf-8') as file:
        file.write(graph.serialize(format='turtle'))

    fresh_graph = Graph()
    for prefix, namespace in (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("sg", SG)):
        fresh_graph.bind(prefix, namespace)
    return fresh_graph

In [None]:
# TODO rdflib