In [41]:
!pip install pytube



In [42]:
import os
from requests import post, get
import base64
import json
import pandas as pd
from googleapiclient.discovery import build
import pytube
import moviepy.editor as mp
from scipy.ndimage import zoom
import matplotlib.pyplot as plt
import numpy as np


    The process here, is to connect to the spotify API, get the differents
    songs names we need, use them in a the search method from youtube API
    The utimate goal is to download a mp3 version of the videos.
    These mp3 files are needed to create spectrograms.
    Note that this is only for educational purpose.
    We encourage you to listen to music through the different legal ways (online on youtube, or platform like spotify,deezer...)
    

In [43]:
#First, you have to get your Spotify API ID and Client_secret. Get them by following the spotify API tutorial
client_id = ''
client_secret = ''

In [44]:
#Also get your Youtube analytics API key
youtube_api_key = ''

In [45]:
#This function is for token generation
def get_token():
    auth_string = client_id + ":" + client_secret
    auth_bytes = auth_string.encode("utf-8")
    auth_base64 = str(base64.b64encode(auth_bytes), "utf-8")

    url = "https://accounts.spotify.com/api/token"

    headers = {
        "Authorization" : "Basic " + auth_base64,
        "Content-Type" : "application/x-www-form-urlencoded"
    }

    data = {"grant_type": "client_credentials"}
    result = post(url, headers=headers, data = data)
    json_result = json.loads(result.content)
    token = json_result["access_token"]
    return token

In [46]:
#This function takes a token and return a header.This is needed since a header is essential
#when trying to make a request
def get_auth_header(token):
    return {"Authorization" : "Bearer " + token}

In [47]:
#This function returns a dataframe thats contains all the
#songs in a playlist. It contains information of the
#songs name, id, preview url, songs's popularity
def playlist_to_dataframe(token,playlist_id):

    url = f"https://api.spotify.com/v1/playlists/{playlist_id}/tracks"
    headers = get_auth_header(token)
    limit = 100
    offset = 0
    while True:

        query = f"?playlist_id={playlist_id}&fields=items(track(artists(id,name),id,name,popularity,preview_url))&limit={limit}&offset={offset}"
        query_url = url + query
        result = get(query_url, headers=headers)
        if result.status_code != 200:
            print(f"Erreur {result.status_code}")
            break
        json_result = json.loads(result.content)
        if not json_result['items']:
            break

        if offset == 0:
            #Creation dataframe de sortie
            df = pd.json_normalize(json_result['items'])

        #Concaténation des dataframes
        elif offset != 0:
            df = pd.concat([df,pd.json_normalize(json_result['items'])],
                           axis=0, ignore_index=True)
        offset+=limit

    return df

In [91]:
#This function takes the name of songs and artists in dataframe (dataframe generated by playlist_to_dataframe) and return the list of video url in youtube API
def search_music_youtube(data, key_api):

    #We set the index based on track.name column
    data.set_index('track.name',inplace=True)

    youtube = build('youtube','v3',developerKey=key_api)
    video_url = []
    try:
        for track_name in data.index:

            request = youtube.search().list(
                part='id',
                maxResults=1,
                q=track_name+ " " + data['track.artists'][track_name][0]['name']
            )

            response = request.execute()
            for item in response.get('items',[]):
                if item['id']['kind'] == 'youtube#video':
                    url = f"https://www.youtube.com/watch?v={item['id']['videoId']}"
                    video_url.append(url)
    except:
        pass
    data.reset_index(inplace=True)
    return video_url

In [92]:
#This function take a list of video'url (list created with search_music_youtube fucntion), download only the audio in a mp3 format
def download_convert_to_mp3(video_url,dest='.'):

    yt = pytube.YouTube(video_url)
    #We filter to only take the audio
    video = yt.streams.filter(only_audio=True).first()

    #The destination where the downloaded files will be
    destination = dest
    out_file = video.download(output_path=dest)
    base, ext = os.path.splitext(out_file)

    #The extension we want for the downloaded file
    new_file = base + '.mp3'

    if os.path.exists(new_file) == False:
    #The conversion
        clip = mp.AudioFileClip(out_file)
        clip.write_audiofile(new_file)
        clip.close()

    os.remove(out_file)


In [93]:
#Returns the ids from a dataframe generated by the function playlist_to_dataframe
def get_ids_from_dataframe(data):

    #ids will be a list of string. Each strings contains ids of the differents songs. The ids are separated by a comma. This will be used to get the songs analysis
    #This is needed because the spotify can only return analysis for maximum of 100 ids. If the playlist has more than 100 songs it can be problematic
    list_ids = []
    ids=''
    i=0
    data.set_index(data['track.id'], inplace=True)
    for id in data.index:
        ids+= f"{id},"
        i+=1
        if i%100 == 0:
            #Since there will be a comma at the end of the ids string, we take all character except the last, which is obviously a comma
            ids = ids[:-1]
            list_ids.append(ids)
            ids=''

    if len(ids)!=0:
        ids = ids[:-1]
        list_ids.append(ids)

    #Since there will be a comma at the end of the ids string, we take all character except the last, which is obviously a comma
    return list_ids

In [94]:
#This function returns a dataframe with audio_featires analysis for all the songs_ids given in parameter
def get_audio_analysis_from_ids(token, list_ids):

    #Create an empty dataframe
    dico = {}
    df = pd.DataFrame(dico)
    for ids in list_ids:
        url = f"https://api.spotify.com/v1/audio-features"
        headers = get_auth_header(token)
        query = f"?ids={ids}"
        query_url =url + query
        result = get(query_url, headers=headers)
        json_result = json.loads(result.content)
        df = pd.concat([df,pd.json_normalize(json_result['audio_features'])],axis=0,ignore_index=True)
    return df

In [108]:
#This function gets the analysis of all songs in a playlist and download the songs. Those songs will be used for spectrogram generation
def all_data_creation(playlist_id, mp3_download_path, playlist_analysis_path, csv_file_name):

  #We get the token
  token = get_token()

  #We create a dataframe from the playlist
  data = playlist_to_dataframe(token,playlist_id)

  #We get the youtube video url of all songs in the dataframe
  video_url = search_music_youtube(data,youtube_api_key)

  #We download the audio in mp3 format
  for url in video_url:
    try:
      download_convert_to_mp3(url,mp3_download_path)
    except Exception as error:
      print("There is an error with pytube")
      print(error)
      break

  for i in range(len(data)):
    #This track.artists columns contains information in JSON format. We only need to get the artist name
    data['track.artists'][i] = data['track.artists'][i][0]['name']

  #We create a dataframe that will take the name of songs, the artist name and the url

  columns_to_take = ['track.artists', 'track.name']
  colmuns_selected = data[columns_to_take]
  colmuns_selected['url'] = video_url
  colmuns_selected = colmuns_selected.rename(columns={'track.artists':'Artistes','track.name':'songs'})

  list_ids = get_ids_from_dataframe(data)
  df = get_audio_analysis_from_ids(token,list_ids)

  #We export the df in csv
  path_df = playlist_analysis_path + csv_file_name
  path_col = playlist_analysis_path + csv_file_name[:-4] + '_url.xlsx'
  colmuns_selected.to_excel(path_col, index=False)
  df.to_csv(path_df, index=False)

  print("The files were created")

If you get this error : RegexMatchError: get_throttling_function_name: could not find match for multiple, it means that you have to wait for pytube to get a correction or you can correct it yourself (there are information on stackoverflow for this error). This error is due to the fact that youtube makes some change very often and pytube sometimes is not up to date.

In [107]:
#THIS IS AN EXAMPLE TO SEE HOW THE DATACREATION WORKS

#Get playlist id in spotify
playlist_id = "6WPtrHFSmeCNtjjlrtJWFy"

#For the mp3 download path, don't forget to add the last '/'
mp3_download_path = "/content/sample_data/"

#Fill the playlist_analysis_path
playlist_analysis_path = "/content/sample_data/"

#Fill the csv_file_name
csv_file_name = "playlist_analysis.csv"

all_data_creation(playlist_id, mp3_download_path, playlist_analysis_path, csv_file_name)

There is an error with pytube
get_throttling_function_name: could not find match for multiple


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['track.artists'][i] = data['track.artists'][i][0]['name']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['track.artists'][i] = data['track.artists'][i][0]['name']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  colmuns_selected['url'] = video_url



There is an error with pytube
get_throttling_function_name: could not find match for multiple
The files were created
