#### Importing the JSON file into a pandas-readable format

Start by importing the Spotify challenge (AIcrowd) file (.json) into Python from [here](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge/dataset_files?unique_download_uri=355528&challenge_id=277).

In [None]:
import pandas as pd
import json

# Load JSON data from file.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which can fail on or mis-decode non-ASCII text.
with open('/data/challenge_set.json', 'r', encoding='utf-8') as file:  # Replace with local dataset path
    data = json.load(file)

# Flatten the nested playlists -> tracks structure into one record per track,
# repeating the playlist-level fields on every track of that playlist.
all_tracks = [
    {
        # 'name' is the only field read with a fallback; the others must exist
        'playlist_name': playlist.get('name', 'Unknown'),
        'playlist_pid': playlist['pid'],
        'playlist_num_tracks': playlist['num_tracks'],
        'track_pos': track['pos'],
        'artist_name': track['artist_name'],
        'track_uri': track['track_uri'],
        'artist_uri': track['artist_uri'],
        'track_name': track['track_name'],
        'album_uri': track['album_uri'],
        'duration_ms': track['duration_ms'],
        'album_name': track['album_name'],
    }
    for playlist in data['playlists']
    for track in playlist['tracks']
]

# Convert the list of track dictionaries to a DataFrame
df_spotify = pd.DataFrame(all_tracks)

# Display the first few rows of the DataFrame to verify
print(df_spotify.head())


In [None]:
# Build the table used for lyric lookups: one row per distinct track_uri,
# with the playlist- and album-level columns removed.
columns_to_discard = [
    'playlist_name', 'playlist_pid', 'playlist_num_tracks', 'track_pos',
    'artist_uri', 'album_uri', 'duration_ms', 'album_name',
]
unique_tracks = df_spotify.drop_duplicates(subset=['track_uri'])
# reset_index() is called without drop=True, so the previous row labels are
# kept in a new 'index' column and the frame gets a fresh 0..n-1 index.
df_lyrics = unique_tracks.drop(columns=columns_to_discard).reset_index()

In [None]:
# Batch export for splitting the API calls across several runs.
# The code below is wrapped in a string literal, so it does NOT execute.
# If enabled, it would write df_lyrics to CSV files of rows_per_file rows each,
# named output_file_1.csv, output_file_2.csv, ... (the last file may be shorter).
'''rows_per_file = 3000  # Number of rows per file

# Calculate how many files will be needed
num_files = len(df_lyrics) // rows_per_file + (1 if len(df_lyrics) % rows_per_file != 0 else 0)

# Split and save the DataFrame into multiple CSV files
for i in range(num_files):
    start_row = i * rows_per_file
    end_row = start_row + rows_per_file
    # Create a new CSV file for each chunk
    df_lyrics[start_row:end_row].to_csv(f'output_file_{i + 1}.csv', index=False)'''

In [None]:
# Calling for Genius API credentials from dotenv file
from dotenv import load_dotenv
import os
import lyricsgenius

# Pull variables from the .env file into the process environment
load_dotenv()
token = os.getenv("genius_token")

# Configure the Genius client in one call:
#   remove_section_headers - strip section headers (e.g. [Chorus]) from lyrics
#   timeout                - timeout for each API call
genius = lyricsgenius.Genius(
    token,
    remove_section_headers=True,
    timeout=60,
)

In [None]:
# Fetch lyrics from the Genius API for every unique song.
# Rows are walked by index label together with their title and artist, so the
# final row is included (range(len(df_lyrics)-1) stopped one row short) and the
# lookup does not rely on the index being exactly 0..n-1.
for row_label, title, artist in zip(
    df_lyrics.index, df_lyrics['track_name'], df_lyrics['artist_name']
):
    song = genius.search_song(title, artist)
    # A falsy result means no match was found; store a placeholder instead
    df_lyrics.loc[row_label, 'lyrics'] = song.lyrics if song else 'Lyrics not found'

Searching for "Little Swing" by AronChupa...
Done.
Searching for "I'm an Albatraoz" by AronChupa...
Done.
Searching for "Yellow Flicker Beat - From The Hunger Games: Mockingjay Part 1" by Lorde...
No results found for: 'Yellow Flicker Beat - From The Hunger Games: Mockingjay Part 1 Lorde'
Searching for "White Teeth Teens" by Lorde...
Done.
Searching for "Team" by Lorde...
Done.
Searching for "Heroes (we could be)" by Alesso...
Done.
Searching for "Superheroes" by The Script...
Done.
Searching for "Centuries" by Fall Out Boy...
Done.
Searching for "Best Day Of My Life" by American Authors...
Done.
Searching for "On Top Of The World" by Imagine Dragons...
Done.
Searching for "Beggin For Thread" by Banks...
Done.
Searching for "Ways To Go" by Grouplove...
Done.
Searching for "Dangerous (feat. Joywave)" by Big Data...
Done.
Searching for "Lampshades on Fire" by Modest Mouse...
Done.
Searching for "Primadonna" by Marina and the Diamonds...
Done.
Searching for "Firebird" by Galantis...
Done.