In [1]:
import pyspark
import tqdm
import os
from glob import glob
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
from pyspark.sql import functions as func
from pyspark.sql.types import *
import warnings
import json
# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")
# Let pandas render all rows of a frame instead of truncating the display
pd.options.display.max_rows = None

In [5]:
# Resource settings for the Spark session.
# They are applied to the very first context that gets created: JVM-level
# options such as spark.driver.memory are only read when the driver JVM is
# launched, so creating a throwaway context first and re-creating it with
# this config afterwards would silently leave the driver at its default size.
config = pyspark.SparkConf().setAll([
    ('spark.executor.memory', '10g'),
    ('spark.executor.cores', '3'),
    ('spark.cores.max', '3'),
    ('spark.driver.memory', '10g'),
])

# getOrCreate() reuses an already-running session, so re-running this cell
# no longer fails with "Cannot run multiple SparkContexts at once".
# NOTE: when a session is reused, launch-time options above are not
# re-applied - restart the kernel to change them.
spark = pyspark.sql.SparkSession.builder.config(conf=config).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")
spark

In [6]:
# Map every ranking category to the list of JSON files found in its folder
rating = ['top','bottom','popular','esoteric','diverse']
path = "data/genius_lyrics/{}/"
all_paths = {}
for order in rating:
    # glob() already returns a list; copy it so each category owns its own list
    all_paths[order] = list(glob(path.format(order) + "*.json"))

In [7]:
# Schema used when turning the JSON records into a Spark DataFrame.
# Columns are listed as (name, Spark type) pairs, in the order they appear in the frame.
_schema_columns = [
    ('Ranking', IntegerType()),
    ('Album', StringType()),
    ('Artist Name', StringType()),
    ('Release Date', StringType()),
    ('Genres', StringType()),
    ('Descriptors', StringType()),
    ('Average Rating', StringType()),
    ('spotify album uri', StringType()),
    ('spotify artist uri', StringType()),
    ('spotify track uri', StringType()),
    ('spotify track name', StringType()),
    ('spotify track number', IntegerType()),
    ('spotify disc number', IntegerType()),
    ('spotify track popularity', IntegerType()),
    ('spotify track duration', IntegerType()),
    # Audio features are kept as a string -> string map
    ('spotify track features', MapType(StringType(), StringType())),
    ('spotify artist name', StringType()),
    ('spotify artist popularity', IntegerType()),
    ('spotify artist followers', IntegerType()),
    ('spotify artist genres', ArrayType(StringType())),
    ('lyrics', StringType()),
]
schema = StructType(
    [StructField(col_name, col_type) for col_name, col_type in _schema_columns]
)

In [8]:
# Read the selected JSON files and stack them into a single Spark DataFrame `df`.

# Filters - set either one to None to disable it (no need to comment code out).
CATEGORY_FILTER = 'top'   # only load this ranking category
MIN_YEAR = 2022           # only load files whose year is >= this value

counter = 0   # number of files actually loaded
df = None     # reset first so a frame left over from an earlier run cannot leak through
for order in all_paths.keys():

    # Skip the categories that were not asked for
    if CATEGORY_FILTER is not None and order != CATEGORY_FILTER:
        continue

    # Iterating through all the file paths in that category
    for j_file in tqdm.tqdm(all_paths[order]):

        # The year is the last four characters of the file name before its extension
        year = os.path.splitext(j_file)[0][-4:]

        # Skip the years that were not asked for
        if MIN_YEAR is not None and int(year) < MIN_YEAR:
            continue

        counter += 1

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding
        with open(j_file, encoding="utf-8") as f:
            data_dict = json.load(f)

        # First file starts the frame, every later one is appended to it
        frame = spark.createDataFrame(data_dict, schema = schema)
        df = frame if df is None else df.union(frame)

# Fail here with a clear message instead of a NameError (or a silently stale
# `df`) in a later cell when nothing matched the filters.
if df is None:
    raise ValueError(
        "No JSON file matched the filters (category={!r}, min year={!r})".format(
            CATEGORY_FILTER, MIN_YEAR
        )
    )


100%|██████████████████████████████████████████████████████████████████████████████████| 25/25 [00:10<00:00,  2.37it/s]


In [None]:
# Count the rows whose `lyrics` value differs from the literal string 'None'
df.filter("lyrics != 'None'").count()

In [None]:
@udf(returnType=StringType())
def clean_lyrics(data):
    """Drop everything up to and including the first "Lyrics" marker.

    Args:
        data: raw lyrics text, or None for a null cell.

    Returns:
        The text that follows the first occurrence of "Lyrics". When the
        marker is absent the text is returned unchanged, and a null input
        yields null, so a single unexpected row cannot fail the whole job.
    """
    if data is None:
        return None
    # partition() never raises: `marker` is empty when "Lyrics" was not found
    _, marker, res = data.partition("Lyrics")
    return res if marker else data

In [103]:
# Take five rows that carry lyrics and preview the cleaned text as a pandas frame
temp = df.filter("lyrics != 'None'").limit(5)
temp.select(clean_lyrics("lyrics")).toPandas()

Unnamed: 0,clean_lyrics(lyrics)
0,\nAin't enough to say that I think of you\nWor...
1,\nOoh-ooh\nOoh-ooh\nOoh-ooh\n\nI can tell we'v...
2,"\nEvery day, every day, every day, I look\nFor..."
3,\nCan't take my eyes off of you tonight\nYou'r...
4,"\nHigh anticipation, it's an emotional trap\nD..."


In [None]:
# Show the rows whose `name` column contains the substring "mes".
# NOTE(review): the schema declared for `df` in this notebook has no `name`
# column (the name-like fields are 'Artist Name', 'spotify track name' and
# 'spotify artist name'); unless `df` is redefined elsewhere this column
# cannot be resolved - confirm which field was intended.
df.filter(col("name").contains("mes")).show()