# Importing the necessary libraries

In [None]:
! pip install -q pyspark==3.3.0 spark-nlp==4.3.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.3/281.3 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m473.2/473.2 kB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:

import sparknlp

from sparknlp.base import *
from sparknlp.annotator import *

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import Pipeline,PipelineModel
from glob import glob
from termcolor import colored,cprint

In [None]:
import pyspark
import tqdm
import os
from glob import glob
from pyspark.sql.functions import *
import pandas as pd
import numpy as np
from pyspark.sql import functions as func
from pyspark.sql.types import *
import warnings
import json
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Let pandas display all rows of a DataFrame instead of truncating the output.
pd.set_option('display.max_rows', None)

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# GPU mode is enabled by the active `sparknlp.start(gpu=True)` call below.
# To run without a GPU, comment that call out and uncomment the next line instead.
# spark = sparknlp.start()

spark = sparknlp.start(gpu=True)

# Report the library versions actually in use.
print("Spark NLP version", sparknlp.version())
print("Apache Spark version:", spark.version)

# Bare expression: displays the session object as the cell output.
spark

Spark NLP version 4.3.2
Apache Spark version: 3.3.0


# Loading our dataset into Spark

In [None]:
# Getting all the file names scraped from rate your music, grouped by rating category
rating = ['top','bottom','popular','esoteric','diverse']
# path = "data/genius_lyrics/{}/"
path = "/content/drive/MyDrive/Big_Data_Project/spark_y_rock_anthem/data/genius_lyrics/{}/"

# Map each category to the JSON files found in its folder.
# glob() returns matches in arbitrary order, so the paths are sorted to keep the
# load order (and everything that depends on row order later) reproducible.
all_paths = {
    order: sorted(glob(path.format(order) + "*.json"))
    for order in rating
}

In [None]:
# Schema used when the JSON records are read into Spark.
# Each entry is (column name, Spark data type); every column is nullable
# because StructField is built with its default arguments.
_schema_columns = [
    ('Ranking', IntegerType()),
    ('Album', StringType()),
    ('Artist Name', StringType()),
    ('Release Date', StringType()),
    ('Genres', StringType()),
    ('Descriptors', StringType()),
    ('Average Rating', StringType()),
    ('spotify album uri', StringType()),
    ('spotify artist uri', StringType()),
    ('spotify track uri', StringType()),
    ('spotify track name', StringType()),
    ('spotify track number', IntegerType()),
    ('spotify disc number', IntegerType()),
    ('spotify track popularity', IntegerType()),
    ('spotify track duration', IntegerType()),
    ('spotify track features', MapType(StringType(), StringType())),
    ('spotify artist name', StringType()),
    ('spotify artist popularity', IntegerType()),
    ('spotify artist followers', IntegerType()),
    ('spotify artist genres', ArrayType(StringType())),
    ('lyrics', StringType()),
]
schema = StructType(
    [StructField(column_name, column_type) for column_name, column_type in _schema_columns]
)

In [None]:
# Reading the dataframe: every selected JSON file becomes a Spark DataFrame and
# all of them are unioned into `df`.
MIN_YEAR = 2000  # files whose year is below this are skipped; set to 0 to include all years
counter = 0      # number of files that passed the category/year filters
df = None        # stays None until the first file has been read successfully
for order in all_paths.keys():

    # Search for a specific category - uncomment the next two lines to load only one category
    # if order != 'top':
    #     continue

    # Iterating through all the file paths in that category
    for j_file in tqdm.tqdm(all_paths[order]):

        # Getting the year from the file path: the four characters before ".json"
        year = j_file[-9:-5]

        try:
            if int(year) < MIN_YEAR:
                continue
        except ValueError:
            # File name does not end in a 4-digit year; skip it instead of aborting the load
            print(f"Skipping {j_file}: cannot parse a year from the file name")
            continue

        counter+=1

        # A single unreadable or malformed file is reported and skipped so the
        # remaining files are still loaded.
        try:
            with open(j_file, encoding="utf-8") as f:
                data_dict = json.load(f)
            file_df = spark.createDataFrame(data_dict, schema = schema)
        except Exception as exc:
            print(f"Issue Reading {j_file}: {exc}")
            continue

        df = file_df if df is None else df.unionAll(file_df)

if df is None:
    # Fail here with a clear message rather than with a NameError in a later cell
    raise RuntimeError("No lyrics files could be loaded - check the data path and the year filter")


100%|██████████| 25/25 [01:07<00:00,  2.72s/it]
100%|██████████| 23/23 [00:18<00:00,  1.25it/s]
100%|██████████| 24/24 [00:56<00:00,  2.35s/it]
100%|██████████| 49/49 [01:32<00:00,  1.90s/it]
100%|██████████| 37/37 [01:05<00:00,  1.77s/it]


# Searching by album, artist or song to generate lyrics

In [None]:
# Keep only rows whose lyrics are not the literal string 'None'
# (presumably how the scraper recorded missing lyrics - TODO confirm).
# Rows with null lyrics are dropped as well, since the comparison is null for them.
temp = df.filter(col("lyrics") != "None")

## Searching by album name

In [None]:
# Ask for the album to search; surrounding whitespace is stripped so a stray
# space in the typed value does not break the substring match.
inp = input("Enter the album name: ").strip()

Enter the album name: melodrama


In [None]:
# Case-insensitive substring match on the album name, then pick ONE matching track.
# A seeded random ordering is used instead of `.sample(False, 0.1)`: sampling 10% of
# a handful of matching tracks frequently returns no rows at all, which made the
# search report "No song found!" for albums that are in the data.
temp1 = (
    temp.filter(lower(col("Album")).contains(inp.lower()))
    .orderBy(rand(seed=0))
    .limit(1)
)

In [None]:
# Collect the selected track (at most one row, because of limit(1)) into pandas.
print_data = temp1.toPandas()

In [None]:
# Announce the selected track, or report that the search matched nothing.
# An explicit emptiness check replaces the bare `except:`, which also hid
# unrelated errors (e.g. a missing column or an undefined variable).
if print_data.empty:
  print("No song found!")
else:
  track = print_data.iloc[0]
  print(f"Getting AI generated lyrics for {track['spotify track name']} by {track['Artist Name']} from the album {track['Album']}")

Getting AI generated lyrics for Hard Feelings/Loveless by Lorde from the album Melodrama


## Searching by artist name

In [None]:
# Ask for the artist to search; surrounding whitespace is stripped so a stray
# space in the typed value does not break the substring match.
inp = input("Enter the artist name: ").strip()

Enter the artist name: katy perry


In [None]:
# Case-insensitive substring match on the artist name, then pick ONE matching track.
# A seeded random ordering is used instead of `.sample(False, 0.1)`: sampling 10% of
# a small set of matching tracks can return no rows at all, which made the search
# report "No song found!" for artists that are in the data.
temp1 = (
    temp.filter(lower(col("Artist Name")).contains(inp.lower()))
    .orderBy(rand(seed=0))
    .limit(1)
)

In [None]:
# Collect the selected track (at most one row, because of limit(1)) into pandas.
print_data = temp1.toPandas()

In [None]:
# Announce the selected track, or report that the search matched nothing.
# An explicit emptiness check replaces the bare `except:`, which also hid
# unrelated errors (e.g. a missing column or an undefined variable).
if print_data.empty:
  print("No song found!")
else:
  track = print_data.iloc[0]
  print(f"Getting AI generated lyrics for {track['spotify track name']} by {track['Artist Name']} from the album {track['Album']}")

Getting AI generated lyrics for Teenage Dream by Katy Perry from the album Teenage Dream


## Searching by song name

In [None]:
# Ask for the song to search; surrounding whitespace is stripped so a stray
# space in the typed value does not break the substring match.
inp = input("Enter the song name: ").strip()

Enter the song name: pink + white


In [None]:
# Case-insensitive substring match on the track name, then pick ONE matching track.
# `.sample(False, 1.0)` keeps every row, so it never randomised anything and the
# first match was always taken; a seeded random ordering gives a reproducible
# random pick among all matches.
temp1 = (
    temp.filter(lower(col("spotify track name")).contains(inp.lower()))
    .orderBy(rand(seed=0))
    .limit(1)
)

In [None]:
# Collect the selected track (at most one row, because of limit(1)) into pandas.
print_data = temp1.toPandas()

In [None]:
# Announce the selected track, or report that the search matched nothing.
# An explicit emptiness check replaces the bare `except:`, which also hid
# unrelated errors (e.g. a missing column or an undefined variable).
if print_data.empty:
  print("No song found!")
else:
  track = print_data.iloc[0]
  print(f"Getting AI generated lyrics for {track['spotify track name']} by {track['Artist Name']} from the album {track['Album']}")

Getting AI generated lyrics for Pink + White by Frank Ocean from the album Blonde


# Cleaning and preparing the data

In [None]:
# Cleaning the data
@udf(returnType=StringType())
def clean_lyrics(data):
    """Strip the scraped header/footer from a lyrics string and drop blank lines.

    Steps:
      1. Discard everything up to and including the first ``"Lyrics"`` marker.
      2. Discard everything from the first ``"Embed"`` marker onwards, plus the
         single character right before it.
      3. Remove empty lines.

    Args:
        data: Raw lyrics text, or None.

    Returns:
        The cleaned lyrics as a newline-joined string, or None when ``data`` is None.
        Text lacking a marker is kept as-is for that step instead of raising
        (a missing "Lyrics" marker used to raise IndexError and fail the Spark
        task) or losing its last character (when no "Embed" footer is present).
    """
    if data is None:
        return None

    # Header: keep the text after the first "Lyrics"; keep everything if it is absent.
    _, header_marker, body = data.partition("Lyrics")
    if not header_marker:
        body = data

    # Footer: cut at the first "Embed" only when it is actually there.
    before_footer, footer_marker, _ = body.partition("Embed")
    if footer_marker:
        # NOTE(review): exactly one character is dropped before "Embed" - presumably a
        # one-digit counter appended by the scraper; a longer counter would leave
        # digits behind. TODO confirm against the raw data.
        body = before_footer[:-1]

    return "\n".join([line for line in body.split("\n") if line != ''])

In [None]:
# Truncating the input
@udf(returnType=StringType())
def truncate_lyrics(data):
    """Return the first eight newline-separated lines of ``data``, re-joined with newlines.

    Text with fewer than eight lines is returned unchanged.
    """
    leading_lines = data.split("\n")[:8]
    return "\n".join(leading_lines)

In [None]:
# Clean the lyrics of the track currently held in `temp1` (i.e. the result of
# whichever search cell was run last).
temp2 = temp1.select(clean_lyrics("lyrics").alias('cleaned lyrics'))

# Two-column frame: "original" holds the full cleaned lyrics, "text" holds the
# truncated lyrics.
data = temp2.select(
    col('cleaned lyrics').alias("original"),
    truncate_lyrics('cleaned lyrics').alias("text"),
)

# GPT2 Pipeline 

Create a Spark NLP pipeline with the `gpt2` model and check the results.

In [None]:
# First pipeline stage: reads the "text" column and writes the "documents" column.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("documents")
)


In [None]:
# Load the gpt2_base model from the copy stored on Drive.
# The task string configured here is the same text the result-printing cell
# splits on, so the two must stay in sync.
gpt2 = (
    GPT2Transformer.load("/content/drive/MyDrive/Big_Data_Project/spark_y_rock_anthem/models/gpt2_en_3.4.0_3.0_1638510926608/")
    .setInputCols(["documents"])
    .setTask("Is it true that")
    .setMaxOutputLength(1000)
    .setMinOutputLength(25)
    .setOutputCol("generation")
)

# Alternative: the gpt2_medium model - doesn't fit in RAM of colab
# gpt2 = (
#     GPT2Transformer.load("/content/drive/MyDrive/Big_Data_Project/spark_y_rock_anthem/models/gpt2_medium_en_3.4.0_3.0_1638517188768/")
#     .setInputCols(["documents"])
#     .setTask("Is it true that")
#     .setMaxOutputLength(1000)
#     .setMinOutputLength(25)
#     .setOutputCol("generation")
# )

In [None]:
# Assemble the two stages and fit them on `data`; `pipeline` holds the fitted model.
pipeline = Pipeline(stages=[documentAssembler, gpt2]).fit(data)

In [None]:
# Apply the fitted pipeline to the prompt frame.
result = pipeline.transform(data)

In [None]:
# Pull the `result` field of the "generation" column into pandas.
res = result.select("generation.result").toPandas()

In [None]:
# pandas copy of the "original" / "text" columns, used by the printing cell.
to_print = data.toPandas()

In [None]:
# Display the original lyrics next to the truncated prompt.
to_print

Unnamed: 0,original,text
0,"Yeah, yeah, um (Woo)\nYeah, yeah, yeah\nThat's...","Yeah, yeah, um (Woo)\nYeah, yeah, yeah\nThat's..."


In [None]:
from itertools import zip_longest

# ANSI escape codes used to colour the terminal output.
red = '\033[91m'
green = '\033[92m'
bold = '\033[1m'
reset = "\033[0;0m"

print(bold + "Input text:" + reset)
print(red + to_print.text[0] + reset)

print("%-120s  %-120s" %(bold+"GPT Result:","Original Song:"+reset))


print_1 = to_print.text[0]        # prompt given to the model
print_3 = to_print.original[0]    # full cleaned lyrics of the original song

# The raw generation is expected to look like "<task> <prompt><continuation>".
# Strip the task prefix and the echoed prompt when they are present; if either
# is missing, fall back to the text available instead of raising IndexError.
generations = res.result[0]
generated = generations[0] if len(generations) else ""
task_parts = generated.split("Is it true that ")
after_task = task_parts[1] if len(task_parts) > 1 else generated
prompt_parts = after_task.split(print_1) if print_1 else [after_task]
print_2 = prompt_parts[1] if len(prompt_parts) > 1 else after_task

# Empty lines are dropped BEFORE the colour codes are added; once wrapped in
# escape codes no string is empty, so filtering afterwards never removed anything.
p1_l = [red + s + reset for s in print_1.split("\n") if s]      # prompt lines (red)
p2_l = [green + s + reset for s in print_2.split("\n") if s]    # generated lines (green)
p1 = p1_l + p2_l
p3_l = [red + s + reset for s in print_3.split("\n") if s]      # original song lines

# Print both texts side by side. zip_longest pads the shorter column, replacing
# the bare `except:` blocks that handled the length mismatch. Note that %-120s
# pads by raw string length, which includes the invisible colour codes.
for left, right in zip_longest(p1, p3_l):
  if right is None:
    # Original song exhausted: only the left column remains.
    print("%-120s" %(left))
  else:
    # An empty left cell keeps the right column at the same offset as the other rows.
    print("%-120s  %-120s" %(left if left is not None else "", right))



[1mInput text:[0;0m
[91mYeah, yeah, um (Woo)
Yeah, yeah, yeah
That's the way every day goes
Every time we've no control
If the sky is pink and white
If the ground is black and yellow
It's the same way you showed me
Nod my head, don't close my eyes[0;0m
[1mGPT Result:                                                                                                           Original Song:[0;0m                                                                                                    
[91mYeah, yeah, um (Woo)[0;0m                                                                                           [91mYeah, yeah, um (Woo)[0;0m                                                                                         
[91mYeah, yeah, yeah[0;0m                                                                                               [91mYeah, yeah, yeah[0;0m                                                                                             
[91mThat's th