In [1]:
import numpy as np
import pandas as pd

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains

spark = SparkSession.builder.appName('MusicGen').getOrCreate()

In [3]:
schema = StructType() \
      .add("title", StringType(),True) \
      .add("tag", StringType(), True) \
      .add("artist", StringType(), True) \
      .add("year", IntegerType(), True) \
      .add("views", IntegerType(), True) \
      .add("features", StringType(), True) \
      .add("lyrics", StringType(), True) \
      .add("id", IntegerType(), True) \
      .add("language_cld3", StringType(), True) \
      .add("language_ft", StringType(), True) \
      .add("language", StringType(), True)

# df = spark.read.csv("song_lyrics.csv")
# df.printSchema()
df = spark.read.format("csv") \
      .option("header", True) \
      .option("multiLine", True) \
      .option("escape","\"") \
      .schema(schema) \
      .load("song_lyrics.csv")
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- views: integer (nullable = true)
 |-- features: string (nullable = true)
 |-- lyrics: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- language_cld3: string (nullable = true)
 |-- language_ft: string (nullable = true)
 |-- language: string (nullable = true)



In [4]:
wanted_tag = 'pop'
genre_filtered = df.filter(f"tag = {wanted_tag} AND language = 'en'")

In [5]:
seed = 69
train, test = genre_filtered.randomSplit([0.8, 0.2], seed)

n_rows = train.count()
WANTED_ROWS = 60_000
frac = WANTED_ROWS / n_rows
print(frac)

sampled = train.sample(fraction=frac).toPandas()
sampled

0.028055907942954858


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Bad,pop,Michael Jackson,1987,356803,{},"[Verse 1]\nYour butt is mine, gon' tell you ri...",1862,en,en,en
1,TiK ToK,pop,Kesha,2009,429571,{},[Verse 1]\nWake up in the morning feelin' like...,3627,en,en,en
2,WORDS WORDS WORDS,pop,Bo Burnham,2010,261890,{},[Intro]\nLet's rock! 1! 2! 1-2-3-4!\n\n[Verse ...,2617,en,en,en
3,Songs for Women,pop,Frank Ocean,2011,280503,{},"[Intro]\nHaha\nWhen I was younger, I used to w...",4883,en,en,en
4,Friday,pop,Rebecca Black,2011,260169,{},"[Intro]\nOoh-ooh, ooh-yeah yeah, yeah yeah\nYe...",5603,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...
60150,Broken Pt. 2,pop,Michael W. Smith,2020,2,{},"[Instrumental]\n\nLord, bend down to listen to...",7881703,en,en,en
60151,Heartbeat,pop,"TAAMY, Barco & Mordkey",2019,3,"{""TAAMY / Barco & Mordkey""}",[Chorus]\nForever in your heartbeat (Heartbeat...,7881789,en,en,en
60152,Cant Make Myself,pop,death's dynamic shroud,2022,2,"{""​death\\'s dynamic shroud""}",Why don't you just take a walk?\nWhy don't you...,7882374,en,en,en
60153,With The Band,pop,Sophie May,2022,5,{},[Verse 1]\nYou ain't 18 anymore\nQuit reminisc...,7882576,en,en,en


In [6]:
sampled['tag'].value_counts()

tag
pop    60155
Name: count, dtype: int64

In [9]:
!pip install nltk 
import nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
from nltk.util import pad_sequence
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

In [11]:
tokenized_data = [nltk.word_tokenize(text.lower()) for text in sampled["lyrics"]]
tokenized_data[0]

['[',
 'verse',
 '1',
 ']',
 'your',
 'butt',
 'is',
 'mine',
 ',',
 'gon',
 "'",
 'tell',
 'you',
 'right',
 ',',
 'ah',
 'just',
 'show',
 'your',
 'face',
 'in',
 'broad',
 'daylight',
 ',',
 'ah',
 'i',
 "'m",
 'tellin',
 "'",
 'you',
 'on',
 'who',
 'i',
 'feel',
 ',',
 'ah',
 'gon',
 'na',
 'hurt',
 'your',
 'mind',
 ',',
 'do',
 "n't",
 'shoot',
 'to',
 'kill',
 'shamone',
 ',',
 'ah',
 ',',
 'shamone',
 'lay',
 'it',
 'on',
 'me',
 ',',
 'ah',
 ',',
 'alright',
 'ah',
 ',',
 'i',
 "'m",
 'givin',
 "'",
 'you',
 'on',
 'count',
 'to',
 'three',
 ',',
 'ah',
 'just',
 'show',
 'your',
 'stuff',
 'or',
 'let',
 'it',
 'be',
 ',',
 'ah',
 'i',
 "'m",
 'tellin',
 "'",
 'you',
 ',',
 'just',
 'watch',
 'your',
 'mouth',
 ',',
 'ah',
 'i',
 'know',
 'your',
 'game',
 ',',
 'what',
 'you',
 "'re",
 'about',
 '[',
 'pre-chorus',
 ']',
 'well',
 'they',
 'say',
 'the',
 'sky',
 "'s",
 'the',
 'limit',
 'and',
 'to',
 'me',
 'that',
 "'s",
 'really',
 'true',
 ',',
 'ah',
 'but',
 'my',
 

In [12]:
std_data = [
    list(
        pad_sequence(tokens, n=3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
    ) for tokens in tokenized_data
]
std_data[0]

['<s>',
 '<s>',
 '[',
 'verse',
 '1',
 ']',
 'your',
 'butt',
 'is',
 'mine',
 ',',
 'gon',
 "'",
 'tell',
 'you',
 'right',
 ',',
 'ah',
 'just',
 'show',
 'your',
 'face',
 'in',
 'broad',
 'daylight',
 ',',
 'ah',
 'i',
 "'m",
 'tellin',
 "'",
 'you',
 'on',
 'who',
 'i',
 'feel',
 ',',
 'ah',
 'gon',
 'na',
 'hurt',
 'your',
 'mind',
 ',',
 'do',
 "n't",
 'shoot',
 'to',
 'kill',
 'shamone',
 ',',
 'ah',
 ',',
 'shamone',
 'lay',
 'it',
 'on',
 'me',
 ',',
 'ah',
 ',',
 'alright',
 'ah',
 ',',
 'i',
 "'m",
 'givin',
 "'",
 'you',
 'on',
 'count',
 'to',
 'three',
 ',',
 'ah',
 'just',
 'show',
 'your',
 'stuff',
 'or',
 'let',
 'it',
 'be',
 ',',
 'ah',
 'i',
 "'m",
 'tellin',
 "'",
 'you',
 ',',
 'just',
 'watch',
 'your',
 'mouth',
 ',',
 'ah',
 'i',
 'know',
 'your',
 'game',
 ',',
 'what',
 'you',
 "'re",
 'about',
 '[',
 'pre-chorus',
 ']',
 'well',
 'they',
 'say',
 'the',
 'sky',
 "'s",
 'the',
 'limit',
 'and',
 'to',
 'me',
 'that',
 "'s",
 'really',
 'true',
 ',',
 'ah',


In [13]:
training, vocab = padded_everygram_pipeline(3, std_data)

In [14]:
model = MLE(3)
model.fit(training, vocab)

In [15]:
def complete_text(model, previous_text, n_tokens=10):
    tokenized_previous = nltk.word_tokenize(previous_text.lower())
    generated_text = model.generate(n_tokens, random_seed=1, text_seed=tokenized_previous)
    texto_gerado = [token for token in generated_text if token != '<s>' and token != '</s>']
    return ' '.join(texto_gerado)


In [22]:
complete_text(model, 'The stars remind of')

"someone who stood beside me i know where i 'll"

In [None]:
import pickle
f = open('baseline-model.pickle', 'wb')
pickle.dump(model, f)
f.close()

In [None]:
def compute_accuracy(x, test) -> int:
    total = len(test)
    right = 0
    for text in test:
        target = text[-1]
        pred = complete_text(model, text[:-1])
        if pred == target:
            right += 1
    return right / total
        
    