In [1]:
import numpy as np
import pandas as pd

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains

spark = SparkSession.builder.appName('MusicGen').getOrCreate()

In [3]:
schema = StructType() \
      .add("title", StringType(),True) \
      .add("tag", StringType(), True) \
      .add("artist", StringType(), True) \
      .add("year", IntegerType(), True) \
      .add("views", IntegerType(), True) \
      .add("features", StringType(), True) \
      .add("lyrics", StringType(), True) \
      .add("id", IntegerType(), True) \
      .add("language_cld3", StringType(), True) \
      .add("language_ft", StringType(), True) \
      .add("language", StringType(), True)

# df = spark.read.csv("song_lyrics.csv")
# df.printSchema()
df = spark.read.format("csv") \
      .option("header", True) \
      .option("multiLine", True) \
      .option("escape","\"") \
      .schema(schema) \
      .load("song_lyrics.csv")
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- views: integer (nullable = true)
 |-- features: string (nullable = true)
 |-- lyrics: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- language_cld3: string (nullable = true)
 |-- language_ft: string (nullable = true)
 |-- language: string (nullable = true)



In [62]:
wanted_tag = 'pop'
genre_filtered = df.filter(df.tag == wanted_tag)
n_rows = genre_filtered.count()

2138587

In [12]:
N_ROWS = 3_093_218
WANTED_ROWS = 45_000
frac = WANTED_ROWS / n_rows
print(frac)

sampled = genre_filtered.sample(fraction=frac).toPandas()
sampled

0.014547956206125789


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,I Get Crazy,rap,Nicki Minaj,2009,96101,"{""Lil Wayne""}",[Chorus: Bianca Bonnie]\nI get crazy\nI-I get ...,60,en,en,en
1,Politics as Usual,rap,JAY-Z,1996,180517,{},[Intro]\nYou know how we do\nRoc-A-Fella... fo...,268,en,en,en
2,Pachanga,rap,Fabolous,2009,30574,{},[Hook - Fabolous]\nA thug changes and love cha...,358,en,en,en
3,Queens Gambit,rap,DJ Muggs & GZA The Genius,2005,6521,{},"[Verse: GZA]\nShe dated jolly green GIANTS, th...",937,en,en,en
4,Cold World,rap,GZA,1995,110104,"{""Inspectah Deck"",Life}",[Intro: sample]\n“I had a bad dream”\n“Don’t b...,939,en,en,en
...,...,...,...,...,...,...,...,...,...,...,...
74663,I Aint Going Home,rap,TyFontaine,2022,2,{},"[Pre-Chorus]\nYeah, we ain't goin' home, nah\n...",7881976,en,en,en
74664,Legacy,pop,Sylvia Kay,2022,2,{},Verse One\nChildhood skies\nFascinated with vi...,7882172,en,en,en
74665,Post-Créditos,rock,MauricioSorrow,2020,1,{},Llevo días sin dormir\nY tú sigues aquí\nRiend...,7882235,es,es,es
74666,Love Me For Me,pop,MASN,2022,47,{},[Verse 1]\nHead in the clouds when my skies ar...,7882379,en,en,en


In [19]:
sampled['tag'].value_counts()

tag
pop        31117
rap        24936
rock       11610
rb          2845
misc        2684
country     1476
Name: count, dtype: int64

In [21]:
!pip install nltk 
nltk.download('punkt')



[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
import nltk
from nltk.util import pad_sequence
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

In [34]:
tokenized_data = [nltk.word_tokenize(text.lower()) for text in sampled["lyrics"]]
tokenized_data[0]

['[',
 'chorus',
 ':',
 'bianca',
 'bonnie',
 ']',
 'i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 '[',
 'intro',
 ':',
 'nicki',
 'minaj',
 ']',
 'i',
 'mean',
 ',',
 'you',
 'know',
 'it',
 "'s",
 'like',
 'when',
 'i',
 'feel',
 'really',
 'crazy',
 'i',
 'just',
 'get',
 'my',
 'broom',
 'and',
 'fly',
 'away',
 'i',
 'go',
 'real',
 ',',
 'real',
 'far',
 'away',
 'young',
 'money',
 "that's-that's-that",
 "'s",
 'just',
 'what',
 'we',
 'do',
 'you',
 'know',
 ',',
 'i',
 'mean',
 'we-we',
 'wear',
 'straightjackets',
 'and',
 'we-we-we-we',
 'stay',
 'in',
 ',',
 'you',
 'know',
 ',',
 'padded',
 'rooms',
 'and',
 ',',
 'you',
 'know',
 ',',
 'and',
 'we',
 'fly',
 'away',
 '[',
 'verse',
 '1',
 ':',
 'nicki',
 'minaj',
 ']',
 'i',
 'just',
 'came',
 'out',
 'of',
 'the',
 'motherfuckin',
 "'",
 'old',
 'school',
 'got',
 'my',
 'mac',
 'notebook',
 'with',
 'the',
 'pro',
 'tools',
 'you',
 'bitches',
 'ai',
 "n't",
 'ready

In [46]:
std_data = [
    list(
        pad_sequence(tokens, n=3, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>')
    ) for tokens in tokenized_data
]
std_data[0]

['<s>',
 '<s>',
 '[',
 'chorus',
 ':',
 'bianca',
 'bonnie',
 ']',
 'i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 'i-i',
 'get',
 'crazy',
 '[',
 'intro',
 ':',
 'nicki',
 'minaj',
 ']',
 'i',
 'mean',
 ',',
 'you',
 'know',
 'it',
 "'s",
 'like',
 'when',
 'i',
 'feel',
 'really',
 'crazy',
 'i',
 'just',
 'get',
 'my',
 'broom',
 'and',
 'fly',
 'away',
 'i',
 'go',
 'real',
 ',',
 'real',
 'far',
 'away',
 'young',
 'money',
 "that's-that's-that",
 "'s",
 'just',
 'what',
 'we',
 'do',
 'you',
 'know',
 ',',
 'i',
 'mean',
 'we-we',
 'wear',
 'straightjackets',
 'and',
 'we-we-we-we',
 'stay',
 'in',
 ',',
 'you',
 'know',
 ',',
 'padded',
 'rooms',
 'and',
 ',',
 'you',
 'know',
 ',',
 'and',
 'we',
 'fly',
 'away',
 '[',
 'verse',
 '1',
 ':',
 'nicki',
 'minaj',
 ']',
 'i',
 'just',
 'came',
 'out',
 'of',
 'the',
 'motherfuckin',
 "'",
 'old',
 'school',
 'got',
 'my',
 'mac',
 'notebook',
 'with',
 'the',
 'pro',
 'tools',
 'you',
 'bitches',
 'ai',

In [47]:
training, vocab = padded_everygram_pipeline(3, std_data)

In [49]:
model = MLE(3)
model.fit(training, vocab)

In [60]:
def complete_text(model, previous_text, n_tokens=10):
    tokenized_previous = nltk.word_tokenize(previous_text.lower())
    generated_text = model.generate(n_tokens, random_seed=1, text_seed=tokenized_previous)
    texto_gerado = [token for token in generated_text if token != '<s>' and token != '</s>']
    return ' '.join(texto_gerado)


'me these racks at the idea of the capitol ,'

In [61]:
complete_text(model, 'The stars remind of')

'me these racks at the idea of the capitol ,'

In [52]:
texto_gerado = model.generate(10, random_seed=1)
texto_gerado = [token for token in texto_gerado if token != '<s>' and token != '</s>']
texto_gerado = ' '.join(texto_gerado)
texto_gerado

"[ verse 3 : king jawaun ] we 're all"