In [1]:
import pronouncing
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import col, split, explode, udf
from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.sql import Row
from pyspark.sql import functions as F
import pandas as pd
import re

In [2]:
# NOTE(review): `spark` is not created anywhere in this notebook; it is
# presumably the session object injected by the hosting environment -- confirm.
# Read the lyrics CSV (first row is the header, column types are inferred,
# multiLine=True so one record may span several physical lines) and keep only
# the rows whose genre column equals 'Hip-Hop'.
rowData = (
    spark.read
    .csv(
        "/FileStore/tables/hadoop_final_project/lyrics.csv",
        header=True,
        inferSchema=True,
        multiLine=True
    )
    .filter(col("genre") == 'Hip-Hop')
)

In [3]:
# Preview the first five Hip-Hop rows.
rowData.show(5)

In [4]:
# Replace the "lyrics" string column with an array of its lines by splitting
# on "\n". withColumn already names the result "lyrics", so the expression
# itself needs no alias.
data_array_lyrics = rowData.withColumn(
    "lyrics",
    F.split(F.col("lyrics"), "\n").cast(ArrayType(StringType()))
)
data_array_lyrics.show(5)

In [5]:
# explode() produces one output row per array element, so every lyric line
# ends up on its own row.
data_line = data_array_lyrics.withColumn("lyrics", explode(col("lyrics")))
data_line.show(5)

In [6]:
def phone_phrase(phrase):
  """Return a rhyme signature made of the last three vowel phones of a phrase.

  The phrase is lower-cased and split into words on runs of non-word
  characters. Each word is looked up with ``pronouncing.phones_for_word``;
  words without a pronunciation are skipped and only the first listed
  pronunciation is used. Consonant phones are discarded and the trailing
  character of every remaining phone (the stress digit in CMUdict-style
  phones) is stripped.

  Args:
    phrase: Text of one lyric line. ``None`` and ``''`` are accepted.

  Returns:
    The last three vowel phones concatenated into a single string, or ``''``
    when the phrase yields fewer than three vowel phones.
  """
  # 'NG' must be listed here: if it is missing, the phone is mistaken for a
  # vowel and contributes a bogus 'N' (its last character stripped) to the
  # signature of words such as "king" or "song".
  consonants = frozenset([
      'B', 'CH', 'D', 'DH', 'F', 'G', 'HH', 'JH', 'K', 'L', 'M', 'N', 'NG',
      'P', 'R', 'S', 'SH', 'T', 'TH', 'V', 'W', 'Y', 'Z', 'ZH'])
  if not phrase:
    # A null value reaches the function as None; treat it like an empty line
    # instead of failing on .lower().
    return ''
  vowels = []
  for word in re.split(r'\W+', phrase.lower()):
    if not word:
      # re.split yields '' when the text starts or ends with a separator.
      continue
    pronunciations = pronouncing.phones_for_word(word)
    if not pronunciations:
      continue
    for phone in pronunciations[0].split():
      if phone not in consonants:
        vowels.append(phone[:-1])
  if len(vowels) < 3:
    return ''
  return ''.join(vowels[-3:])
# Wrap phone_phrase as a Spark UDF whose result is a string column.
pp_udf = udf(phone_phrase, returnType=StringType())

In [7]:
# Attach the rhyme signature of every lyric line as a new "phoneme" column.
data_phone = data_line.withColumn("phoneme", pp_udf(col("lyrics")))
data_phone.show(20)

In [8]:
def length_phrase(phrase):
  """Count the words in a phrase.

  The phrase is lower-cased and split on runs of non-word characters; empty
  fragments produced at the edges of the string are not counted.

  Args:
    phrase: Text of one lyric line. ``None`` and ``''`` are accepted.

  Returns:
    The number of words as an int (0 for ``None`` or an empty phrase).
  """
  if not phrase:
    # A null value reaches the function as None; report zero words instead
    # of failing on .lower().
    return 0
  # Build a real list: on Python 3 filter() returns an iterator, which has
  # no len().
  return len([word for word in re.split(r'\W+', phrase.lower()) if word])
# Wrap length_phrase as an integer-valued Spark UDF and store the word count
# of every lyric line in a new "length" column.
len_udf = udf(length_phrase, returnType=IntegerType())
data_phone_len = data_phone.withColumn('length', len_udf(col('lyrics')))
data_phone_len.show(20)

In [9]:
# Print the column names and types of the frame that now carries the
# "phoneme" and "length" columns.
data_phone_len.printSchema()

In [10]:
# Run both helpers on one sample line. `pho` and `length` are the query
# values used by the filtering cell further down.
test_lyric = 'call me rap king of underground'
pho = phone_phrase(test_lyric)
length = length_phrase(test_lyric)
# print is called with a single pre-formatted argument so the same text is
# produced on Python 2 and Python 3 (the statement form is a SyntaxError on 3).
print("%s %s" % (pho, length))
print(type(pho))

In [11]:
# Keep the lines whose rhyme signature equals the query's and whose word
# count is strictly within 5 of the query's word count.
data_final = data_phone_len.filter(
    (col("phoneme") == pho)
    & (col("length") < length + 5)
    & (col("length") > length - 5)
)

In [12]:
# Only the lyric text is needed from here on; drop duplicate lines.
data_final = data_final.select("lyrics").distinct()

In [13]:
# Display up to 100 matching lines without truncating the text.
data_final.show(100, truncate=False)