# Extract lyrics data from dataset
This notebook was created to extract formatted RDDs from the musiXmatch dataset on a local machine.

In [1]:
import os
import re

from pyspark import SparkContext

## Reading and processing the dataset

In [2]:
folder_path = "../data/"
file_path_train = "mxm_dataset_train.txt"
file_path_test = "mxm_dataset_test.txt"

# `sc` is only predefined when the kernel is started through pyspark.
# getOrCreate() reuses that context when it exists and creates one otherwise,
# so the notebook also survives Restart & Run All on a plain kernel.
sc = SparkContext.getOrCreate()

# One RDD of raw text lines per dataset split (rdd: train, rdd2: test).
rdd = sc.textFile(folder_path + file_path_train)
rdd2 = sc.textFile(folder_path + file_path_test)

We obtain the list of word mappings from the line starting with `%`. Position 0 of the resulting list is not a usable word entry; it is only there so that list positions line up with the dataset's word indices, which start at 1.

In [3]:
# `basestring` only exists on Python 2 (where it covers str and unicode);
# fall back to `str` so the filter does not raise NameError on Python 3.
try:
    _stringTypes = basestring
except NameError:
    _stringTypes = str

# The word list is the single line that starts with "%".
_headerLine = rdd.filter(lambda x: isinstance(x, _stringTypes) and x.startswith("%")).first()

# NOTE(review): per the published musiXmatch format, "%" is only a line marker
# glued to the first word (the header reads "%i,the,you,..."), and word indices
# start at 1 -- confirm against the data file. Splitting the raw line therefore
# yields "%i" at position 0 and shifts every word by one. Strip the marker and
# pad position 0 instead, so that wordMappings[wordID] is the word for wordID.
wordMappings = [None] + _headerLine[1:].split(",")

In [4]:
def filterComments(rdd):
    """
    :param rdd: RDD on which to perform the operations
    :return: An RDD where lines that start with '#' or '%' are filtered out,
             along with any element that is not a string
    """
    # `basestring` only exists on Python 2 (where it covers str and unicode);
    # fall back to `str` so the predicate does not raise NameError on Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    def isDataLine(x):
        # startswith accepts a tuple of prefixes: true if any of them matches.
        return isinstance(x, stringTypes) and not x.startswith(("#", "%"))

    return rdd.filter(isDataLine)

In [5]:
def mapToPairs(rdd):
    """
    :param rdd: RDD of comma-separated text lines
    :return: An RDD where each line has been transformed to a pair
             (first field, {"TrackID": second field, "words": list of the remaining fields})
    """
    def toPair(line):
        fields = line.split(",")
        key = fields[0]
        record = {"TrackID": fields[1], "words": fields[2:]}
        return (key, record)

    return rdd.map(toPair)

In [6]:
def mapWords(wm, rdd, maxWordID=50):
    """
    :param wm: Sequence of word names, looked up by word ID (wm[wordID])
    :param rdd: RDD of (key, record) pairs where record["words"] is a list of "wordID:count" strings
    :param maxWordID: Highest word ID to keep (inclusive); entries with a larger ID are dropped.
                      Defaults to 50, the limit that used to be hard-coded.
    :return: An RDD where the words list has been transformed to (word name, count) tuples
             for the word IDs between 0 and maxWordID; other entries of the record are kept as they are.
    """
    def processLine(l):
        key, record = l[0], l[1]

        newWordsList = []
        for wl in record["words"]:
            frags = wl.split(":")
            wordID = int(frags[0])
            wordCount = int(frags[1])

            # NOTE(review): ID 0 is accepted here and resolves to wm[0] -- confirm
            # whether position 0 of the mapping is meant to be a real word.
            if 0 <= wordID <= maxWordID:
                newWordsList.append((wm[wordID], wordCount))

        # Build a fresh record rather than overwriting record["words"] in place:
        # mutating the input would leave already-converted tuples behind if the
        # same input records are processed a second time.
        return (key, dict(record, words=newWordsList))

    return rdd.map(processLine)

In [7]:
def processingPipeline(rdd):
    """
    :param rdd: RDD of raw text lines on which to perform the operations
    :return: The RDD obtained by applying filterComments, mapToPairs and mapWords
             (with the global wordMappings) in that order
    """
    dataLines = filterComments(rdd)
    keyedRecords = mapToPairs(dataLines)
    return mapWords(wordMappings, keyedRecords)

In [8]:
# Apply the same pipeline to the train RDD and the test RDD.
rdd_out, rdd2_out = map(processingPipeline, (rdd, rdd2))

In [13]:
# Report the number of processed records per split, train first.
for message, dataset in (
    ("The amount of lines in the train dataset is %d", rdd_out),
    ("The amount of lines in the test dataset is %d", rdd2_out),
):
    print(message % dataset.count())

The amount of lines in the train dataset is 210519
The amount of lines in the test dataset is 27143


In [10]:
def _pickleOutputPath(file_name):
    """
    :param file_name: Name of the dataset file the RDD was read from
    :return: Path under "output/" named after the file, with every "." replaced by "-"
    """
    return os.path.join("output/", re.sub('\\.', '-', file_name))


# The second argument of saveAsPickleFile is the batch size.
# NOTE(review): the save presumably fails if the target path already exists --
# confirm, and clear "output/" before re-running the notebook if so.
rdd_out.saveAsPickleFile(_pickleOutputPath(file_path_train), 1000)
rdd2_out.saveAsPickleFile(_pickleOutputPath(file_path_test), 1000)