# Extract lyrics data from dataset
This notebook was created to extract formatted RDDs from the musiXmatch dataset on a local machine.

In [1]:
import os
import re

from pyspark import SparkContext

## Reading and processing the dataset

In [2]:
folder_path = "../data/"
file_path_train = "mxm_dataset_train.txt"
file_path_test = "mxm_dataset_test.txt"
# only used for output
file_path_merged = "mxm_dataset_all.txt"

# Reuse the SparkContext that a pyspark-launched kernel already provides, or create one
# otherwise, so `sc` is always defined when the notebook is run top to bottom.
sc = SparkContext.getOrCreate()

# Each RDD holds the lines of one dataset file as strings.
rdd = sc.textFile(folder_path + file_path_train)
rdd2 = sc.textFile(folder_path + file_path_test)

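We obtain the list of word mappings from the line starting with `%`: the comma-separated words follow the `%` directly, so the line begins with `%i,…` (`i` being the first word).
**The indices start at 1.**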

In [3]:
# Take the first line that starts with "%". The elements of a textFile RDD are always
# strings, so the Python-2-only `basestring` type check is not needed here.
vocabulary_line = rdd.filter(lambda line: line.startswith("%")).first()
# Drop the leading "%" and split on commas; wordMappings[0] is therefore the word with ID 1.
wordMappings = vocabulary_line[1:].split(",")

We define a few processing steps that allow for easier manipulation of the data

In [4]:
def filterComments(rdd):
    """
    Keep only the data lines of an RDD of raw dataset lines.

    :param rdd: RDD on which to perform the operations
    :return: An RDD where lines that start with '#' or '%' are filtered;
        elements that are not strings are filtered out as well
    """
    # `basestring` only exists on Python 2; fall back to `str` so the check
    # works on Python 3 instead of raising NameError when the filter runs.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    def isDataLine(line):
        # startswith accepts a tuple of prefixes and matches any of them
        return isinstance(line, string_types) and not line.startswith(("#", "%"))

    return rdd.filter(isDataLine)

In [5]:
def mapToPairs(rdd):
    """
    Split every comma-separated line into a keyed record.

    :param rdd: RDD on which to perform the operations
    :return: An RDD where the lines as strings have been transformed to pairs
        (first field, {"TrackID": second field, "wordsID": list of the remaining fields})
    """
    def toPair(line):
        fields = line.split(",")
        # presumably fields[0] is the MSD track ID and fields[1] the musiXmatch one -- TODO confirm
        record = {"TrackID": fields[1], "wordsID": fields[2:]}
        return (fields[0], record)

    return rdd.map(toPair)

In [6]:
def mapWords(wm, rdd):
    """
    Attach the decoded (word, count) list to every record.

    :param wm: List of words in which the word with ID n is stored at index n-1 (IDs start at 1)
    :param rdd: RDD on which to perform the operations; each element is a pair whose second
        item is a dict with a "wordsID" list of "<wordID>:<count>" strings
    :return: An RDD where the words list has been transformed to include the name of each word instead of its ID.
        Each record's dict keeps its existing entries and gains a "words" list of (word, count) tuples.
    :raises IndexError: (when the RDD is evaluated) if a word ID is smaller than 1 or larger than len(wm)
    """
    def decodeEntry(entry):
        frags = entry.split(":")
        wordID = int(frags[0])
        wordCount = int(frags[1])
        # IDs are 1-based: without this guard an ID of 0 would index wm[-1]
        # and silently resolve to the last word of the vocabulary.
        if wordID < 1:
            raise IndexError("word ID %d is out of range (IDs start at 1)" % wordID)
        return (wm[wordID - 1], wordCount)

    def processLine(l):
        # Work on a shallow copy so the input record is left untouched.
        info = dict(l[1])
        info["words"] = [decodeEntry(entry) for entry in info["wordsID"]]
        return (l[0], info)

    return rdd.map(processLine)

In [7]:
def processingPipeline(rdd, wm):
    """
    Run the three processing steps in order: filterComments, mapToPairs, mapWords.

    :param rdd: RDD on which to perform the operations
    :param wm: Word list handed to mapWords as its first argument
    :return: An RDD on which the above operations have all been performed in one step
    """
    withoutComments = filterComments(rdd)
    asPairs = mapToPairs(withoutComments)
    return mapWords(wm, asPairs)

In [8]:
# Apply the same pipeline, with the same word list, to the train and the test RDD.
rdd_out, rdd2_out = (
    processingPipeline(split, wordMappings) for split in (rdd, rdd2)
)
# Combine the processed train and test records into a single RDD.
rdd_merged = rdd_out.union(rdd2_out)

In [9]:
# Count and report one RDD at a time, in train / test / merged order.
for message, counted_rdd in (
    ("The amount of lines in the train dataset is %d", rdd_out),
    ("The amount of lines in the test dataset is %d", rdd2_out),
    ("The amount of lines in the whole dataset is %d", rdd_merged),
):
    print(message % counted_rdd.count())

The amount of lines in the train dataset is 210519
The amount of lines in the test dataset is 27143
The amount of lines in the whole dataset is 237662


In [10]:
# Data sample: display the first record of the merged RDD, a (key, dict) pair
rdd_merged.first()

(u'TRAAAAV128F421A322',
 {'TrackID': u'4623710',
  'words': [(u'i', 6),
   (u'the', 4),
   (u'you', 2),
   (u'to', 2),
   (u'and', 5),
   (u'a', 3),
   (u'me', 1),
   (u'it', 1),
   (u'my', 1),
   (u'is', 2),
   (u'of', 3),
   (u'your', 1),
   (u'that', 1),
   (u'are', 2),
   (u'we', 2),
   (u'am', 2),
   (u'will', 2),
   (u'for', 4),
   (u'be', 1),
   (u'have', 2),
   (u'so', 1),
   (u'this', 1),
   (u'like', 2),
   (u'de', 1),
   (u'up', 1),
   (u'was', 2),
   (u'if', 1),
   (u'got', 1),
   (u'would', 1),
   (u'been', 1),
   (u'these', 2),
   (u'seem', 1),
   (u'someon', 1),
   (u'understand', 1),
   (u'pass', 1),
   (u'river', 1),
   (u'met', 1),
   (u'piec', 1),
   (u'damn', 1),
   (u'worth', 1),
   (u'flesh', 1),
   (u'grace', 1),
   (u'poor', 2),
   (u'somehow', 1),
   (u'ignor', 1),
   (u'passion', 1),
   (u'tide', 1),
   (u'season', 1),
   (u'seed', 1),
   (u'resist', 1),
   (u'order', 2),
   (u'piti', 1),
   (u'fashion', 1),
   (u'grant', 1),
   (u'captur', 2),
   (u'ici', 1),

In [11]:
# Save every RDD as a pickle file under output/, named after its dataset file
# with each '.' replaced by '-'.
for out_rdd, source_name in (
    (rdd_out, file_path_train),
    (rdd2_out, file_path_test),
    (rdd_merged, file_path_merged),
):
    target_path = os.path.join("output/", re.sub('\\.', '-', source_name))
    # The second positional argument of saveAsPickleFile is the batch size.
    out_rdd.saveAsPickleFile(target_path, 1000)