In [1]:
from pyspark.sql import SparkSession
from operator import add

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.111:7077") \
        .appName("lyrics_mapreduce")\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores", 4)\
        .config("spark.cores.max", 8)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("INFO")

In [2]:
import pyspark.sql.functions as F

#Get most frequent words for each song ID
lyrics = spark_session.read\
           .option("header", "true")\
           .csv("hdfs://192.168.2.111:9000/user/ubuntu/lyrics_database.csv")

#Get genre tags for each song ID
lastfm = spark_session.read\
           .json("hdfs://192.168.2.111:9000/user/ubuntu/lastfm/lastfm_test/*/*/*")\
            .repartition(30)\
           .cache()

In [3]:
from pyspark.sql.types import *

#Filter out irrelevant attributes in genre dataset
genre = lastfm.filter(F.size(lastfm["tags"]) > 0 )\
        .select("tags", "track_id")\
        .cache()

In [8]:
#Look data

lyrics.show(3)
genre.show(4)
genre.printSchema()
lyrics.printSchema()
lyrics.count()

+----+-----+------------------+
|word|count|          track_id|
+----+-----+------------------+
|   i|    6|TRAAAAV128F421A322|
| the|    4|TRAAAAV128F421A322|
| you|    2|TRAAAAV128F421A322|
+----+-----+------------------+
only showing top 3 rows

+--------------------+------------------+
|                tags|          track_id|
+--------------------+------------------+
|[[oldies, 100], [...|TRYCTQZ128F93596E2|
|[[oldies, 100], [...|TRUPDAM128F92FE67C|
|[[motown, 100], [...|TRKTNPR128F92F15ED|
|[[oldies, 100], [...|TRNRKRS128F42640AE|
+--------------------+------------------+
only showing top 4 rows

root
 |-- tags: array (nullable = true)
 |    |-- element: array (containsNull = true)
 |    |    |-- element: string (containsNull = true)
 |-- track_id: string (nullable = true)

root
 |-- word: string (nullable = true)
 |-- count: string (nullable = true)
 |-- track_id: string (nullable = true)



19045332

In [5]:
#Join both datasets on their ID
paired_songs = lyrics.join(genre, "track_id").cache()
paired_songs.show(4)

paired_songs.count()

+------------------+----+-----+--------------------+
|          track_id|word|count|                tags|
+------------------+----+-----+--------------------+
|TRAADFO128F92E1E91|   i|   79|[[dancehall, 100]...|
|TRAADFO128F92E1E91| the|   66|[[dancehall, 100]...|
|TRAADFO128F92E1E91| you|   15|[[dancehall, 100]...|
|TRAADFO128F92E1E91|  to|    7|[[dancehall, 100]...|
+------------------+----+-----+--------------------+
only showing top 4 rows



1622597

In [14]:
from stop_words import get_stop_words

#Filter stopwords from the frequent words
stopwords = get_stop_words("english")
#ONLY RUN ONCE - For more interesting results
# stopwords.append("just")
# stopwords.append("will")

#Create new DataFrame that contains one row for each genre tag, the word, and the word count in each song
songs_lite = paired_songs.filter(paired_songs['word'].isin(stopwords)==False)\
                        .select(paired_songs["word"],\
                                paired_songs["count"],\
                                F.explode(paired_songs["tags"]))\
                        .withColumnRenamed("col","genre")\
                        .cache()
    
songs_lite.show(3)

+----+-----+------------------+
|word|count|             genre|
+----+-----+------------------+
|will|    4|  [dancehall, 100]|
|will|    4|[raggamuffin, 100]|
|know|    3|  [dancehall, 100]|
+----+-----+------------------+
only showing top 3 rows



In [15]:
import pyspark.sql.types

#Remove second element in tuple to obtain only genre tag
def remove_similarity(genre_tuple):
    genre, _ = genre_tuple
            
    return genre

tags_function = F.udf(remove_similarity, StringType())

#Cast count to an integer type and remove second element in genre tuple 
wordcount_genre = songs_lite.withColumn("genre", tags_function(songs_lite["genre"]))\
                            .withColumn("wordcount", songs_lite["count"].cast(IntegerType()))\
                            .drop("count")\
                            .cache()

wordcount_genre.show(2)

+----+-----------+---------+
|word|      genre|wordcount|
+----+-----------+---------+
|will|  dancehall|        4|
|will|raggamuffin|        4|
+----+-----------+---------+
only showing top 2 rows



In [16]:
# Group elements with common genre and word, and sum their wordcounts
wordcount_genre.groupBy(wordcount_genre["word"], wordcount_genre["wordcount"], wordcount_genre["genre"])\
                .count()\
                .sort("count",ascending=False)\
                .drop("wordcount")\
                .show()

+----+-----+-----+
|word|genre|count|
+----+-----+-----+
|just| rock|  941|
|time| rock|  934|
|will| rock|  929|
|like| rock|  905|
|know| rock|  899|
| see| rock|  872|
| now| rock|  855|
| can| rock|  820|
|  go| rock|  812|
|come| rock|  790|
| one| rock|  779|
|feel| rock|  724|
|make| rock|  721|
|take| rock|  720|
| way| rock|  704|
| eye| rock|  700|
| say| rock|  700|
| get| rock|  685|
| day| rock|  667|
|time|  pop|  651|
+----+-----+-----+
only showing top 20 rows



In [17]:
#Group elements by genre and word, to see which pairs are the most frequent
wordcount_genre.groupBy("genre", "word")\
                .count()\
                .sort("count", ascending=False)\
                .show(100)

+----------------+-----+-----+
|           genre| word|count|
+----------------+-----+-----+
|            rock| will| 2638|
|            rock| know| 2253|
|            rock| just| 2171|
|            rock| like| 1921|
|            rock|  now| 1879|
|            rock| time| 1850|
|            rock|  can| 1767|
|             pop| will| 1740|
|             pop| know| 1713|
|            rock|   go| 1695|
|            rock|  see| 1690|
|            rock| come| 1627|
|             pop| just| 1600|
|            rock|  one| 1592|
|            rock| love| 1566|
|            rock| feel| 1526|
|            rock|  get| 1517|
|     alternative| will| 1503|
|             pop| love| 1496|
|            rock|never| 1413|
|             pop| like| 1410|
|            rock| make| 1392|
|            rock|  say| 1386|
|            rock|  way| 1360|
|             pop|  can| 1356|
|            rock| take| 1355|
|           indie| will| 1353|
|             pop| time| 1352|
|            rock|  got| 1339|
|       

In [18]:
# To find most common genres

wordcount_genre.groupBy("genre")\
                .count()\
                .sort("count", ascending=False)\
                .show()

+-----------------+------+
|            genre| count|
+-----------------+------+
|             rock|277460|
|              pop|198881|
|      alternative|155521|
|            indie|136971|
| female vocalists|121475|
|          Hip-Hop|118674|
|            metal|114356|
|        favorites|112787|
|          hip hop| 98754|
|              rap| 98415|
|              00s| 97088|
|             Love| 95319|
| alternative rock| 90408|
|        seen live| 81291|
|        beautiful| 75017|
|   male vocalists| 74259|
|       indie rock| 73948|
|          Awesome| 72476|
|singer-songwriter| 72124|
|            dance| 70189|
+-----------------+------+
only showing top 20 rows



In [19]:

import pandas as pd
import matplotlib.pyplot as plt

#Print lists of most common words for the top 5 most common genre tags
top_genres = {"rock", "pop", "alternative", "indie", "Hip-Hop"}
genre_top_words = []

for genre in top_genres:
    #genre_top_words += 
    wordcount_genre.filter(wordcount_genre["genre"] == genre)\
            .groupBy("genre", "word")\
            .count()\
            .sort("count", ascending=False)\
            .limit(10)\
            .show()

#print(genre_top_words)

#df = pd.DataFrame({'word': ['word1', 'word2'], 'count': [12898, 4861]})
#df.plot.bar(x='word', y='count', rot=0)

+-------+----+-----+
|  genre|word|count|
+-------+----+-----+
|Hip-Hop|like|  615|
|Hip-Hop| get|  582|
|Hip-Hop|know|  550|
|Hip-Hop| got|  543|
|Hip-Hop|just|  537|
|Hip-Hop|will|  508|
|Hip-Hop| now|  497|
|Hip-Hop| one|  464|
|Hip-Hop| can|  452|
|Hip-Hop|back|  422|
+-------+----+-----+

+-----+----+-----+
|genre|word|count|
+-----+----+-----+
|  pop|will| 1740|
|  pop|know| 1713|
|  pop|just| 1600|
|  pop|love| 1496|
|  pop|like| 1410|
|  pop| can| 1356|
|  pop|time| 1352|
|  pop| now| 1308|
|  pop|  go| 1290|
|  pop| see| 1253|
+-----+----+-----+

+-----------+----+-----+
|      genre|word|count|
+-----------+----+-----+
|alternative|will| 1503|
|alternative|know| 1238|
|alternative|just| 1181|
|alternative|like| 1146|
|alternative| now| 1016|
|alternative|time|  996|
|alternative| can|  982|
|alternative| see|  938|
|alternative|  go|  903|
|alternative|feel|  885|
+-----------+----+-----+

+-----+----+-----+
|genre|word|count|
+-----+----+-----+
| rock|will| 2638|
| rock|know

In [20]:
spark_context.stop()