# Preliminaries

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
!tar xf spark-3.2.0-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "spark-3.2.0-bin-hadoop3.2"

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
import json

# Import data (run this program once for each genre to minimize file size)

In [7]:
myPath = "/content/drive/My Drive/H516/"

In [8]:
myInFile = "genius_parsed_data/genius_cleaner_pop.csv"

In [9]:
myOutFile = "genius_with_counts/genius_wCounts_pop2.csv"

In [10]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [11]:
%%time
rawdataDF = spark.read.csv(myPath + myInFile, header="true")
rawdataDF.show(2)

+---+-----------------+---+---------------+----+------+--------------------+--------+
| id|            title|tag|         artist|year| views|              lyrics|language|
+---+-----------------+---+---------------+----+------+--------------------+--------+
|242|Wordy Rappinghood|pop|   Tom Tom Club|1981| 26499| Chorus  What are...|      en|
|384|         Horchata|pop|Vampire Weekend|2009|102550| Verse 1  In Dece...|      en|
+---+-----------------+---+---------------+----+------+--------------------+--------+
only showing top 2 rows

CPU times: user 41.3 ms, sys: 6.86 ms, total: 48.1 ms
Wall time: 7.21 s


In [12]:
rawdataDF.createOrReplaceTempView("genius_songs")

# Preview data

In [13]:
lyricPreview = spark.sql("SELECT * FROM genius_songs where language is not null and tag is not null and lyrics is not null and artist not like '%Genius%' and artist not like '%Translations%'")
lyricPreview.show(10)

+----+-----------------+---+---------------+----+-------+--------------------+--------+
|  id|            title|tag|         artist|year|  views|              lyrics|language|
+----+-----------------+---+---------------+----+-------+--------------------+--------+
| 242|Wordy Rappinghood|pop|   Tom Tom Club|1981|  26499| Chorus  What are...|      en|
| 384|         Horchata|pop|Vampire Weekend|2009| 102550| Verse 1  In Dece...|      en|
| 526|        Heartless|pop|     Kanye West|2008|1175109| Chorus  In the n...|      en|
| 523|  Flashing Lights|pop|     Kanye West|2007|1078113|" Intro  Connie M...|      en|
| 566|             Baby|pop|  Justin Bieber|2010|2232442|" Produced by The...|      en|
| 576|   Find Your Love|pop|          Drake|2010| 358122| Verse 1  I'm mor...|      en|
| 741|          Anxiety|pop|Black Eyed Peas|2003|  13865| Verse 1  I feel ...|      en|
|1087|         Fuck You|pop|    CeeLo Green|2010| 277490|" Chorus  I see y...|      en|
| 778|      Up Up  Away|pop|    

In [14]:
lyricGenres = spark.sql("SELECT SUM(CASE WHEN tag = 'country' THEN 1 ELSE 0 END) AS country, round(SUM(CASE WHEN tag = 'country' THEN 1 ELSE 0 END)/COUNT(1),3) AS country_pct, SUM(CASE WHEN tag = 'pop' THEN 1 ELSE 0 END) AS pop,  round(SUM(CASE WHEN tag = 'pop' THEN 1 ELSE 0 END)/COUNT(1),3) AS pop_pct, SUM(CASE WHEN tag = 'rap' THEN 1 ELSE 0 END) AS rap,  round(SUM(CASE WHEN tag = 'rap' THEN 1 ELSE 0 END)/COUNT(1),3) AS rap_pct,  SUM(CASE WHEN tag = 'rb' THEN 1 ELSE 0 END) AS rb, round(SUM(CASE WHEN tag = 'rb' THEN 1 ELSE 0 END)/COUNT(1),3) AS rb_pct,  SUM(CASE WHEN tag = 'rock' THEN 1 ELSE 0 END) AS rock, round(SUM(CASE WHEN tag = 'rock' THEN 1 ELSE 0 END)/COUNT(1),3) AS rock_pct, COUNT(1) AS TTL_RCRDS FROM genius_songs where language is not null and tag is not null and artist not like '%Genius%' and artist not like '%Translations%' ")
lyricGenres.show(100)

+-------+-----------+-------+-------+---+-------+---+------+----+--------+---------+
|country|country_pct|    pop|pop_pct|rap|rap_pct| rb|rb_pct|rock|rock_pct|TTL_RCRDS|
+-------+-----------+-------+-------+---+-------+---+------+----+--------+---------+
|      0|        0.0|1384844|    1.0|  0|    0.0|  0|   0.0|   0|     0.0|  1384844|
+-------+-----------+-------+-------+---+-------+---+------+----+--------+---------+



In [15]:
#balance data set to have the same number of records for each genre
p1 = 86658/86658
p2 = 86658/1384844
p3 = 86658/633307
p4 = 86658/155082
p5 = 86658/964605
lyricSample = lyricPreview.sampleBy("tag", {'country': p1, 'pop': p2,  'rock': p3 ,'rb': p4, 'rap': p5},0)

In [16]:
lyricSample.createOrReplaceTempView("sample")

In [17]:
sampleGenres = spark.sql("SELECT SUM(CASE WHEN tag = 'country' THEN 1 ELSE 0 END) AS country, round(SUM(CASE WHEN tag = 'country' THEN 1 ELSE 0 END)/COUNT(1),3) AS country_pct, SUM(CASE WHEN tag = 'pop' THEN 1 ELSE 0 END) AS pop,  round(SUM(CASE WHEN tag = 'pop' THEN 1 ELSE 0 END)/COUNT(1),3) AS pop_pct, SUM(CASE WHEN tag = 'rap' THEN 1 ELSE 0 END) AS rap,  round(SUM(CASE WHEN tag = 'rap' THEN 1 ELSE 0 END)/COUNT(1),3) AS rap_pct,  SUM(CASE WHEN tag = 'rb' THEN 1 ELSE 0 END) AS rb, round(SUM(CASE WHEN tag = 'rb' THEN 1 ELSE 0 END)/COUNT(1),3) AS rb_pct,  SUM(CASE WHEN tag = 'rock' THEN 1 ELSE 0 END) AS rock, round(SUM(CASE WHEN tag = 'rock' THEN 1 ELSE 0 END)/COUNT(1),3) AS rock_pct,   COUNT(1) AS TTL_RCRDS FROM sample WHERE tag IN('country','pop','rap','rb','rock') ")
sampleGenres.show(100)

+-------+-----------+-----+-------+---+-------+---+------+----+--------+---------+
|country|country_pct|  pop|pop_pct|rap|rap_pct| rb|rb_pct|rock|rock_pct|TTL_RCRDS|
+-------+-----------+-----+-------+---+-------+---+------+----+--------+---------+
|      0|        0.0|86575|    1.0|  0|    0.0|  0|   0.0|   0|     0.0|    86575|
+-------+-----------+-----+-------+---+-------+---+------+----+--------+---------+



In [18]:
sampleGenres = spark.sql("SELECT artist, COUNT(1) AS TTL_RCRDS FROM sample GROUP BY artist ORDER BY 2 DESC")
sampleGenres.show(truncate=False)

+-------------------+---------+
|artist             |TTL_RCRDS|
+-------------------+---------+
|KIDZ BOP Kids      |80       |
|YOUNG DIAMOND      |63       |
|Frank Sinatra      |51       |
|Madonna            |47       |
|Kylie Minogue      |45       |
|Ella Fitzgerald    |45       |
|Andy Williams      |43       |
|Mariah Carey       |42       |
|"Nat ""King"" Cole"|41       |
|The Beach Boys     |40       |
|Cliff Richard      |38       |
|Barbra Streisand   |37       |
|Pet Shop Boys      |37       |
|Glee Cast          |36       |
|Van Morrison       |36       |
|Charli XCX         |35       |
|Johnny Mathis      |34       |
|Erasure            |32       |
|Tom Jones          |31       |
|Hillsong Worship   |31       |
+-------------------+---------+
only showing top 20 rows



In [19]:
lyricSample.show(5)

+----+-------------------+---+-------------------+----+------+--------------------+--------+
|  id|              title|tag|             artist|year| views|              lyrics|language|
+----+-------------------+---+-------------------+----+------+--------------------+--------+
| 911|          Ego Remix|pop|             Beyonc|2009|139941|" Verse 1  Kanye ...|      en|
|1321|Keep It Goin Louder|pop|        Major Lazer|2009|  5314|" Intro  Ricky Bl...|      en|
|1533|            Be Free|pop|       Radio Killer|2010|   798|I care about love...|      en|
|2121|      Miami 2 Ibiza|pop|Swedish House Mafia|2010| 91912|" Verse 1  She sa...|      en|
|2510| Dance Yrself Clean|pop|    LCD Soundsystem|2010|168685| Verse 1  Walking...|      en|
+----+-------------------+---+-------------------+----+------+--------------------+--------+
only showing top 5 rows



# Remove punctuation and numbers from lyrics

> Indented block



In [20]:
import string

In [21]:
#remove punctuation
wordsR = lyricSample.rdd.map(lambda x:(x[0],x[1],x[2],x[3],x[4],x[6].translate(str.maketrans('','',string.punctuation))))
wordsR.take(2)

[('911',
  'Ego Remix',
  'pop',
  'Beyonc',
  '2009',
  ' Verse 1  Kanye West  I got a big ego  Ha ha ha   I’m such a big ego  Uhuhuh  I got a big   eheheh   ego  she love my big   eheheh   ego So stroke my big   eheheh   ego I like to joke around a little bit  but here we go Welcome to the wonderful world of goldplated Earl  cause Every thing I throw up  blow up Talking to the girl She said  Know what  grow up  you nasty  I don’t understand why they trippin’  if you ask me Flow is just the nicest  I emit the propane I just spit  probably just raised the gas prices Everybody in the club try and get as fresh as me What you want  dawg Tryna stay recession free And spit  refreshingly when I rock the stadium You probably get sweaty you should bring a extra tee Now I’m standing next to Jay who standing next to B You coulda been anywhere in the world  but you’re here with me That’s good for my ego  me and my ego And it go wherever we go my ego is my imaginary friend He was with me when I wa

In [22]:
#remove numbers
wordsR2 = wordsR.map(lambda x:(x[0],x[1],x[2],x[3],x[4],x[5].translate(str.maketrans('','',string.digits))))
wordsR2.take(2)

[('911',
  'Ego Remix',
  'pop',
  'Beyonc',
  '2009',
  ' Verse   Kanye West  I got a big ego  Ha ha ha   I’m such a big ego  Uhuhuh  I got a big   eheheh   ego  she love my big   eheheh   ego So stroke my big   eheheh   ego I like to joke around a little bit  but here we go Welcome to the wonderful world of goldplated Earl  cause Every thing I throw up  blow up Talking to the girl She said  Know what  grow up  you nasty  I don’t understand why they trippin’  if you ask me Flow is just the nicest  I emit the propane I just spit  probably just raised the gas prices Everybody in the club try and get as fresh as me What you want  dawg Tryna stay recession free And spit  refreshingly when I rock the stadium You probably get sweaty you should bring a extra tee Now I’m standing next to Jay who standing next to B You coulda been anywhere in the world  but you’re here with me That’s good for my ego  me and my ego And it go wherever we go my ego is my imaginary friend He was with me when I was

# Add the total number of words, and the number of stopwords and profane words

In [23]:
!pip install profanity


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting profanity
  Downloading profanity-1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: profanity
  Building wheel for profanity (setup.py) ... [?25l[?25hdone
  Created wheel for profanity: filename=profanity-1.1-py3-none-any.whl size=4253 sha256=468330205085fc1b29cc06d7a68e95e681bea9dfa2fbc3f20d3c22422665387b
  Stored in directory: /root/.cache/pip/wheels/d6/62/ce/2d1388b4998148d18bc3fd287105fe25f63ed695e287928ef1
Successfully built profanity
Installing collected packages: profanity
Successfully installed profanity-1.1


In [24]:
from profanity import profanity

In [25]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [27]:
english_stopwprds = stopwords.words('english')

In [28]:
#use the positive word list from assignment 4
poswordsR = sc.textFile(myPath + "pos.txt")
poswordsR.take(5)

['abound', 'abounds', 'abundance', 'abundant', 'accessable']

In [29]:
#use the negative word list from assignment 4
negwordsR = sc.textFile(myPath + "neg.txt")
negwordsR.take(5)

['abnormal', 'abolish', 'abominable', 'abominably', 'abominate']

In [30]:
#get python lists of positive an negative words from input files
poswords = poswordsR.collect()
negwords = negwordsR.collect()

In [31]:
#create a function to calculate the number of words in the song
getLength = lambda x: len(x.split())

In [32]:
#create a function to calculate the number of stop words in the song
getStopCount = lambda x: len([x.split()[stopword] for stopword in range(len(x.split())) if x.split()[stopword] in english_stopwprds])

In [33]:
#create a function to calculate the number of profane words in the song
getProfaneCount = lambda x: len([x.split()[badword] for badword in range(len(x.split())) if profanity.contains_profanity(x.split()[badword]) == True])

In [34]:
#create a function to measure the amount of repetition in a song
getUniqueCount = lambda x: len(set([x.split()[unique] for unique in range(len(x.split())) if x.split()[unique] not in english_stopwprds]))

In [35]:
#create a function to calculate the number of positive words in the song
getPositiveCount = lambda x: len([x.split()[posword] for posword in range(len(x.split())) if x.split()[posword] in poswords])

In [36]:
#create a function to calculate the number of positive words in the song
getNegativeCount = lambda x: len([x.split()[negword] for negword in range(len(x.split())) if x.split()[negword] in negwords])

In [37]:
checkR = wordsR2.map(lambda x: (x[0], x[2], x[1], x[3], x[4], x[5]))
checkR.take(2)

[('911',
  'pop',
  'Ego Remix',
  'Beyonc',
  '2009',
  ' Verse   Kanye West  I got a big ego  Ha ha ha   I’m such a big ego  Uhuhuh  I got a big   eheheh   ego  she love my big   eheheh   ego So stroke my big   eheheh   ego I like to joke around a little bit  but here we go Welcome to the wonderful world of goldplated Earl  cause Every thing I throw up  blow up Talking to the girl She said  Know what  grow up  you nasty  I don’t understand why they trippin’  if you ask me Flow is just the nicest  I emit the propane I just spit  probably just raised the gas prices Everybody in the club try and get as fresh as me What you want  dawg Tryna stay recession free And spit  refreshingly when I rock the stadium You probably get sweaty you should bring a extra tee Now I’m standing next to Jay who standing next to B You coulda been anywhere in the world  but you’re here with me That’s good for my ego  me and my ego And it go wherever we go my ego is my imaginary friend He was with me when I was

In [38]:
extrColsR = wordsR2.filter(lambda x: getLength(x[5]) > 0).map(lambda x: (x[0],x[2],x[1],x[3],x[4], getLength(x[5]), getUniqueCount(x[5]), round((1-getUniqueCount(x[5])/getLength(x[5])),2), getStopCount(x[5]), round((getStopCount(x[5])/getLength(x[5])),2), getProfaneCount(x[5]) ,round((getProfaneCount(x[5])/getLength(x[5])),2), getPositiveCount(x[5]),round((getPositiveCount(x[5])/getLength(x[5])),2), getNegativeCount(x[5]),round((getNegativeCount(x[5])/getLength(x[5])),2), x[5]))
extrColsR.take(2)

[('911',
  'pop',
  'Ego Remix',
  'Beyonc',
  '2009',
  662,
  202,
  0.69,
  243,
  0.37,
  0,
  0.0,
  45,
  0.07,
  13,
  0.02,
  ' Verse   Kanye West  I got a big ego  Ha ha ha   I’m such a big ego  Uhuhuh  I got a big   eheheh   ego  she love my big   eheheh   ego So stroke my big   eheheh   ego I like to joke around a little bit  but here we go Welcome to the wonderful world of goldplated Earl  cause Every thing I throw up  blow up Talking to the girl She said  Know what  grow up  you nasty  I don’t understand why they trippin’  if you ask me Flow is just the nicest  I emit the propane I just spit  probably just raised the gas prices Everybody in the club try and get as fresh as me What you want  dawg Tryna stay recession free And spit  refreshingly when I rock the stadium You probably get sweaty you should bring a extra tee Now I’m standing next to Jay who standing next to B You coulda been anywhere in the world  but you’re here with me That’s good for my ego  me and my ego And

In [39]:
labeledR = extrColsR.map(lambda p: Row(track_id=p[0], genre=p[1], title=p[2], artist=p[3],year=p[4], word_count=p[5],unique_words=p[6], repetition_pct=p[7],stopword_count=p[8],stopword_pct=p[9],profanity_count=p[10],profanity_pct=p[11],positive_count=p[12],positive_pct=p[13],negative_count=p[14],negative_pct=p[15], lyrics=p[16]))
labeledR.take(2)

[Row(track_id='911', genre='pop', title='Ego Remix', artist='Beyonc', year='2009', word_count=662, unique_words=202, repetition_pct=0.69, stopword_count=243, stopword_pct=0.37, profanity_count=0, profanity_pct=0.0, positive_count=45, positive_pct=0.07, negative_count=13, negative_pct=0.02, lyrics=' Verse   Kanye West  I got a big ego  Ha ha ha   I’m such a big ego  Uhuhuh  I got a big   eheheh   ego  she love my big   eheheh   ego So stroke my big   eheheh   ego I like to joke around a little bit  but here we go Welcome to the wonderful world of goldplated Earl  cause Every thing I throw up  blow up Talking to the girl She said  Know what  grow up  you nasty  I don’t understand why they trippin’  if you ask me Flow is just the nicest  I emit the propane I just spit  probably just raised the gas prices Everybody in the club try and get as fresh as me What you want  dawg Tryna stay recession free And spit  refreshingly when I rock the stadium You probably get sweaty you should bring a ex

In [40]:
%%time
labeledDF = spark.createDataFrame(labeledR)
labeledDF.show(5)

+--------+-----+-------------------+-------------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|track_id|genre|              title|             artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|              lyrics|
+--------+-----+-------------------+-------------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     911|  pop|          Ego Remix|             Beyonc|2009|       662|         202|          0.69|           243|        0.37|              0|          0.0|            45|        0.07|            13|        0.02| Verse   Kanye We...|
|    1321|  pop|Keep It Goin Louder|        Major Lazer|

In [41]:
import pandas as pd

# Save the data to a file so that it will not have to be recalculated every time

In [None]:
#save the enhanced data frame to a csv file so that we will not have to recalulate it every time
%%time
labeledPD = labeledDF.toPandas()

In [None]:
labeledPD.head(2)

In [None]:
%%time
labeledPD.to_csv(myPath + myOutFile)