# Preliminaries

## To set up required PySpark and Spark versions, use examples given at:
https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb

In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2023-04-12 16:48:15--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2023-04-12 16:48:16--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2023-04-12 16:48:17--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:44

In [None]:
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

# Create the Spark session through Spark NLP's start() helper.
spark = sparknlp.start()

# Record the library versions this run used, for reproducibility.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

# Bare last expression so the notebook displays the session object.
spark

Spark NLP version:  4.4.0
Apache Spark version:  3.2.3


In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
runtime_msg = (
    'Not using a high-RAM runtime'
    if ram_gb < 20
    else 'You are using a high-RAM runtime!'
)
print(runtime_msg)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


## Additional import statements

In [None]:
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
import json

In [None]:
import pandas as pd
import numpy as np
from pyspark.ml.classification import *
#from pyspark.ml.feature import *
#do not import the tokenizer and stop word remover from ml since it conflicts with the one from nlp
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

In [None]:
from sklearn.metrics import classification_report

# Import data

In [None]:
# Google Drive folder that holds the per-genre CSV files read below; it sits
# under /content/drive, which the next cell mounts.
myPath = "/content/drive/My Drive/H516/genius_with_counts/"

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files under myPath can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# IntegerType and FloatType are imported explicitly so this cell no longer
# depends on the earlier `from pyspark.sql.types import *` cell having run.
from pyspark.sql.types import (
    StructField, StructType, StringType, LongType, IntegerType, FloatType
)

# Explicit schema shared by every per-genre CSV: song metadata, the
# pre-computed count/percentage columns, and the cleaned lyric text.
# NOTE: the nullable=False flags are not reflected in the loaded data —
# printSchema() on the combined frame reports nullable = true for every
# column — so downstream queries still need their own null checks.
custom_schema = StructType([
    StructField("record", IntegerType(), False),
    StructField("id", StringType(), False),
    StructField("genre", StringType(), False),
    StructField("title", StringType(), False),
    StructField("artist", StringType(), False),
    StructField("year", IntegerType(), False),
    StructField("word_count", IntegerType(), False),
    StructField("unique_words", IntegerType(), False),
    StructField("repetition_pct", FloatType(), False),
    StructField("stopword_count", IntegerType(), False),
    StructField("stopword_pct", FloatType(), False),
    StructField("profanity_count", IntegerType(), False),
    StructField("profanity_pct", FloatType(), False),
    StructField("positive_count", IntegerType(), False),
    StructField("positive_pct", FloatType(), False),
    StructField("negative_count", IntegerType(), False),
    StructField("negative_pct", FloatType(), False),
    StructField("words_only_lyrics", StringType(),False)
])

In [None]:
# Read the country lyrics CSV using the shared schema; the file's first row
# is a header line.
rawdata_country = spark.read.csv(
    myPath + 'genius_wCounts_country.csv',
    schema=custom_schema,
    header=True,
)
# Preview the first three rows.
rawdata_country.show(3)

+------+----+-------+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record|  id|  genre|              title|      artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+----+-------+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0|2504|country|1 Corinthians 15:55| Johnny Cash|2010|       214|          76|          0.64|            90|        0.42|              0|          0.0|             9|        0.04|             5|        0.02|O Death  where is...|
|     1|2518|country| Bullets In The Gun|  Toby Keith|2010|     

In [None]:
# Read the pop lyrics CSV using the shared schema; the file's first row is a
# header line.
rawdata_pop = spark.read.csv(
    myPath + 'genius_wCounts_pop.csv',
    schema=custom_schema,
    header=True,
)
# Preview the first three rows.
rawdata_pop.show(3)

+------+----+-----+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record|  id|genre|              title|      artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+----+-----+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0| 911|  pop|          Ego Remix|      Beyonc|2009|       662|         202|          0.69|           243|        0.37|              0|          0.0|            45|        0.07|            13|        0.02| Verse   Kanye We...|
|     1|1321|  pop|Keep It Goin Louder| Major Lazer|2009|       510|    

In [None]:
# Read the rock lyrics CSV using the shared schema; the file's first row is a
# header line.
rawdata_rock = spark.read.csv(
    myPath + 'genius_wCounts_rock.csv',
    schema=custom_schema,
    header=True,
)
# Preview the first three rows.
rawdata_rock.show(3)

+------+----+-----+--------------------+-----------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record|  id|genre|               title|     artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+----+-----+--------------------+-----------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0|1181| rock|Neighborhood 2 Laïka|Arcade Fire|2005|       199|          64|          0.68|            74|        0.37|              0|          0.0|             3|        0.02|             2|        0.01| Verse   Alexande...|
|     1|1060| rock|    Hotel California|     Eagles|1976|       352|    

In [None]:
# Read the R&B lyrics CSV using the shared schema; the file's first row is a
# header line.
rawdata_rb = spark.read.csv(
    myPath + 'genius_wCounts_rb.csv',
    schema=custom_schema,
    header=True,
)
# Preview the first three rows.
rawdata_rb.show(3)

+------+---+-----+-----------------+--------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record| id|genre|            title|  artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+---+-----+-----------------+--------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0|235|   rb|   Miss You Remix| Aaliyah|2003|       733|         269|          0.63|           282|        0.38|              1|          0.0|            11|        0.02|            25|        0.03| Intro  JayZ    A...|
|     1|664|   rb|         Ringtone|R. Kelly|2007|       377|         175|          0.54|           

In [None]:
# Read the rap lyrics CSV using the shared schema; the file's first row is a
# header line.
rawdata_rap = spark.read.csv(
    myPath + 'genius_wCounts_rap.csv',
    schema=custom_schema,
    header=True,
)
# Preview the first three rows.
rawdata_rap.show(3)

+------+----+-----+--------------------+-------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record|  id|genre|               title| artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+----+-----+--------------------+-------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0|  11|  rap|       Lord You Know|Cam'ron|2004|       831|         319|          0.62|           280|        0.34|              7|         0.01|            25|        0.03|            36|        0.04| Chorus  Jaheim  ...|
|     1|  18|  rap|Its Hot Some Like...|  JAY-Z|1999|       645|         208|          0

In [None]:
# Stack the five per-genre frames into one. union() matches columns by
# position, which is safe here because every frame was read with the same
# schema.
all_genresDF = (
    rawdata_country
    .union(rawdata_pop)
    .union(rawdata_rock)
    .union(rawdata_rb)
    .union(rawdata_rap)
)
# Preview the first three rows of the combined frame.
all_genresDF.show(3)

+------+----+-------+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|record|  id|  genre|              title|      artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|
+------+----+-------+-------------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+
|     0|2504|country|1 Corinthians 15:55| Johnny Cash|2010|       214|          76|          0.64|            90|        0.42|              0|          0.0|             9|        0.04|             5|        0.02|O Death  where is...|
|     1|2518|country| Bullets In The Gun|  Toby Keith|2010|     

In [None]:
# Print the column names, types and nullability of the combined frame.
all_genresDF.printSchema()

root
 |-- record: integer (nullable = true)
 |-- id: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- title: string (nullable = true)
 |-- artist: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- word_count: integer (nullable = true)
 |-- unique_words: integer (nullable = true)
 |-- repetition_pct: float (nullable = true)
 |-- stopword_count: integer (nullable = true)
 |-- stopword_pct: float (nullable = true)
 |-- profanity_count: integer (nullable = true)
 |-- profanity_pct: float (nullable = true)
 |-- positive_count: integer (nullable = true)
 |-- positive_pct: float (nullable = true)
 |-- negative_count: integer (nullable = true)
 |-- negative_pct: float (nullable = true)
 |-- words_only_lyrics: string (nullable = true)



In [None]:
# Register the combined frame as the temp view "model_input" so it can be
# queried with spark.sql.
all_genresDF.createOrReplaceTempView("model_input")

In [None]:
# Count usable records per genre: the five target genres only, lyrics
# present, and artists whose name contains "Genius" or "Translations" excluded.
genre_counts_sql = "SELECT genre,  COUNT(1) AS TTL_RCRDS FROM model_input WHERE genre IN('country','pop','rap','rb','rock')  AND words_only_lyrics is not null and artist not like '%Genius%' and artist not like '%Translations%' GROUP BY genre"
lyricGenres = spark.sql(genre_counts_sql)
lyricGenres.show(n=100)

+-------+---------+
|  genre|TTL_RCRDS|
+-------+---------+
|country|    86598|
|    pop|    86575|
|   rock|    86321|
|     rb|    86504|
|    rap|    86433|
+-------+---------+



In [None]:
# Per-genre sampling fractions; change the data volume as needed.
# With every fraction set to 1 the whole data set is kept. To down-sample to
# roughly `volume` rows per genre, divide by the per-genre counts shown above:
#   volume = 60000
#   p1 = volume/86598  (country)    p2 = volume/86575  (pop)
#   p3 = volume/86321  (rock)       p4 = volume/86504  (rb)
#   p5 = volume/86433  (rap)
p1 = p2 = p3 = p4 = p5 = 1

genre_fractions = {
    'country': p1,
    'pop': p2,
    'rock': p3,
    'rb': p4,
    'rap': p5,
}
# Stratified sample on the genre column; the fixed seed keeps it repeatable.
lyricSample = all_genresDF.sampleBy("genre", fractions=genre_fractions, seed=0)

In [None]:
# Register the sampled frame as the temp view "sample" for the SQL queries
# that follow.
lyricSample.createOrReplaceTempView("sample")

In [None]:
# Re-run the per-genre record count on the sampled view, with the same
# filters as the count on the full data, to confirm the class balance.
sample_counts_sql = "SELECT genre,  COUNT(1) AS TTL_RCRDS FROM sample WHERE genre IN('country','pop','rap','rb','rock')  AND words_only_lyrics is not null and artist not like '%Genius%' and artist not like '%Translations%' GROUP BY genre"
sampleGenres = spark.sql(sample_counts_sql)
sampleGenres.show(n=100)

+-------+---------+
|  genre|TTL_RCRDS|
+-------+---------+
|country|    86598|
|    pop|    86575|
|   rock|    86321|
|     rb|    86504|
|    rap|    86433|
+-------+---------+



In [None]:
# Build the model input: keep the five target genres, drop rows with missing
# lyrics or word_count, exclude "Genius"/"Translations" artists, and add an
# integer label (country=0, pop=1, rap=2, rb=3, rock=4; 9 is the unreachable
# fallback given the genre filter).
model_input_sql = "SELECT a.*, CASE WHEN a.genre = 'country' THEN 0 WHEN a.genre = 'pop' THEN 1 WHEN a.genre = 'rap' THEN 2 WHEN a.genre = 'rb' THEN 3 WHEN a.genre = 'rock' THEN 4 ELSE 9 END AS label FROM sample a WHERE genre IN('country','pop','rap','rb','rock') AND words_only_lyrics is not null and word_count is not null and artist not like '%Genius%' and artist not like '%Translations%'"
model_inputDF = spark.sql(model_input_sql)
# Show the default number of rows, truncating long cells at 150 characters.
model_inputDF.show(truncate=150)

+------+-----+-------+-----------------------------+-------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|record|   id|  genre|                        title|       artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|                                                                                                                                     words_only_lyrics|label|
+------+-----+-------+-----------------------------+-------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+------------

# Pre-process the data using a Spark NLP pipeline

In [None]:
# Display names for the classification report. The order must match the
# integer labels assigned in the model_inputDF query
# (country=0, pop=1, rap=2, rb=3, rock=4).
target_names = ['country','pop', 'rap', 'rb', 'rock']

In [None]:
%%time
documentAssembler = DocumentAssembler()\
    .setInputCol("words_only_lyrics")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

tokenizer = Tokenizer()\
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

stopWords = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained()\
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

normalizer  = Normalizer()\
    .setInputCols(["lemma"]) \
    .setOutputCol("normalized") \
    .setLowercase(True)

finisher = Finisher()\
    .setInputCols(["normalized"]) \
    .setOutputCols("normalized")\
    .setOutputAsArray(True)

count_vectorizer = CountVectorizer(
    inputCol="normalized",
    outputCol="tf"
)

tf_idf = IDF(
    inputCol="tf",
    outputCol="tf_idf"
)

#stemmer not used this time (bc lemmatizer used instead) but save syntax for later
#stemmer = Stemmer() \
    #.setInputCols(["token"]) \
    #.setOutputCol("stem")



lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
CPU times: user 60.6 ms, sys: 10.9 ms, total: 71.5 ms
Wall time: 5.32 s


In [None]:
# Text-processing pipeline: Spark NLP annotators first, then the Spark ML
# term-count and TF-IDF stages. The order matters because each stage reads
# the previous stage's output column.
nlp_stages = [
    documentAssembler,
    sentence,
    tokenizer,
    stopWords,
    lemmatizer,
    normalizer,
    finisher,
    count_vectorizer,
    tf_idf,
]
nlpPipeline = Pipeline(stages=nlp_stages)

In [None]:
#add additional feature columns to the cleaned text that is output from the nlp pipeline
%%time
xtra_cols = VectorAssembler(inputCols=['tf_idf','word_count'], outputCol='features')

CPU times: user 1.82 ms, sys: 0 ns, total: 1.82 ms
Wall time: 4.3 ms


In [None]:
# Full feature pipeline: the NLP/TF-IDF pipeline followed by the
# VectorAssembler that builds the final 'features' column.
pipeline = Pipeline(stages=[nlpPipeline, xtra_cols])

In [None]:
#create feature vectors for all of the model input
%%time
processed = pipeline.fit(model_inputDF).transform(model_inputDF)

CPU times: user 9.68 s, sys: 1.16 s, total: 10.8 s
Wall time: 28min 7s


In [None]:
# Preview the processed frame, truncating each cell at 40 characters.
processed.show(truncate=40)

+------+-----+-------+-----------------------------+-------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+----------------------------------------+-----+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|record|   id|  genre|                        title|       artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|                       words_only_lyrics|label|                              normalized|                                      tf|                                  tf_idf|                                features|
+------+-----+-------+-----------------------------+-------------+----+----------+------------+--------------+--------------+---

In [None]:
# Split the processed rows into roughly 70% training and 30% test data; the
# fixed seed makes the split repeatable.
split_weights = [0.7, 0.3]
train, test = processed.randomSplit(split_weights, seed=123)

# Train a Naive Bayes model on the training data and predict classes in the test data

In [None]:
# Declare the Naive Bayes classifier. It reads the assembled 'features'
# column; no label column is given, so the estimator's default is used.
naive_bayes = NaiveBayes(featuresCol='features')

In [None]:
#fit the nb model to the processed model
%%time
model = naive_bayes.fit(train)

CPU times: user 4.92 s, sys: 567 ms, total: 5.48 s
Wall time: 14min 34s


In [None]:
#create model predictions on the test data
%%time
results = model.transform(test)

CPU times: user 4.86 ms, sys: 0 ns, total: 4.86 ms
Wall time: 100 ms


In [None]:
# Preview the first five scored rows, including rawPrediction, probability
# and prediction.
results.show(5)

+------+-----+-------+---------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|record|   id|  genre|          title|      artist|year|word_count|unique_words|repetition_pct|stopword_count|stopword_pct|profanity_count|profanity_pct|positive_count|positive_pct|negative_count|negative_pct|   words_only_lyrics|label|          normalized|                  tf|              tf_idf|            features|       rawPrediction|         probability|prediction|
+------+-----+-------+---------------+------------+----+----------+------------+--------------+--------------+------------+---------------+-------------+--------------+------------+--------------+------------+--------------------+-----+----------------

In [None]:
# Collect label and prediction together in a single job. This keeps the two
# lists row-aligned by construction and evaluates the lazy `results` frame
# once instead of once per column.
label_prediction_rows = results.select('label', 'prediction').collect()

# Unpack the Row objects into plain integer class ids (predictions come back
# as floating-point values) so scikit-learn receives flat 1-D inputs.
y_true = [int(row['label']) for row in label_prediction_rows]
y_pred = [int(row['prediction']) for row in label_prediction_rows]

In [None]:
# Per-genre precision, recall and F1 on the test split; target_names supplies
# the display name for each integer label.
genre_report = classification_report(y_true, y_pred, target_names=target_names)
print(genre_report)

              precision    recall  f1-score   support

     country       0.62      0.69      0.65     25738
         pop       0.37      0.23      0.28     25831
         rap       0.78      0.75      0.76     25831
          rb       0.55      0.59      0.57     26001
        rock       0.49      0.60      0.54     25800

    accuracy                           0.57    129201
   macro avg       0.56      0.57      0.56    129201
weighted avg       0.56      0.57      0.56    129201

