# Preliminaries

## To set up the required PySpark and Spark NLP versions, follow the example at:
https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/stemmer/Word_Stemming_with_Stemmer.ipynb

In [1]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2023-04-22 17:03:50--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2023-04-22 17:03:51--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1191 (1.2K) [text/plain]
Saving to: ‘STDOUT’


2023-04-22 17:03:51 (33.0 MB/s) - written to stdout [1191/1191]

Installing PySpark 3.2.3 and Spark NLP 4.4.0
setup Colab for PySpark 3.2.3 and Spark NLP 4

In [2]:
from pyspark.ml import Pipeline

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

# Create the Spark session through Spark NLP's start() helper.
spark = sparknlp.start()

# Record the library versions in the cell output for reproducibility.
print(f"Spark NLP version:  {sparknlp.version()}")
print(f"Apache Spark version:  {spark.version}")

# Last expression of the cell: shown as the cell's result.
spark

Spark NLP version:  4.4.0
Apache Spark version:  3.2.3


## Add Spark SQL import statements used in class

In [3]:
import pyspark.sql
from pyspark.sql import Row
from pyspark.sql.types import *
import json

## Add additional import statements from tutorial examples at: 
https://www.youtube.com/watch?v=Qsx6Endfvbg

In [4]:
import pandas as pd
import numpy as np
from pyspark.ml.classification import *
#from pyspark.ml.feature import *
#do not import the tokenizer and stop word remover from ml since it conflicts with the one from nlp
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

# Import data

In [6]:
# Base folder (under the Google Drive mount point) that holds the project's input files;
# the CSV loaded later is addressed as myPath + file name.
myPath = "/content/drive/My Drive/Project_Work/"

In [7]:
# Mount Google Drive at /content/drive so the files under myPath can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Import every type this cell uses explicitly, so the cell does not depend on
# an earlier wildcard import for IntegerType / FloatType.
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Column layout of spotify_with_word_counts.csv as (name, Spark type), in file order.
_SCHEMA_COLUMNS = [
    ("track_id", StringType),
    ("track_name", StringType),
    ("track_artist", StringType),
    ("track_popularity", IntegerType),
    ("track_album_id", StringType),
    ("track_album_name", StringType),
    ("track_album_release_date", StringType),
    ("playlist_name", StringType),
    ("playlist_id", StringType),
    ("playlist_genre", StringType),
    ("playlist_subgenre", StringType),
    ("danceability", FloatType),
    ("energy", FloatType),
    ("key", IntegerType),
    ("loudness", FloatType),
    ("mode", IntegerType),
    ("speechiness", FloatType),
    ("acousticness", FloatType),
    ("instrumentalness", FloatType),
    ("liveness", FloatType),
    ("valence", FloatType),
    ("tempo", FloatType),
    ("duration_ms", IntegerType),
    ("language", StringType),
    ("label", IntegerType),
    ("year", IntegerType),
    ("minutes", FloatType),
    ("word_count", IntegerType),
    ("words_per_minute", FloatType),
    ("repetition_pct", FloatType),
    ("stopword_pct", FloatType),
    ("profanity_pct", FloatType),
    ("positive_pct", FloatType),
    ("negative_pct", FloatType),
    ("Sentiment", IntegerType),
    ("words_only_lyrics", StringType),
]

# Fields are declared nullable: the CSV reader does not enforce nullable=False
# (printSchema() reports every column as nullable = true), so missing values
# must be filtered explicitly downstream rather than assumed absent.
custom_schema = StructType(
    [StructField(col_name, col_type(), True) for col_name, col_type in _SCHEMA_COLUMNS]
)

# Read the CSV with the explicit schema; the first line of the file is a header row.
rawdataDF = (
    spark.read.format("csv")
    .schema(custom_schema)
    .option("header", True)
    .load(myPath + "spotify_with_word_counts.csv")
)
rawdataDF.show(3)

+--------------------+--------------------+------------+----------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------+-----+----+-------+----------+----------------+--------------+------------+-------------+------------+------------+---------+--------------------+
|            track_id|          track_name|track_artist|track_popularity|      track_album_id|    track_album_name|track_album_release_date|       playlist_name|         playlist_id|playlist_genre|playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|language|label|year|minutes|word_count|words_per_minute|repetition_pct|stopword_pct|profanity_pct|positive_pct|negative_pct|Sentiment|   words_only_lyrics|
+--------------------+--

In [9]:
# Print each column's name, type and nullability as Spark actually loaded it.
rawdataDF.printSchema()

root
 |-- track_id: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- track_artist: string (nullable = true)
 |-- track_popularity: integer (nullable = true)
 |-- track_album_id: string (nullable = true)
 |-- track_album_name: string (nullable = true)
 |-- track_album_release_date: string (nullable = true)
 |-- playlist_name: string (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- playlist_genre: string (nullable = true)
 |-- playlist_subgenre: string (nullable = true)
 |-- danceability: float (nullable = true)
 |-- energy: float (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: float (nullable = true)
 |-- mode: integer (nullable = true)
 |-- speechiness: float (nullable = true)
 |-- acousticness: float (nullable = true)
 |-- instrumentalness: float (nullable = true)
 |-- liveness: float (nullable = true)
 |-- valence: float (nullable = true)
 |-- tempo: float (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- l

In [41]:
# Register the loaded DataFrame as the temp view "model_input" that the spark.sql(...) cells query.
# NOTE(review): this cell's saved execution count is out of sequence with its neighbours;
# confirm the notebook still runs top to bottom on a fresh kernel.
rawdataDF.createOrReplaceTempView("model_input")

In [13]:
# Keep only rows that have both a label and lyrics text.
model_inputDF = spark.sql(
    "SELECT * FROM model_input a "
    "WHERE label is not null and words_only_lyrics is not null"
)
model_inputDF.show(3)

+--------------------+--------------------+------------+----------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------+-----------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------+-----+----+-------+----------+----------------+--------------+------------+-------------+------------+------------+---------+--------------------+
|            track_id|          track_name|track_artist|track_popularity|      track_album_id|    track_album_name|track_album_release_date|       playlist_name|         playlist_id|playlist_genre|playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|language|label|year|minutes|word_count|words_per_minute|repetition_pct|stopword_pct|profanity_pct|positive_pct|negative_pct|Sentiment|   words_only_lyrics|
+--------------------+--

# Perform pre-processing of data using a Spark NLP pipeline

In [14]:
%%time
documentAssembler = DocumentAssembler()\
    .setInputCol("words_only_lyrics")\
    .setOutputCol("document")

sentence = SentenceDetector()\
    .setInputCols(["document"]) \
    .setOutputCol("sentences")

tokenizer = Tokenizer()\
    .setInputCols(["sentences"]) \
    .setOutputCol("token")

stopWords = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained()\
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

normalizer  = Normalizer()\
    .setInputCols(["lemma"]) \
    .setOutputCol("normalized") \
    .setLowercase(True)

finisher = Finisher()\
    .setInputCols(["normalized"]) \
    .setOutputCols("normalized")\
    .setOutputAsArray(True)

count_vectorizer = CountVectorizer(
    inputCol="normalized",
    outputCol="tf"
)

tf_idf = IDF(
    inputCol="tf",
    outputCol="tf_idf"
)

#stemmer not used this time (bc lemmatizer used instead) but save syntax for later
#stemmer = Stemmer() \
    #.setInputCols(["token"]) \
    #.setOutputCol("stem")



lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
CPU times: user 181 ms, sys: 27.8 ms, total: 209 ms
Wall time: 17 s


In [15]:
# Chain the text stages in the order their input/output columns require.
nlpPipeline = Pipeline(
    stages=[
        documentAssembler,  # words_only_lyrics -> document
        sentence,           # document -> sentences
        tokenizer,          # sentences -> token
        stopWords,          # token -> cleanTokens
        lemmatizer,         # cleanTokens -> lemma
        normalizer,         # lemma -> normalized
        finisher,           # normalized annotations -> normalized array
        count_vectorizer,   # normalized -> tf
        tf_idf,             # tf -> tf_idf
    ]
)

In [16]:
#add additional feature columns to the cleaned text that is output from the nlp pipeline
%%time
xtra_cols = VectorAssembler(inputCols=['tf_idf','word_count'], outputCol='features')

CPU times: user 1.65 ms, sys: 0 ns, total: 1.65 ms
Wall time: 13 ms


In [17]:
# Full feature pipeline: the text stages followed by the assembler that adds word_count.
pipeline = Pipeline(stages=[nlpPipeline, xtra_cols])

In [18]:
#create feature vectors for all of the model input
%%time
processed = pipeline.fit(model_inputDF).transform(model_inputDF)

CPU times: user 2.63 s, sys: 319 ms, total: 2.95 s
Wall time: 5min 24s


In [19]:
# Preview the transformed rows, truncating each cell to 40 characters.
processed.show(truncate=40)

+----------------------+---------------------------------+---------------------+----------------+----------------------+--------------------------------------+------------------------+----------------------------------------+----------------------+--------------+-------------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------+-----+----+-------+----------+----------------+--------------+------------+-------------+------------+------------+---------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+----------------------------------------+
|              track_id|                       track_name|         track_artist|track_popularity|        track_album_id|                      track_album_name|track_album_release_date|                           playlist_name|           playlist_id|playli

In [33]:
# Row count per genre/label pair among the rows usable for modelling.
class_sizes = spark.sql(
    "SELECT playlist_genre, label, count(1) "
    "FROM model_input a "
    "WHERE label is not null and words_only_lyrics is not null "
    "GROUP BY playlist_genre, label "
    "ORDER BY label"
)
class_sizes.show(10)

+--------------+-----+--------+
|playlist_genre|label|count(1)|
+--------------+-----+--------+
|          rock|    0|    3274|
|           pop|    1|    3735|
|           r&b|    2|    3131|
|           rap|    3|    2498|
|           edm|    4|    1758|
+--------------+-----+--------+



In [54]:
# Balance the label classes if needed: one sampling fraction per genre.
# All fractions are currently 1, which keeps every row of every genre.
p1 = p2 = p3 = p4 = p5 = 1
genre_fractions = {'rock': p1, 'pop': p2, 'r&b': p3, 'rap': p4, 'edm': p5}
balanced = processed.sampleBy("playlist_genre", fractions=genre_fractions, seed=0)

In [55]:
# Expose the sampled DataFrame to Spark SQL under the view name "balanced".
balanced.createOrReplaceTempView("balanced")

In [57]:
# Same per-genre/label count as before, now on the sampled "balanced" view.
class_sizes2 = spark.sql(
    "SELECT playlist_genre, label, count(1) "
    "FROM balanced a "
    "WHERE label is not null and words_only_lyrics is not null "
    "GROUP BY playlist_genre, label "
    "ORDER BY label"
)
class_sizes2.show(10)

+--------------+-----+--------+
|playlist_genre|label|count(1)|
+--------------+-----+--------+
|          rock|    0|    3274|
|           pop|    1|    3735|
|           r&b|    2|    3131|
|           rap|    3|    2498|
|           edm|    4|    1758|
+--------------+-----+--------+



In [58]:
# Split the data into training and test sets with weights 0.8 / 0.2;
# the seed is fixed so the split can be repeated.
train, test = balanced.randomSplit([0.8,0.2], seed=123)

# Create a Naive Bayes model on the raw lyric data 

In [59]:
# Declare the Naive Bayes classifier; it reads the assembled 'features' column.
# No label column is passed, so the estimator's default is used.
naive_bayes = NaiveBayes(featuresCol='features')

In [60]:
#fit the nb model to the processed model
%%time
model = naive_bayes.fit(train)

CPU times: user 1.12 s, sys: 112 ms, total: 1.23 s
Wall time: 2min 37s


In [61]:
#create model predictions on the test data
%%time
results = model.transform(test)

CPU times: user 7.11 ms, sys: 0 ns, total: 7.11 ms
Wall time: 127 ms


In [62]:
# Preview the first 5 rows of the test set with the model's output columns appended.
results.show(5)

+--------------------+--------------------+--------------+----------------+--------------------+--------------------+------------------------+--------------------+--------------------+--------------+--------------------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------+-----+----+-------+----------+----------------+--------------+------------+-------------+------------+------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|            track_id|          track_name|  track_artist|track_popularity|      track_album_id|    track_album_name|track_album_release_date|       playlist_name|         playlist_id|playlist_genre|   playlist_subgenre|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|duration_ms|language|label|year|minutes|word_

In [63]:
# Collect labels and predictions together in a single Spark job. Both values
# then come from the same row, and the uncached lineage is evaluated once
# instead of once per column.
label_prediction_rows = results.select('label', 'prediction').collect()

# Unpack the Row objects into flat lists of integer class ids. The prediction
# column holds whole-number doubles, so int() is lossless here.
y_true = [int(row['label']) for row in label_prediction_rows]
y_pred = [int(row['prediction']) for row in label_prediction_rows]

In [64]:
# Sanity check: the collected labels are held in a plain Python list.
type(y_true)

list

In [65]:
from sklearn.metrics import classification_report

In [66]:
# Genre names in label-id order (0=rock, 1=pop, 2=r&b, 3=rap, 4=edm), matching
# the genre/label pairs shown in the class-size tables.
target_names = ['rock','pop','r&b','rap','edm']

# Pass the label ids explicitly so each name stays attached to its id, and the
# report still builds if a class is absent from the test split or predictions.
print(classification_report(
    y_true,
    y_pred,
    labels=list(range(len(target_names))),
    target_names=target_names,
))

              precision    recall  f1-score   support

        rock       0.54      0.62      0.58       685
         pop       0.44      0.40      0.42       729
         r&b       0.52      0.53      0.52       649
         rap       0.72      0.74      0.73       525
         edm       0.30      0.25      0.27       325

    accuracy                           0.52      2913
   macro avg       0.50      0.51      0.50      2913
weighted avg       0.52      0.52      0.52      2913

