# Genre Classification

In [None]:
# Installing pyspark on local machine
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Importing installed pyspark
import pyspark

In [None]:
# Importing PySpark Session 
from pyspark.sql import SparkSession

In [None]:
# Build the SparkSession for this notebook (or reuse one that already exists)
# and register the application under the name 'Project'.
spark = (
    SparkSession.builder
    .appName('Project')
    .getOrCreate()
)

In [None]:
# Evaluate the session object as the cell's last expression so the notebook displays it
spark

# Reading the data and removing structural errors, if any

In [None]:
# Load the Spotify CSV into a DataFrame; the 'header' option makes Spark take the
# column names from the first line of the file instead of generating them.
music = (
    spark.read
    .option('header', 'true')
    .csv(r'/content/final_spotify.csv')
)

In [None]:
# Evaluate the DataFrame as the cell's last expression to display its column names and types
music

DataFrame[index_id: string, track_id: string, artists: string, album_name: string, track_name: string, popularity: string, duration_ms: string, explicit: string, danceability: string, energy: string, key: string, loudness: string, mode: string, speechiness: string, acousticness: string, instrumentalness: string, liveness: string, valence: string, tempo: string, time_signature: string, track_genre: string, spotify_release_date: string]

In [None]:
# Preview the first rows of the loaded DataFrame to check that the header names were applied
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|spotify_release_date|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|       0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   FALSE|       0.676| 0.461|  1|  -6.746| 

In [None]:
# Print the schema of the dataset; the CSV was read without a schema, so every
# column is reported as a string at this point
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: string (nullable = true)



# TYPECASTING: converting the columns to appropriate data types

In [None]:
# Cast the string columns read from the CSV to the Spark SQL types used downstream.
from pyspark.sql.types import BooleanType, DateType, DoubleType, IntegerType

# Column name -> target type, listed in the order in which the casts are applied.
column_types = {
    "popularity": IntegerType(),
    "duration_ms": DoubleType(),
    "explicit": BooleanType(),
    "danceability": DoubleType(),
    "energy": DoubleType(),
    "key": IntegerType(),
    "loudness": DoubleType(),
    "mode": BooleanType(),
    "speechiness": DoubleType(),
    "acousticness": DoubleType(),
    "instrumentalness": DoubleType(),
    "liveness": DoubleType(),
    "valence": DoubleType(),
    "spotify_release_date": DateType(),
    "tempo": DoubleType(),
    "time_signature": DoubleType(),
}

# withColumn with an existing column name replaces that column in place,
# so the column order of `music` is unchanged.
for column_name, target_type in column_types.items():
    music = music.withColumn(column_name, music[column_name].cast(target_type))

In [None]:
# Record the number of rows before any null handling, then display it
original_count = music.count()
original_count

114000

In [None]:
# Print the schema again to verify that the casts above changed the column data types
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: boolean (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: date (nullable = true)



# REMOVING THE NULL VALUES

In [None]:
# Drop every row that contains a null in any column and keep the result.
# Spark DataFrames are immutable: `na.drop` returns a new DataFrame, so it has to be
# assigned back to `music`; otherwise the null rows remain in the data used below.
music = music.na.drop(how='any')

# Display the number of rows that remain after the nulls were removed
music.count()


113865

In total, 135 rows containing at least one null value are dropped (114000 − 113865 = 135).

135

In [None]:
# Show rows of the DataFrame with truncation of long cell values turned off
music.show(truncate=False)

+--------+----------------------+------------------------------------+------------------------------------------------------+--------------------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|track_id              |artists                             |album_name                                            |track_name                      |popularity|duration_ms|explicit|danceability|energy|key|loudness|mode |speechiness|acousticness|instrumentalness|liveness|valence|tempo  |time_signature|track_genre|spotify_release_date|
+--------+----------------------+------------------------------------+------------------------------------------------------+--------------------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------

In [None]:
# Check whether the data types were successfully converted
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: boolean (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: date (nullable = true)



# FEATURE EXTRACTION 

In [None]:
from pyspark.sql.functions import col

# Feature frame: remove the identifier, text, label and other unused columns,
# leaving only the columns that are fed to the vector assembler.
X = music.drop(
    "track_id",
    "index_id",
    "artists",
    "album_name",
    "track_name",
    "track_genre",
    "duration_ms",
    "explicit",
    "key",
    "mode",
    "spotify_release_date",
)

# Make every remaining feature column a double
X = X.select(*(col(name).cast("double") for name in X.columns))

# Target frame: the genre column on its own
y = music.select("track_genre")

In [None]:
# Confirm that the feature frame only holds the selected columns, all cast to double
X.printSchema()

root
 |-- popularity: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)



# Assemble and Scale the Data

In [None]:
from pyspark.ml.linalg import Vector 
from pyspark.ml.feature import VectorAssembler

In [None]:
# Assembler that packs the listed feature columns into one vector column named "features"
assembler = VectorAssembler(
    inputCols=[
        'popularity',
        'danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'tempo',
        'time_signature',
    ],
    outputCol="features",
)

In [None]:
# Configure the assembler to skip rows with invalid (e.g. null) feature values;
# the setter updates the assembler itself, so the setting also applies to later uses.
assembler.setHandleInvalid("skip")

# Add the "features" vector column to the feature frame
assembled_data = assembler.transform(X)

In [None]:
# Inspect the assembled rows, showing the full "features" vector without truncation
assembled_data.show(truncate=False)

+----------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------+------------------------------------------------------------------------+
|popularity|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |time_signature|features                                                                |
+----------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------+------------------------------------------------------------------------+
|73.0      |0.676       |0.461 |-6.746  |0.143      |0.0322      |1.01E-6         |0.358   |0.715  |87.917 |4.0           |[73.0,0.676,0.461,-6.746,0.143,0.0322,1.01E-6,0.358,0.715,87.917,4.0]   |
|55.0      |0.42        |0.166 |-17.235 |0.0763     |0.924       |5.56E-6         |0.101   |0.267  |77.489 |4.0           |[55.0,0.42,0.166,-17.235,0.0763,0.924,5.56E-6,0.101,0.267,77.489,4.0]   |
|57.0      |0.4

In [None]:
from pyspark.ml.feature import StandardScaler

In [None]:
# Scaler that reads the 'features' vector column and writes the result to 'standardized'
scale = StandardScaler(
    inputCol='features',
    outputCol='standardized',
)

In [None]:
# Fit the scaler on the assembled feature vectors.
# NOTE(review): no cell visible in this notebook calls scaleModel.transform(...), so the
# 'standardized' column is never produced and the models below use the unscaled columns.
scaleModel = scale.fit(assembled_data)

# Model Selection and Fitting



# Random Forest Algorithm

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
import time

# Drop the irrelevant columns
data = music.drop('index_id','track_id', 'artists', 'album_name','mode',
                             'track_name','key','duration_ms','explicit', 'spotify_release_date')

# Convert the genre column to numerical labels
labelIndexer = StringIndexer(inputCol='track_genre', outputCol='label').fit(data)
data = labelIndexer.transform(data)
data = data.drop("track_genre")

# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=123)

# Assemble the features into a vector.
# The 'label' column must NOT be part of the inputs: `data.columns` contains it after
# the StringIndexer step, and feeding it to the assembler leaks the target into the
# feature vector, which makes the reported accuracy meaningless.
feature_cols = [c for c in data.columns if c != 'label']
# "skip" is configured explicitly on the assembler so that rows with invalid (e.g. null)
# feature values are dropped from both the training and the test set.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features').setHandleInvalid("skip")

trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)
# Kept for backward compatibility: the assembled training set under its previous name
assembled_data = trainingData

# Train the Random Forest model
rf = RandomForestClassifier(numTrees=20,maxBins=200, featureSubsetStrategy='sqrt', labelCol='label')
start_time=time.time()
rfModel = rf.fit(trainingData)
end_time=time.time()
# Wall-clock duration of the fit call only
fitting_time=end_time-start_time

# Make predictions on the test data
predictions = rfModel.transform(testData)

# Evaluate the accuracy of the model
evaluator = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Time taken to fit the model: {:.2f} seconds".format(fitting_time))


Accuracy: 46.63%
Time taken to fit the model: 18.62 seconds


# Logistic Regression Algorithm

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

# Drop the irrelevant columns (each name listed once)
data = music.drop("track_id", "artists", "album_name", "track_name", "spotify_release_date",
                  'index_id', 'mode', 'key', 'duration_ms', 'explicit')

# Convert the genre column to numerical labels
labelIndexer = StringIndexer(inputCol="track_genre", outputCol="label").fit(data)
data = labelIndexer.transform(data)
data=data.drop("track_genre")
# Split the data into training and test sets
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed=123)

# Assemble the feature columns into a vector.
# The "label" column must NOT be part of the inputs: `data.columns` contains it after
# the StringIndexer step, and feeding it to the assembler leaks the target into the
# feature vector.
feature_cols = [c for c in data.columns if c != "label"]
# "skip" is configured explicitly on the assembler so that rows with invalid (e.g. null)
# feature values are dropped from both the training and the test set.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features").setHandleInvalid("skip")

trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)
# Kept for backward compatibility: the assembled training set under its previous name
assembled_data = trainingData


# Train a Multiclass Logistic Regression model
# NOTE(review): regParam=0.3 with elasticNetParam=0.8 is a strong, mostly-L1 penalty and
# maxIter=10 is low; the previously recorded accuracy of this cell (0.76%) suggests the
# model may be underfitting -- consider tuning these values.
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial", labelCol="label")
start_time=time.time()
lrModel = lr.fit(trainingData)
end_time=time.time()
# Compute the fit duration for THIS model; without this line the cell prints the
# stale `fitting_time` left over from an earlier cell.
fitting_time=end_time-start_time

# Make predictions on the test set
predictions = lrModel.transform(testData)

# Evaluate the performance of the model using accuracy
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Time taken to fit the model: {:.2f} seconds".format(fitting_time))


Accuracy: 0.76%
Time taken to fit the model: 18.62 seconds
