In [1]:
# Installing pyspark on local machine
!pip install pyspark 



In [2]:
# Importing installed pyspark
import pyspark

In [3]:
# Importing PySpark Session 
from pyspark.sql import SparkSession

In [4]:
# Creating Spark and setting a name for the application
spark=SparkSession.builder.appName('Project').getOrCreate()

In [5]:
spark

REMOVING STRUCTURAL ERRORS FROM DATASET

In [6]:
# Reading csv file using spark
music = spark.read.csv(r"final_spotify.csv")

In [7]:
# Displaying the dataset
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|     _c0|                 _c1|                 _c2|                 _c3|                 _c4|       _c5|        _c6|     _c7|         _c8|   _c9|_c10|    _c11|_c12|       _c13|        _c14|            _c15|    _c16|   _c17|   _c18|          _c19|       _c20|                _c21|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy| key|loudne

In [8]:
# Replacing Column number with header name 
music=spark.read.option('header','true').csv(r'final_spotify.csv')

In [9]:
music

DataFrame[index_id: string, track_id: string, artists: string, album_name: string, track_name: string, popularity: string, duration_ms: string, explicit: string, danceability: string, energy: string, key: string, loudness: string, mode: string, speechiness: string, acousticness: string, instrumentalness: string, liveness: string, valence: string, tempo: string, time_signature: string, track_genre: string, spotify_release_date: string]

In [10]:
#displaying the edited file
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|spotify_release_date|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|       0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   FALSE|       0.676| 0.461|  1|  -6.746| 

In [11]:
## Check the schema of the dataset
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: string (nullable = true)



TYPECASTING to use data in valid datatype

In [12]:
# Changing the datatype of the columns
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import BooleanType
from pyspark.sql.types import DateType




music = music.withColumn("popularity",music["popularity"].cast(IntegerType()))
music = music.withColumn("duration_ms",music["duration_ms"].cast(DoubleType()))
music = music.withColumn("explicit",music["explicit"].cast(BooleanType()))
music = music.withColumn("danceability",music["danceability"].cast(DoubleType()))
music = music.withColumn("energy",music["energy"].cast(DoubleType()))
music = music.withColumn("key",music["key"].cast(IntegerType()))
music = music.withColumn("loudness",music["loudness"].cast(DoubleType()))
music = music.withColumn("mode",music["mode"].cast(BooleanType()))
music = music.withColumn("speechiness",music["speechiness"].cast(DoubleType()))
music = music.withColumn("acousticness",music["acousticness"].cast(DoubleType()))
music = music.withColumn("instrumentalness",music["instrumentalness"].cast(DoubleType()))
music = music.withColumn("liveness",music["liveness"].cast(DoubleType()))
music = music.withColumn("valence",music["valence"].cast(DoubleType()))
music = music.withColumn("spotify_release_date",music["spotify_release_date"].cast(DateType()))

In [13]:
# to verify whether the data types are changed or not
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: boolean (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: date (nullable = true)



REMOVING THE NULL VALUES

In [14]:
# Deleting null values
music.na.drop(how='any').count()

113865

Total 135 rows with any null value are deleted

In [15]:
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness| mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|spotify_release_date|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|       0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|   230666.0|   false|       0.676| 0.461|  1|  -6.74

In [16]:
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: boolean (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: date (nullable = true)



FEATURE EXTRACTION 

In [17]:
from pyspark.sql.functions import col

X = music.drop("track_id", "index_id", "artists", "album_name", "track_name", "track_genre", "duration_ms", "explicit", "key","mode", "spotify_release_date")

X = X.select([col(c).cast("double") for c in X.columns])  # convert columns to double type

y = music.select("track_genre")

In [18]:
X.printSchema()

root
 |-- popularity: double (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- loudness: double (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: double (nullable = true)
 |-- time_signature: double (nullable = true)



In [19]:
from pyspark.ml.linalg import Vector 
from pyspark.ml.feature import VectorAssembler

In [20]:
X.columns

['popularity',
 'danceability',
 'energy',
 'loudness',
 'speechiness',
 'acousticness',
 'instrumentalness',
 'liveness',
 'valence',
 'tempo',
 'time_signature']

In [21]:
assembler = VectorAssembler(inputCols = ['popularity','danceability','energy','loudness','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature'], outputCol="features")

In [22]:
assembled_data=assembler.setHandleInvalid("skip").transform(X)

In [23]:
assembled_data.show(truncate=False)

+----------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------+------------------------------------------------------------------------+
|popularity|danceability|energy|loudness|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |time_signature|features                                                                |
+----------+------------+------+--------+-----------+------------+----------------+--------+-------+-------+--------------+------------------------------------------------------------------------+
|73.0      |0.676       |0.461 |-6.746  |0.143      |0.0322      |1.01E-6         |0.358   |0.715  |87.917 |4.0           |[73.0,0.676,0.461,-6.746,0.143,0.0322,1.01E-6,0.358,0.715,87.917,4.0]   |
|55.0      |0.42        |0.166 |-17.235 |0.0763     |0.924       |5.56E-6         |0.101   |0.267  |77.489 |4.0           |[55.0,0.42,0.166,-17.235,0.0763,0.924,5.56E-6,0.101,0.267,77.489,4.0]   |
|57.0      |0.4

In [24]:
from pyspark.ml.feature import StandardScaler

In [25]:
scale=StandardScaler(inputCol='features',outputCol='standardized')

In [27]:
scaleModel = scale.fit(assembled_data)

Model Selection and Fiting

In [53]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import col
from pyspark.sql import SparkSession

# Rename the target column to 'label'
y = music.select(col('track_genre').alias('label'))

# Split the data into training and testing sets
(training_data, testing_data) = X.randomSplit([0.7, 0.3])

# Initialize the Decision Tree classifier
classifier = DecisionTreeClassifier()

# Train the classifier
model = classifier.fit(training_data.join(y, training_data.index == y.index))

# Evaluate the classifier
evaluator = MulticlassClassificationEvaluator(metricName='accuracy')
predictions = model.transform(testing_data.join(y, testing_data.index == y.index))
accuracy = evaluator.evaluate(predictions)

# Print the accuracy score
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Print the time taken to fit the model
start_time = time.time()
model.fit(training_data.join(y, training_data.index == y.index))
end_time = time.time()
fitting_time = end_time - start_time
print("Time taken to fit the model: {:.2f} seconds".format(fitting_time))


AttributeError: 'DataFrame' object has no attribute 'index_id'