In [1]:
# Installing pyspark on local machine
!pip install pyspark 



In [2]:
# Importing installed pyspark
import pyspark

In [3]:
# Importing PySpark Session 
from pyspark.sql import SparkSession

In [4]:
# Build the Spark session for this notebook (or reuse one that already exists)
# and register the application under the name 'Project'.
spark = (
    SparkSession.builder
    .appName('Project')
    .getOrCreate()
)

In [5]:
# Evaluating the session object so the notebook displays it
spark

In [18]:
# Reading csv file using spark.
# The path is a raw string: in a normal string literal the backslashes in
# '\P' and '\s' are invalid escape sequences, which Python warns about.
music = spark.read.csv(r'Desktop\Project\spotify.csv')

In [19]:
# Displaying the dataset (show() prints the first 20 rows by default).
# No header option was set when reading, so the columns appear as _c0, _c1, ...
# and the CSV header line shows up as the first data row.
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|     _c0|                 _c1|                 _c2|                 _c3|                 _c4|       _c5|        _c6|     _c7|         _c8|   _c9|_c10|    _c11|_c12|       _c13|        _c14|            _c15|    _c16|   _c17|   _c18|          _c19|       _c20|                _c21|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+----+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy| key|loudne

In [20]:
# Re-reading the csv with the header option so the first line of the file
# supplies the column names instead of the default _c0, _c1, ... labels.
# The path is a raw string: in a normal string literal the backslashes in
# '\P' and '\s' are invalid escape sequences, which Python warns about.
music = spark.read.option('header', 'true').csv(r'Desktop\Project\spotify.csv')

In [21]:
# Evaluating the DataFrame on its own displays its column names and types
music

DataFrame[index_id: string, track_id: string, artists: string, album_name: string, track_name: string, popularity: string, duration_ms: string, explicit: string, danceability: string, energy: string, key: string, loudness: string, mode: string, speechiness: string, acousticness: string, instrumentalness: string, liveness: string, valence: string, tempo: string, time_signature: string, track_genre: string, spotify_release_date: string]

In [22]:
# Displaying the re-read data; the header names now label the columns
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|spotify_release_date|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|       0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|     230666|   FALSE|       0.676| 0.461|  1|  -6.746| 

In [23]:
# Check the schema of the dataset; every column is still a string at this point
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- duration_ms: string (nullable = true)
 |-- explicit: string (nullable = true)
 |-- danceability: string (nullable = true)
 |-- energy: string (nullable = true)
 |-- key: string (nullable = true)
 |-- loudness: string (nullable = true)
 |-- mode: string (nullable = true)
 |-- speechiness: string (nullable = true)
 |-- acousticness: string (nullable = true)
 |-- instrumentalness: string (nullable = true)
 |-- liveness: string (nullable = true)
 |-- valence: string (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: string (nullable = true)



In [32]:
# Changing the datatype of the columns
from pyspark.sql.types import BooleanType, DateType, DoubleType, IntegerType

# Target Spark type for every column that should stop being a string.
# Columns not listed here (index_id, track_id, artists, album_name,
# track_name, track_genre, ...) are carried over unchanged.
# NOTE(review): tempo and time_signature are not listed, so they remain
# strings -- confirm whether that is intended.
column_types = {
    "popularity": IntegerType(),
    "duration_ms": DoubleType(),
    "explicit": BooleanType(),
    "danceability": DoubleType(),
    "energy": DoubleType(),
    "key": IntegerType(),
    "loudness": DoubleType(),
    "mode": BooleanType(),
    "speechiness": DoubleType(),
    "acousticness": DoubleType(),
    "instrumentalness": DoubleType(),
    "liveness": DoubleType(),
    "valence": DoubleType(),
    "spotify_release_date": DateType(),
}

# Apply all casts in one projection instead of one withColumn call per column.
# Iterating over music.columns keeps the original column order, and the
# explicit alias keeps each cast column under its original name.
music = music.select([
    music[name].cast(column_types[name]).alias(name) if name in column_types else music[name]
    for name in music.columns
])

In [33]:
# Re-check the schema to confirm the casts were applied
music.printSchema()

root
 |-- index_id: string (nullable = true)
 |-- track_id: string (nullable = true)
 |-- artists: string (nullable = true)
 |-- album_name: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: double (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- danceability: double (nullable = true)
 |-- energy: double (nullable = true)
 |-- key: integer (nullable = true)
 |-- loudness: double (nullable = true)
 |-- mode: boolean (nullable = true)
 |-- speechiness: double (nullable = true)
 |-- acousticness: double (nullable = true)
 |-- instrumentalness: double (nullable = true)
 |-- liveness: double (nullable = true)
 |-- valence: double (nullable = true)
 |-- tempo: string (nullable = true)
 |-- time_signature: string (nullable = true)
 |-- track_genre: string (nullable = true)
 |-- spotify_release_date: date (nullable = true)



In [26]:
# Display a single column (artists)
music.select('artists').show()

+--------------------+
|             artists|
+--------------------+
|         Gen Hoshino|
|        Ben Woodward|
|Ingrid Michaelson...|
|        Kina Grannis|
|    Chord Overstreet|
|        Tyrone Wells|
|A Great Big World...|
|          Jason Mraz|
|Jason Mraz;Colbie...|
|      Ross Copperman|
|        Zack Tabudlo|
|          Jason Mraz|
|            Dan Berk|
|       Anna Hamilton|
|Chord Overstreet;...|
|         Landon Pigg|
|Andrew Foy;Renee Foy|
|Andrew Foy;Renee Foy|
|Jason Mraz;Colbie...|
|Boyce Avenue;Bea ...|
+--------------------+
only showing top 20 rows



In [27]:
# Display multiple columns side by side (artists and track_name)
music.select('artists', 'track_name').show()

+--------------------+--------------------+
|             artists|          track_name|
+--------------------+--------------------+
|         Gen Hoshino|              Comedy|
|        Ben Woodward|    Ghost - Acoustic|
|Ingrid Michaelson...|      To Begin Again|
|        Kina Grannis|Can't Help Fallin...|
|    Chord Overstreet|             Hold On|
|        Tyrone Wells|Days I Will Remember|
|A Great Big World...|       Say Something|
|          Jason Mraz|           I'm Yours|
|Jason Mraz;Colbie...|               Lucky|
|      Ross Copperman|              Hunger|
|        Zack Tabudlo|Give Me Your Forever|
|          Jason Mraz|     I Won't Give Up|
|            Dan Berk|                Solo|
|       Anna Hamilton|            Bad Liar|
|Chord Overstreet;...|     Hold On - Remix|
|         Landon Pigg|Falling in Love a...|
|Andrew Foy;Renee Foy|ily (i love you b...|
|Andrew Foy;Renee Foy|         At My Worst|
|Jason Mraz;Colbie...|               Lucky|
|Boyce Avenue;Bea ...|          

In [28]:
# Deleting rows that contain a null value in any column.
# na.drop returns a new DataFrame and leaves the original untouched, so the
# result is assigned back to `music`; otherwise later cells would still see
# the rows with nulls.
music = music.na.drop(how='any')

# Number of rows remaining after the drop
music.count()

113865

In total, 135 rows containing at least one null value are removed, leaving 113,865 rows.

In [29]:
# Displaying the dataset again (first 20 rows)
music.show()

+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|index_id|            track_id|             artists|          album_name|          track_name|popularity|duration_ms|explicit|danceability|energy|key|loudness| mode|speechiness|acousticness|instrumentalness|liveness|valence|  tempo|time_signature|track_genre|spotify_release_date|
+--------+--------------------+--------------------+--------------------+--------------------+----------+-----------+--------+------------+------+---+--------+-----+-----------+------------+----------------+--------+-------+-------+--------------+-----------+--------------------+
|       0|5SuOikwiRyPMVoIQD...|         Gen Hoshino|              Comedy|              Comedy|        73|   230666.0|   false|       0.676| 0.461|  1|  -6.74