In [101]:
#import libraries 
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType, FloatType, DateType
from pyspark.sql.functions import desc, col, lower, when, avg, sum, length

# Create Spark Session

In [19]:
#create spark session 
spark = SparkSession.builder\
        .appName("GCPipeline")\
        .master("local[*]")\
        .getOrCreate()

In [20]:
#Check the catalog that should countain the list of names of databases, tables and columns 
spark.catalog.listTables()

[]

In [21]:
#These are the configurations of the pyspark cluster
spark.sparkContext.getConf().getAll()

[('spark.app.name', 'GCPipeline'),
 ('spark.driver.port', '35771'),
 ('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.app.startTime', '1720782156932'),
 ('spark.sql.warehouse.dir',
  'file:/home/rihab/Projects/spark/notebooks/spark-warehouse'),
 ('spark.executor.id', 'drive

# Read input data

In [41]:
schema = StructType([
    StructField("_c0", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("rank", IntegerType(), True),
    StructField("artist", StringType(), True),
    StructField("url", StringType(), True),
    StructField("region", StringType(), True),
    StructField("chart", StringType(), True),
    StructField("trend", StringType(), True),
    StructField("streams", IntegerType(), True),
    StructField("track_id", StringType(), True),
    StructField("album", StringType(), True),
    StructField("popularity", IntegerType(), True),
    StructField("duration_ms", IntegerType(), True),
    StructField("explicit", BooleanType(), True),
    StructField("release_date", DateType(), True),
    StructField("available_markets", StringType(), True),
    StructField("af_danceability", FloatType(), True),
    StructField("af_energy", FloatType(), True),
    StructField("af_key", IntegerType(), True),
    StructField("af_loudness", FloatType(), True),
    StructField("af_mode", IntegerType(), True),
    StructField("af_speechiness", FloatType(), True),
    StructField("af_acousticness", FloatType(), True),
    StructField("af_instrumentalness", FloatType(), True),
    StructField("af_liveness", FloatType(), True),
    StructField("af_valence", FloatType(), True),
    StructField("af_tempo", FloatType(), True),
    StructField("af_time_signature", IntegerType(), True)
])

In [84]:
data = spark.read.csv('../data/merged_data_10k.csv', 
                      schema=schema,
                     sep=",")
display(data)

DataFrame[_c0: int, title: string, rank: int, artist: string, url: string, region: string, chart: string, trend: string, streams: int, track_id: string, album: string, popularity: int, duration_ms: int, explicit: boolean, release_date: date, available_markets: string, af_danceability: float, af_energy: float, af_key: int, af_loudness: float, af_mode: int, af_speechiness: float, af_acousticness: float, af_instrumentalness: float, af_liveness: float, af_valence: float, af_tempo: float, af_time_signature: int]

In [43]:
data.summary()

                                                                                

DataFrame[summary: string, _c0: string, title: string, rank: string, artist: string, url: string, region: string, chart: string, trend: string, streams: string, track_id: string, album: string, popularity: string, duration_ms: string, available_markets: string, af_danceability: string, af_energy: string, af_key: string, af_loudness: string, af_mode: string, af_speechiness: string, af_acousticness: string, af_instrumentalness: string, af_liveness: string, af_valence: string, af_tempo: string, af_time_signature: string]

In [44]:
data.describe()
#.show(vertical = True)

                                                                                

DataFrame[summary: string, _c0: string, title: string, rank: string, artist: string, url: string, region: string, chart: string, trend: string, streams: string, track_id: string, album: string, popularity: string, duration_ms: string, available_markets: string, af_danceability: string, af_energy: string, af_key: string, af_loudness: string, af_mode: string, af_speechiness: string, af_acousticness: string, af_instrumentalness: string, af_liveness: string, af_valence: string, af_tempo: string, af_time_signature: string]

In [45]:
data.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- rank: integer (nullable = true)
 |-- artist: string (nullable = true)
 |-- url: string (nullable = true)
 |-- region: string (nullable = true)
 |-- chart: string (nullable = true)
 |-- trend: string (nullable = true)
 |-- streams: integer (nullable = true)
 |-- track_id: string (nullable = true)
 |-- album: string (nullable = true)
 |-- popularity: integer (nullable = true)
 |-- duration_ms: integer (nullable = true)
 |-- explicit: boolean (nullable = true)
 |-- release_date: date (nullable = true)
 |-- available_markets: string (nullable = true)
 |-- af_danceability: float (nullable = true)
 |-- af_energy: float (nullable = true)
 |-- af_key: integer (nullable = true)
 |-- af_loudness: float (nullable = true)
 |-- af_mode: integer (nullable = true)
 |-- af_speechiness: float (nullable = true)
 |-- af_acousticness: float (nullable = true)
 |-- af_instrumentalness: float (nullable = true)
 |-- af_liveness

In [46]:
data.count()

10000

In [62]:
data.distinct().show()



+----+--------------------+----+----------+--------------------+--------------------+------------------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
| _c0|               title|rank|    artist|                 url|              region|             chart| trend|streams|track_id|               album|popularity|duration_ms|explicit|release_date|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+----+--------------------+----+----------+--------------------+--------------------+------------------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+-

                                                                                

In [48]:
data.dtypes

[('_c0', 'int'),
 ('title', 'string'),
 ('rank', 'int'),
 ('artist', 'string'),
 ('url', 'string'),
 ('region', 'string'),
 ('chart', 'string'),
 ('trend', 'string'),
 ('streams', 'int'),
 ('track_id', 'string'),
 ('album', 'string'),
 ('popularity', 'int'),
 ('duration_ms', 'int'),
 ('explicit', 'boolean'),
 ('release_date', 'date'),
 ('available_markets', 'string'),
 ('af_danceability', 'float'),
 ('af_energy', 'float'),
 ('af_key', 'int'),
 ('af_loudness', 'float'),
 ('af_mode', 'int'),
 ('af_speechiness', 'float'),
 ('af_acousticness', 'float'),
 ('af_instrumentalness', 'float'),
 ('af_liveness', 'float'),
 ('af_valence', 'float'),
 ('af_tempo', 'float'),
 ('af_time_signature', 'int')]

In [53]:
data.stat.freqItems(['title']).show()

+--------------------+
|     title_freqItems|
+--------------------+
|[Mi tesoro (feat....|
+--------------------+



In [57]:
data.select('af_energy').show()

+---------+
|af_energy|
+---------+
|     null|
|    0.852|
|    0.663|
|    0.761|
|    0.508|
|    0.899|
|    0.776|
|    0.588|
|    0.832|
|    0.736|
|    0.721|
|     0.68|
|     0.78|
|     0.73|
|    0.723|
|     0.76|
|    0.598|
|    0.716|
|    0.476|
|    0.549|
+---------+
only showing top 20 rows



In [58]:
data.filter(data['af_energy']>0.5).show()

+---+--------------------+----+----------+--------------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|_c0|               title|rank|    artist|                 url|              region|    chart| trend|streams|track_id|               album|popularity|duration_ms|explicit|release_date|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+---+--------------------+----+----------+--------------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+

In [64]:
data.select(['title','album','artist']).orderBy(desc('artist')).show()

+----------+--------------------+----------+
|     title|               album|    artist|
+----------+--------------------+----------+
|     title|            track_id|      date|
|    Dueles|1iRvhKiXRElIH2Uf4...|2018-03-02|
|     BILLY|7bRRHQelMfrP86QYQ...|2018-03-02|
|      Mine|6tHWl8ows5JOZq9Yf...|2018-03-01|
|      Mine|6tHWl8ows5JOZq9Yf...|2018-03-01|
|   Outcast|6jnDfM8vtRWgN7R4X...|2018-03-01|
|        Du|0EM85Qj3526QtxCBO...|2018-03-01|
|     Leika|6sIgHBjmtF5AgGa0a...|2018-03-01|
|  Everyday|2zJJro5CPAVwwzLQQ...|2018-03-01|
|      Mine|6tHWl8ows5JOZq9Yf...|2018-03-01|
|   Skyldig|4k9zexSmoSS6EER7F...|2018-03-01|
| STAY TUNE|0albLGvYx6ftOUJ7K...|2018-03-01|
|     BILLY|7bRRHQelMfrP86QYQ...|2018-03-01|
| Ko Ko Bop|5EzitieoPnjyKHAq0...|2018-03-01|
|        楓|73xYYths9kvmFvYsd...|2018-03-01|
|        TT|1nMbZ9OsVNSLEyijI...|2018-03-01|
|   44 More|6iyp6udVpLuq3MCUu...|2018-03-01|
|God's Plan|2XW4DbS6NddZxRPm5...|2018-03-01|
|   Enkelit|6hAKGURpSwL2tvmfj...|2018-03-01|
|      Chic

In [None]:
spark.sql(f"SELECT * FROM merged_data_10k.csv")

In [71]:
data.show(n=5)

+----+--------------------+----+----------+--------------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
| _c0|               title|rank|    artist|                 url|              region|    chart| trend|streams|track_id|               album|popularity|duration_ms|explicit|release_date|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+----+--------------------+----+----------+--------------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+-------------

In [9]:
data=data.withColumnRenamed("_c0", "index")
  

# Create/Update features

In [None]:
 data = data.withColumn(

In [95]:
data.select([ 'available_markets','af_danceability','af_energy','af_key','af_loudness','af_mode','af_speechiness', 'af_acousticness','af_instrumentalness','af_liveness','af_valence','af_tempo','af_time_signature']).show()

+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|
+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+
|     release_date|           null|     null|  null|       null|   null|          null|           null|               null|       null|      null|    null|             null|
|       2017-05-26|           null|    0.852|  null|        8.0|   null|           0.0|         0.0776|              0.187|    3.05E-5|     0.159|   0.907|             null|
|       2016-09-22|           null|    0.663|  null|       11.0|   null|           0.0|          0.226|            0.00431|    1.6

In [104]:
sum(col('af_danceability')).

Column<'sum(af_danceability)'>

In [110]:
data = data.withColumn("title", lower(col('title')))\
            .withColumn("Class", when(col('af_energy')>0.5, "High Energy").otherwise(when(col('af_loudness')>0.5, "Loud").otherwise("Classical")))\
            .withColumn("Weight", when((col('af_loudness').isNotNull()) & (col('af_energy').isNotNull()), col('af_loudness')+ col('af_energy')))\
           

In [111]:
data.show(n=3)

+----+--------------------+----+----------+------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+-----------+------+
| _c0|               title|rank|    artist|         url|              region|    chart| trend|streams|track_id|               album|popularity|duration_ms|explicit|release_date|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|      Class|Weight|
+----+--------------------+----+----------+------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------

# Fill in missing values

In [112]:
data =data.fillna(0)

In [113]:
data.describe().show(vertical=True)



-RECORD 0-----------------------------------
 summary             | count                
 _c0                 | 10000                
 title               | 10000                
 rank                | 10000                
 artist              | 10000                
 url                 | 10000                
 region              | 10000                
 chart               | 10000                
 trend               | 10000                
 streams             | 10000                
 track_id            | 10000                
 album               | 10000                
 popularity          | 10000                
 duration_ms         | 10000                
 available_markets   | 10000                
 af_danceability     | 10000                
 af_energy           | 10000                
 af_key              | 10000                
 af_loudness         | 10000                
 af_mode             | 10000                
 af_speechiness      | 10000                
 af_acoust

                                                                                

In [114]:
data.show(n=3)

+---+--------------------+----+----------+------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+---------------+-------------------+-----------+----------+--------+-----------------+-----------+------+
|_c0|               title|rank|    artist|         url|              region|    chart| trend|streams|track_id|               album|popularity|duration_ms|explicit|release_date|available_markets|af_danceability|af_energy|af_key|af_loudness|af_mode|af_speechiness|af_acousticness|af_instrumentalness|af_liveness|af_valence|af_tempo|af_time_signature|      Class|Weight|
+---+--------------------+----+----------+------------+--------------------+---------+------+-------+--------+--------------------+----------+-----------+--------+------------+-----------------+---------------+---------+------+-----------+-------+--------------+--

In [None]:
gcs_bucket = ""
data.write.csv(gcs_bucket, header=TRue)

In [None]:
import os
import airflow

In [None]:
def read_files_in_batches(directory, batch_size):
    files = [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith('.csv')]
    for i in range(0, len(files), batch_size):
        yield files[i:i+batch_size]

In [None]:
for file in read_files_in_batches('../data',2):
    print(file)