In [None]:
!pip install -U pyspark==3.2.2
!pip install -U delta-spark

Collecting pyspark==3.2.2
  Downloading pyspark-3.2.2.tar.gz (281.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.5/281.5 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5 (from pyspark==3.2.2)
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.2-py2.py3-none-any.whl size=281969433 sha256=5cc175fcc3ad48ad852519e20e2bfcedaea3a738a989c460fe28beb4c037b37c
  Stored in directory: /root/.cache/pip/wheels/84/42/fa/5339cf0197ee3f87cf713e440a581889f343da6d24e04e866a
Successfully built pyspark
Installing collected packages: py4j, pyspark
 

In [None]:
from delta import configure_spark_with_delta_pip
from tempfile import TemporaryDirectory
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array

In [None]:
# Handles of the temporary warehouse directories created by config_spark().
# A TemporaryDirectory removes its directory as soon as the handle object is
# finalised, so the handles are parked here to keep the directory on disk for
# as long as the kernel (and therefore the Spark session) is alive.
_WAREHOUSE_TMPDIRS = []


def config_spark():
    """Create (or fetch the already running) local SparkSession with Delta Lake enabled.

    The session runs with master ``local[*]``, registers the Delta SQL
    extension and catalog, and uses a freshly created temporary directory as
    ``spark.sql.warehouse.dir``.

    Returns:
        The SparkSession produced by ``getOrCreate()`` on the Delta-configured
        builder.
    """
    tmpdir = TemporaryDirectory()
    # Keep a reference that outlives this call; without it the handle is
    # finalised on return and the warehouse directory is deleted underneath
    # the session that was just configured to use it.
    _WAREHOUSE_TMPDIRS.append(tmpdir)

    builder = (
        SparkSession.builder.master("local[*]")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        .config("spark.sql.warehouse.dir", f"file:///{tmpdir.name}")
        .config("spark.executor.memory", "4g")  # Increase executor memory
        .config("spark.driver.memory", "4g")    # Increase driver memory
    )

    # configure_spark_with_delta_pip is the delta-spark helper that prepares
    # the builder for a pip-installed Delta Lake before the session is started.
    return configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Start the Delta-enabled local Spark session used by every cell below.
spark = config_spark()

## Count the duplicate track IDs

In [None]:
# Spark's CSV reader escapes quotes with a backslash by default. This file
# appears to use doubled quotes ("") inside quoted titles instead, so with the
# default settings a quoted field ends early, its embedded commas shift every
# later column, and fragments of titles end up under track_id.
# Setting escape to the quote character makes "" parse as a literal quote.
# NOTE(review): if any field also contains line breaks, multiLine=True is
# needed as well - confirm against the raw file.
songs_df = spark.read.csv(
    'spotify_data.csv',
    header=True,
    inferSchema=True,
    quote='"',
    escape='"',
)


In [None]:
# Preview the first 20 rows with full, untruncated cell contents.
songs_df.show(n=20, truncate=False)

+---+---------------------+--------------------------------------------------------------+----------------------+----------+----+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+
|_c0|artist_name          |track_name                                                    |track_id              |popularity|year|genre   |danceability|energy|key|loudness|mode|speechiness|acousticness|instrumentalness|liveness|valence|tempo  |duration_ms|time_signature|
+---+---------------------+--------------------------------------------------------------+----------------------+----------+----+--------+------------+------+---+--------+----+-----------+------------+----------------+--------+-------+-------+-----------+--------------+
|0  |Jason Mraz           |I Won't Give Up                                               |53QF56cjZA9RTuuMZDrSA6|68        |2012|acoustic|0.483       |0.303 |4  |-10.058 |1   |0.0429     

In [None]:
# Total number of rows loaded from the CSV, before any de-duplication.
songs_df.count()

1159764

In [None]:
# Tally the rows per track_id, then keep only the ids that occur more than once.
track_id_counts = songs_df.groupBy("track_id").count()
filtered_df = track_id_counts.where(F.col("count") > 1)

In [None]:
# Inspect the first 20 repeated track_id values and how often each occurs.
filtered_df.show(n=20, truncate=False)

+------------------------------+-----+
|track_id                      |count|
+------------------------------+-----+
| Mimì)"                       |2    |
| Woglinde                     |2    |
| Vol. 2"""                    |7    |
| 1964"                        |9    |
| Chorus)"                     |20   |
| still                        |2    |
| 1965"""                      |2    |
| Jesus)"                      |3    |
| Violetta                     |3    |
| mein Hammer                  |2    |
| Alphise                      |2    |
| le jasmin"" (Lakmé           |3    |
| Op. 55                       |2    |
| Gemahl                       |2    |
| Suzuki                       |3    |
| je veux vivre dans ce reve"""|3    |
| dit-elle                     |2    |
| la paterna mano"""           |2    |
| o cara                       |2    |
| Suzuki)"                     |3    |
+------------------------------+-----+
only showing top 20 rows



## Delete every instance of the duplicated tracks

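> Every `track_id` that occurs more than once is removed entirely: all of its rows are dropped, not just the extra copies.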




In [None]:
# Total number of rows that belong to a repeated track_id, i.e. how many rows
# the de-duplication will remove. Uses filtered_df, the duplicate table built
# earlier in this notebook; the previous name, duplicate_track_id_df, is not
# defined anywhere in the notebook and fails on a fresh kernel.
filtered_df.select(F.sum("count")).show()

+----------+
|sum(count)|
+----------+
|       476|
+----------+



In [None]:
# Left anti join: keep only the songs whose track_id has no match in
# filtered_df, which drops every row of every repeated track_id.
unique_tracks_df = songs_df.join(filtered_df, on="track_id", how="left_anti")

In [None]:
# Row count after the repeated track_ids have been removed.
unique_tracks_df.count()

1159288

In [None]:
# Target Spark type for each column, listed in the order the casts are applied.
column_types = {
    "popularity": DoubleType(),
    "danceability": FloatType(),
    "energy": FloatType(),
    "key": IntegerType(),
    "loudness": DoubleType(),
    "mode": IntegerType(),
    "speechiness": FloatType(),
    "acousticness": FloatType(),
    "instrumentalness": FloatType(),
    "liveness": FloatType(),
    "valence": FloatType(),
    "tempo": DoubleType(),
    "year": IntegerType(),
    "duration_ms": IntegerType(),
    "time_signature": IntegerType(),
}

# Drop the CSV's unnamed index column, then cast every listed column in place
# (withColumn on an existing name replaces the column and keeps its position).
typed_tracks_df = unique_tracks_df.drop("_c0")
for column_name, spark_type in column_types.items():
    typed_tracks_df = typed_tracks_df.withColumn(
        column_name, F.col(column_name).cast(spark_type)
    )

# From here on songs_df refers to the de-duplicated, typed frame.
songs_df = typed_tracks_df

# Persist the cleaned data as Parquet, replacing any previous export.
songs_df.write.mode('overwrite').parquet("./data/ready_parquet_to_postgress")