In [None]:
!pip install -U pyspark==3.2.2
!pip install -U delta-spark

Collecting pyspark==3.2.2
  Using cached pyspark-3.2.2-py2.py3-none-any.whl
Collecting py4j==0.10.9.5 (from pyspark==3.2.2)
  Using cached py4j-0.10.9.5-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
    Uninstalling py4j-0.10.9.7:
      Successfully uninstalled py4j-0.10.9.7
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.4
    Uninstalling pyspark-3.5.4:
      Successfully uninstalled pyspark-3.5.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
delta-spark 3.3.0 requires pyspark<3.6.0,>=3.5.3, but you have pyspark 3.2.2 which is incompatible.
google-spark-connect 0.5.2 requires pyspark>=3.5, but you have pyspark 3.2.2 which is incompatible.[0m[31m
[0mSucc

Collecting pyspark<3.6.0,>=3.5.3 (from delta-spark)
  Using cached pyspark-3.5.4-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.3->delta-spark)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.5
    Uninstalling py4j-0.10.9.5:
      Successfully uninstalled py4j-0.10.9.5
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.2.2
    Uninstalling pyspark-3.2.2:
      Successfully uninstalled pyspark-3.2.2
Successfully installed py4j-0.10.9.7 pyspark-3.5.4


In [None]:
from delta import configure_spark_with_delta_pip
from tempfile import TemporaryDirectory
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml.functions import vector_to_array

In [None]:
def config_spark():
    """Build (or reuse) a local SparkSession with Delta Lake enabled.

    The session runs on ``local[*]``, registers the Delta SQL extension and
    catalog, and points the SQL warehouse at a fresh temporary directory.

    The warehouse directory is created with ``mkdtemp`` and is intentionally
    not removed by this function: it has to outlive the call for as long as
    the session is in use. Delete it manually if the disk space matters.

    Returns:
        The SparkSession returned by ``getOrCreate()``; if a session already
        exists in this process, that one is returned.
    """
    # Local imports keep this cell self-contained; only stdlib names are added.
    from pathlib import Path
    from tempfile import mkdtemp

    # A TemporaryDirectory held in a local variable deletes its directory as
    # soon as the object is garbage-collected, i.e. right after this function
    # returns, which would remove the warehouse under the live session.
    warehouse_dir = Path(mkdtemp(prefix="spark-warehouse-")).resolve()

    builder = (
        SparkSession.builder.master("local[*]")
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        # as_uri() yields a well-formed file URI (file:///tmp/...), whereas
        # prefixing an absolute POSIX path with "file:///" gives four slashes.
        .config("spark.sql.warehouse.dir", warehouse_dir.as_uri())
        # NOTE(review): with a local[*] master the work runs inside the driver
        # JVM, so spark.driver.memory is presumably the setting that matters.
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
    )

    return configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Create (or reuse) the notebook-wide Spark session used by the cells that read the CSV.
spark = config_spark()

In [None]:
# Load the dataset: the first row is the header and column types are inferred.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./final_dataset.csv')
)

# Track Vector Representation

In [None]:
# Collapse the per-track feature columns into a single array column,
# keeping track_id alongside it.
df = df.select(
    F.col("track_id"),
    F.array(
        [
            "acousticness",
            "danceability",
            "energy",
            "instrumentalness",
            "liveness",
            "loudness",
            "mode",
            "popularity",
            "speechiness",
            "tempo",
            "valence",
            "year_2000_2004",
            "year_2005_2009",
            "year_2010_2014",
            "year_2015_2019",
            "year_2020_2024",
        ]
    ).alias("features"),
)


In [None]:
# Print the first 20 rows with the full (untruncated) feature arrays.
df.show(n=20, truncate=False)

+----------------------+----------------------------------------------------------------------------------------------------------------------------------+
|track_id              |features                                                                                                                          |
+----------------------+----------------------------------------------------------------------------------------------------------------------------------+
|53QF56cjZA9RTuuMZDrSA6|[0.694, 0.483, 0.303, 0.0, 0.115, 0.7575757575757576, 1.0, 0.68, 0.0429, 0.532, 0.139, 0.0, 0.0, 1.0, 0.0, 0.0]                   |
|1s8tP3jP4GZcyHDsjvw218|[0.477, 0.572, 0.454, 1.37E-5, 0.0974, 0.7575757575757576, 1.0, 0.5, 0.0258, 0.56, 0.515, 0.0, 0.0, 1.0, 0.0, 0.0]                |
|7BRCa8MPiyuvr2VU3O9W0F|[0.338, 0.409, 0.234, 5.0E-5, 0.0895, 0.7121212121212122, 1.0, 0.5700000000000001, 0.0323, 0.556, 0.145, 0.0, 0.0, 1.0, 0.0, 0.0] |
|63wsZUhUZLlh1OsyrZq7sz|[0.807, 0.392, 0.251, 0.0, 0.0797, 0.772

# User Vector Representation

In [None]:
!pip install pandas



In [None]:
import pandas as pd

In [None]:
# Re-read the raw dataset (the earlier `df` was reduced to track_id + features).
df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load('./final_dataset.csv')
)

In [None]:
# Keep only the feature columns as separate columns; every other column is dropped.
df = df.select(
    [
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "mode",
        "popularity",
        "speechiness",
        "tempo",
        "valence",
        "year_2000_2004",
        "year_2005_2009",
        "year_2010_2014",
        "year_2015_2019",
        "year_2020_2024",
    ]
)


In [None]:
# Take a 10-track sample. No ordering is applied before the limit, so which
# 10 rows come back is up to Spark.
df = df.limit(10)

In [None]:
# Attach a random "liked at" timestamp to every track to mimic user like times.
# The random offset is subtracted from the current time, so the timestamps fall
# within the last 30 days instead of in the future.
# rand() is seeded so the offsets are repeatable for the same rows and
# partitioning; the absolute timestamps still depend on when the cell is run.
from pyspark.sql import functions as F

time_range_seconds = 30 * 24 * 60 * 60  # 30 days in seconds

df = df.withColumn(
    "update_timestamp",
    F.to_timestamp(
        F.from_unixtime(
            F.unix_timestamp() - (F.rand(seed=42) * time_range_seconds).cast("int")
        )
    ),
)

In [None]:
# Print the sampled rows, including the generated update_timestamp, untruncated.
df.show(n=20, truncate=False)

+------------+------------+------+----------------+--------+------------------+----+------------------+-----------+------------------+-------+--------------+--------------+--------------+--------------+--------------+-------------------+
|acousticness|danceability|energy|instrumentalness|liveness|loudness          |mode|popularity        |speechiness|tempo             |valence|year_2000_2004|year_2005_2009|year_2010_2014|year_2015_2019|year_2020_2024|update_timestamp   |
+------------+------------+------+----------------+--------+------------------+----+------------------+-----------+------------------+-------+--------------+--------------+--------------+--------------+--------------+-------------------+
|0.694       |0.483       |0.303 |0.0             |0.115   |0.7575757575757576|1   |0.68              |0.0429     |0.532             |0.139  |0             |0             |1             |0             |0             |2025-03-17 14:34:04|
|0.477       |0.572       |0.454 |1.37E-5       

In [None]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame.
# From this cell on, `df` is a pandas object, not a Spark one.
df = df.toPandas()

In [None]:
import datetime

# Fixed one-year window used to experiment with quarter boundaries.
min_date = datetime.datetime(year=2025, month=1, day=1)
max_date = datetime.datetime(year=2026, month=1, day=1)
# Number of whole days between the two bounds.
time_range = (max_date - min_date).days

In [None]:
# Length of one quarter of the window, as a timedelta.
(max_date - min_date)/ 4

datetime.timedelta(days=91, seconds=21600)

In [None]:
# Start of the second quarter: one quarter of the whole-day span after min_date.
second_qua = datetime.timedelta(days=(max_date - min_date).days / 4) + min_date
second_qua

datetime.datetime(2025, 4, 2, 6, 0)

In [None]:
# Start of the third quarter: two quarter-lengths after min_date.
third_qua = min_date + 2 * datetime.timedelta(days=(max_date - min_date).days / 4)
third_qua

datetime.datetime(2025, 7, 2, 12, 0)

In [None]:
# Start of the fourth quarter: three quarter-lengths after min_date.
fourth_qua = min_date + 3 * datetime.timedelta(days=(max_date - min_date).days / 4)
fourth_qua

datetime.datetime(2025, 10, 1, 18, 0)

In [None]:
def weighted_mean(df, col="update_timestamp"):
    """Average the feature columns of ``df``, weighting recent rows more.

    The time span covered by ``df[col]`` is split into four equal buckets.
    Each bucket's per-feature mean is weighted 1, 2, 3, 4 from oldest to
    newest, and the result is the sum of the weighted means divided by the
    sum of the weights that actually contributed. Empty buckets, and buckets
    whose mean for a feature is NaN, count towards neither the numerator nor
    the denominator, so the result stays within the range of the inputs.

    If the span is shorter than 7 whole days, or there are no timestamps at
    all, the plain unweighted mean of the feature columns is returned.

    Args:
        df: pandas DataFrame holding a datetime column ``col`` plus the
            feature columns to average.
        col: name of the timestamp column; it is excluded from the result.

    Returns:
        pandas Series indexed by feature column name.
    """
    timestamps = df[col]
    # Drop the timestamp before averaging so it is not averaged with the
    # features (which would also turn the result into an object-dtype Series).
    features = df.drop(columns=[col])

    min_date = timestamps.min()
    max_date = timestamps.max()

    # No usable timestamps, or too short a period to weight by recency.
    # (min_date and max_date are scalars here and must not be indexed by col.)
    if pd.isna(min_date) or (max_date - min_date).days < 7:
        return features.mean()

    bucket_time_period = (max_date - min_date) / 4
    second_quarter = min_date + bucket_time_period
    third_quarter = min_date + bucket_time_period * 2
    fourth_quarter = min_date + bucket_time_period * 3

    # The later the records, the bigger the weight.
    weights = pd.Series({"first": 1, "second": 2, "third": 3, "fourth": 4})
    bucket_masks = {
        "first": timestamps <= second_quarter,
        "second": (timestamps > second_quarter) & (timestamps <= third_quarter),
        "third": (timestamps > third_quarter) & (timestamps <= fourth_quarter),
        "fourth": timestamps > fourth_quarter,
    }

    # One column per bucket, one row per feature; an empty bucket is all NaN.
    bucket_means = pd.DataFrame(
        {name: features[mask].mean() for name, mask in bucket_masks.items()}
    )

    weighted_sum = bucket_means.mul(weights, axis=1).sum(axis=1)
    # Per feature, add up only the weights of buckets that produced a value.
    weight_total = bucket_means.notna().astype(int).mul(weights, axis=1).sum(axis=1)
    return weighted_sum / weight_total

In [None]:
# Split the observed timestamp range into four equal buckets and print
# where each bucket starts.
col = "update_timestamp"
date_col = df[col]
min_date, max_date = date_col.min(), date_col.max()

bucket_time_period = (max_date - min_date) / 4

first_quarter = min_date
second_quarter, third_quarter, fourth_quarter = (
    min_date + bucket_time_period * step for step in range(1, 4)
)

print(f"first_quarter : {first_quarter}")
print(f"second_quarter : {second_quarter}")
print(f"third_quarter : {third_quarter}")
print(f"fourth_quarter : {fourth_quarter}")



first_quarter : 2025-02-22 22:46:32
second_quarter : 2025-02-28 14:43:25.250000
third_quarter : 2025-03-06 06:40:18.500000
fourth_quarter : 2025-03-11 22:37:11.750000


In [None]:
# Per-bucket feature means, each scaled by its recency weight (1 = oldest, 4 = newest).
# Buckets are half-open on the left: (start, end], except the first, which has no lower bound.
first_quarter_df = df[df[col].le(second_quarter)].mean().drop(col) * 1
second_quarter_df = df[df[col].between(second_quarter, third_quarter, inclusive="right")].mean().drop(col) * 2
third_quarter_df = df[df[col].between(third_quarter, fourth_quarter, inclusive="right")].mean().drop(col) * 3
fourth_quarter_df = df[df[col].gt(fourth_quarter)].mean().drop(col) * 4

In [None]:
# Inspect the weight-1 feature means of the earliest bucket.
print(first_quarter_df)

acousticness          0.2672
danceability        0.529333
energy                 0.556
instrumentalness     0.00644
liveness            0.101667
loudness             0.79798
mode                0.666667
popularity          0.496667
speechiness         0.027367
tempo               0.598667
valence                 0.35
year_2000_2004           0.0
year_2005_2009           0.0
year_2010_2014           1.0
year_2015_2019           0.0
year_2020_2024           0.0
dtype: object


In [None]:
# Line the four weighted bucket means up side by side: one row per feature,
# one column per bucket.
all_quarter_df = pd.DataFrame(
    dict(
        first=first_quarter_df,
        second=second_quarter_df,
        third=third_quarter_df,
        fourth=fourth_quarter_df,
    )
)

In [None]:
# Inspect the weighted bucket means (rows are features, columns are buckets).
print(all_quarter_df)

                     first    second     third    fourth
acousticness      0.321329  0.642067  0.959267  1.282618
danceability      0.537609  1.075619  1.613459  2.150429
energy            0.639788  1.281021  1.921758  2.561308
instrumentalness  0.252409  0.504718  0.757215  1.010032
liveness          0.222802  0.447148  0.668447  0.891228
loudness          0.780757  1.561696  2.342363   3.12293
mode              0.633999  1.268238   1.90425  2.540674
popularity        0.183806   0.36793  0.551783  0.736198
speechiness       0.092878  0.185827   0.27865   0.37056
tempo             0.483951  0.967246  1.450726  1.933813
valence           0.455689  0.913073  1.367167  1.821821
year_2000_2004    0.185178  0.368898  0.550634   0.74234
year_2005_2009    0.197855  0.396349  0.595475  0.788413
year_2010_2014    0.220147  0.438934  0.657618  0.876007
year_2015_2019    0.224893  0.447888  0.677186  0.897526
year_2020_2024    0.171927  0.347931  0.519087  0.695715


In [None]:
# Combine the four bucket columns into one weighted mean per feature.
# The columns of all_quarter_df already hold mean * weight (weights 1..4), so
# the row sums must be divided by the sum of the weights. A plain mean(axis=1)
# divides by the number of buckets instead and pushes 0..1 features above 1.
# Buckets with no value for a feature (NaN) count towards neither the
# numerator nor the denominator.
bucket_weights = pd.Series({"first": 1, "second": 2, "third": 3, "fourth": 4})
weighted_sum = all_quarter_df.astype(float).sum(axis=1)
weight_total = all_quarter_df.notna().astype(int).mul(bucket_weights, axis=1).sum(axis=1)
mean_df = weighted_sum / weight_total

In [None]:
# Inspect the combined per-feature vector built from the four buckets.
print(mean_df)

acousticness         1.01752
danceability        1.391867
energy              1.242133
instrumentalness    0.006464
liveness             0.25764
loudness            2.051515
mode                2.266667
popularity          1.425333
speechiness         0.080947
tempo               1.482133
valence             0.988533
year_2000_2004           0.0
year_2005_2009           0.0
year_2010_2014      2.666667
year_2015_2019           0.0
year_2020_2024           0.0
dtype: object
