In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, desc, countDistinct, max as spark_max, min as spark_min, explode

# Load environment variables from .env
load_dotenv()

print(" COCKROACH_USER:", os.getenv("COCKROACH_USER"))
print(" COCKROACH_PASS:", os.getenv("COCKROACH_PASS"))
print(" COCKROACH_HOST:", os.getenv("COCKROACH_HOST"))
print(" COCKROACH_PORT:", os.getenv("COCKROACH_PORT"))
print(" MONGO URI:", os.getenv("MONGO_ATLAS_URI"))

 COCKROACH_USER: shubh
 COCKROACH_PASS: nrTPuxNNk9Oggf1lCXDkYw
 COCKROACH_HOST: bowing-slime-10451.j77.aws-ap-south-1.cockroachlabs.cloud
 COCKROACH_PORT: 26257
 MONGO URI: mongodb+srv://shbpndr:CrVz9nzipaLOZFVk@sounds-similar.8sd8tnl.mongodb.net/


In [2]:
spark = SparkSession.builder \
    .appName("CockroachDB_PySpark_Project") \
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.18") \
    .getOrCreate()

In [3]:
spark

In [4]:
COCKROACH_USER = os.getenv("COCKROACH_USER")
COCKROACH_PASS = os.getenv("COCKROACH_PASS")
COCKROACH_HOST = os.getenv("COCKROACH_HOST")
COCKROACH_PORT = os.getenv("COCKROACH_PORT")
DATABASE_NAME = "music"

jdbc_url = f"jdbc:postgresql://{COCKROACH_HOST}:{COCKROACH_PORT}/{DATABASE_NAME}?sslmode=require"

connection_properties = {
    "user": COCKROACH_USER,
    "password": COCKROACH_PASS,
    "driver": "org.postgresql.Driver"
}

In [5]:
def load_table(table_name):
    return spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=connection_properties
    )

track_link_df = load_table("track_links")
audio_df = load_table("audio_features")
track_reference_df = load_table("track_reference")
lyrics_df = load_table("lyrics")
sentiments_df = load_table('lyrics_emotions')

## Merge Tables

In [6]:
merged_df = lyrics_df.join(sentiments_df, on='musicbrainz_id', how='inner') \
                     .join(audio_df, on='musicbrainz_id', how='inner')

In [7]:
merged_df.show(5, truncate=False)

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
assembler_input_cols = ['musicbrainz_id', 
  # 'genius_lyrics',
  # 'lastfm_wiki_content',
  'goemotion_sadness',
  'goemotion_realization',
  'goemotion_neutral',
  'goemotion_love','goemotion_grief','goemotion_amusement','goemotion_gratitude',
  'goemotion_disappointment','goemotion_surprise','goemotion_nervousness',
  'goemotion_embarrassment','goemotion_remorse','goemotion_joy','goemotion_fear','goemotion_excitement',
  'goemotion_anger','goemotion_pride','goemotion_caring',
  'goemotion_disgust','goemotion_confusion','goemotion_optimism','goemotion_relief',
  'goemotion_desire','goemotion_annoyance','goemotion_approval',
  'goemotion_admiration','goemotion_disapproval','goemotion_curiosity','nrc_anger','nrc_anticipation','nrc_disgust',
  'nrc_fear','nrc_joy','nrc_negative','nrc_positive','nrc_sadness','nrc_surprise','nrc_trust',
  'textblob_polarity','textblob_subjectivity','vader_neg','vader_neu','vader_pos','vader_compound','duration_seconds'  ,'sample_rate','tempo'  ,'loudness', 
  'danceability','energy'   ,'speechiness'  ,'acousticness'   ,'instrumentalness'  ,'liveness'  ,'valence' ,'spectral_centroid' ,'spectral_rolloff'  ,'spectral_bandwidth',
  'spectral_flatness'    ,'zero_crossing_rate'  ,'rms_energy'    ,'tempo_variability' ,'f0_mean' ,'mel_mean' ,
  'dynamic_range'  ,'mfcc_1'  ,'mfcc_2' ,
  'mfcc_3'  ,'mfcc_4' ,'mfcc_5' ,'mfcc_6' ,'mfcc_7'  ,'mfcc_8' ,'mfcc_9'  ,'mfcc_10' ,'mfcc_11' ,
  'mfcc_12' ,'mfcc_13' ,'spectral_contrast_1','spectral_contrast_2','spectral_contrast_3','spectral_contrast_4','spectral_contrast_5','spectral_contrast_6','spectral_contrast_7','chroma_cens_1'   ,'chroma_cens_2'   ,'chroma_cens_3'   ,'chroma_cens_4'   ,'chroma_cens_5'   ,'chroma_cens_6'   ,'chroma_cens_7'   ,'chroma_cens_8'   ,'chroma_cens_9'   ,'chroma_cens_10'  ,
  'chroma_cens_11'  ,'chroma_cens_12'  ,'tonnetz_1'  ,'tonnetz_2' ,'tonnetz_3' ,'tonnetz_4' ,'tonnetz_5' ,'tonnetz_6']

In [None]:
# Now fill only those
df_filled = merged_df.fillna("", subset=existing_cols)

In [12]:
df_filled = merged_df.fillna(0.0, subset=assembler_input_cols)

In [13]:
# Count of nulls in each column
null_counts = merged_df.select([col(c).isNull().cast("int").alias(c) for c in merged_df.columns]) \
                .groupBy().sum() \
                .collect()[0].asDict()
print(null_counts)

{'sum(musicbrainz_id)': 0, 'sum(genius_lyrics)': 0, 'sum(genius_url)': 0, 'sum(lastfm_wiki_summary)': 0, 'sum(lastfm_wiki_content)': 0, 'sum(goemotion_sadness)': 0, 'sum(goemotion_realization)': 0, 'sum(goemotion_neutral)': 0, 'sum(goemotion_love)': 0, 'sum(goemotion_grief)': 0, 'sum(goemotion_amusement)': 0, 'sum(goemotion_gratitude)': 0, 'sum(goemotion_disappointment)': 0, 'sum(goemotion_surprise)': 0, 'sum(goemotion_nervousness)': 0, 'sum(goemotion_embarrassment)': 0, 'sum(goemotion_remorse)': 0, 'sum(goemotion_joy)': 0, 'sum(goemotion_fear)': 0, 'sum(goemotion_excitement)': 0, 'sum(goemotion_anger)': 0, 'sum(goemotion_pride)': 0, 'sum(goemotion_caring)': 0, 'sum(goemotion_disgust)': 0, 'sum(goemotion_confusion)': 0, 'sum(goemotion_optimism)': 0, 'sum(goemotion_relief)': 0, 'sum(goemotion_desire)': 0, 'sum(goemotion_annoyance)': 0, 'sum(goemotion_approval)': 0, 'sum(goemotion_admiration)': 0, 'sum(goemotion_disapproval)': 0, 'sum(goemotion_curiosity)': 0, 'sum(nrc_anger)': 0, 'sum(n

In [10]:
# Filter only columns that exist in merged_df
existing_cols = [col for col in assembler_input_cols if col in merged_df.columns]



In [None]:
existing_cols

In [None]:
missing_cols = [col for col in assembler_input_cols if col not in merged_df.columns]
print("Missing columns:", missing_cols)

## Missing values

In [19]:
df_filled.show(1, truncate=False)

+------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Assemble Numeric Features into a Feature Vector

In [15]:
from pyspark.ml.feature import VectorAssembler

# Filter only columns that exist in merged_df
existing_cols = [col for col in assembler_input_cols if col in merged_df.columns]

numeric_cols = [field.name for field in merged_df.schema.fields
                if isinstance(field.dataType, int or float) and field.name in existing_cols]

# Fill missing values (assuming this step is needed before assembling)
df_filled = merged_df.fillna("", subset=numeric_cols)

# Create VectorAssembler
assembler = VectorAssembler(
    inputCols=numeric_cols,
    outputCol="features"
)

# Apply the assembler to create the 'features' column
df_vectorized = assembler.transform(df_filled)


## Cosine

In [16]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Assuming df_vectorized has a "features" column with the assembled feature vectors


In [17]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import numpy as np

# Function to calculate cosine similarity
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm1 = np.linalg.norm(vec1.toArray())
    norm2 = np.linalg.norm(vec2.toArray())
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0
    return dot_product / (norm1 * norm2)

# UDF to apply cosine similarity
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Generate the pairwise cosine similarity between the features of different rows
# Self-join the dataframe for pairwise comparison
df_pairs = df_vectorized.alias("df1").join(df_vectorized.alias("df2"), col("df1.musicbrainz_id") < col("df2.musicbrainz_id"))

# Compute cosine similarity
df_similarities = df_pairs.withColumn(
    "cosine_similarity",
    cosine_similarity_udf(col("df1.features"), col("df2.features"))
)



In [18]:
# Show the top similarities
df_similarities.select("df1.musicbrainz_id", "df2.musicbrainz_id", "cosine_similarity").show()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\spark-3.5.5\python\lib\pyspark.zip\pyspark\worker.py", line 1100, in main
pyspark.errors.exceptions.base.PySparkRuntimeError: [PYTHON_VERSION_MISMATCH] Python in worker has different version (3, 11) than that in driver 3.12, PySpark cannot run with different minor versions.
Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.
