In [10]:
!pip install -q sentence-transformers qdrant-client pandas numpy scikit-learn

In [11]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from google.colab import files


In [12]:
# Load 'spotify_cleaned_dataset.csv' (path relative to the working directory)
# into `df`, the frame every later cell reads from.
df = pd.read_csv(filepath_or_buffer='spotify_cleaned_dataset.csv')
print("Dataset loaded")

✅ Dataset loaded


In [13]:
# Show the header line followed by the column names as a plain Python list.
print("Available columns:", df.columns.tolist(), sep="\n")


📊 Available columns:
['Artist(s)', 'song', 'text', 'Length', 'emotion', 'Genre', 'Album', 'Release Date', 'Key', 'Tempo', 'Loudness (db)', 'Time signature', 'Popularity', 'Energy', 'Danceability', 'Positiveness', 'Speechiness', 'Liveness', 'Acousticness', 'Instrumentalness', 'Good for Party', 'Good for Work/Study', 'Good for Relaxation/Meditation', 'Good for Exercise', 'Good for Running', 'Good for Yoga/Stretching', 'Good for Driving', 'Good for Social Gatherings', 'Good for Morning Routine', 'Release_Year']


In [14]:
# Columns that feed the text passed to the sentence encoder.
text_columns = ['Artist(s)', 'Genre', 'text']

# Build each labelled fragment separately; missing values become empty strings
# so the concatenation never yields NaN for a row.
artist_part = "Artist: " + df['Artist(s)'].fillna("")
genre_part = "Genre: " + df['Genre'].fillna("")
lyrics_part = "Lyrics: " + df['text'].fillna("")

# Final layout per row: "Artist: <a>. Genre: <g>. Lyrics: <t>"
df['combined_text'] = artist_part + ". " + genre_part + ". " + lyrics_part

In [15]:
# Numeric columns that are scaled and later appended to the text embedding.
numeric_cols = [
    'Tempo', 'Loudness (db)', 'Popularity',
    'Energy', 'Danceability', 'Positiveness',
    'Speechiness', 'Liveness', 'Acousticness',
    'Instrumentalness'
]

# MinMaxScaler disregards NaN while fitting and keeps it on transform, so a
# missing value would silently become a NaN component of that song's vector.
# The text columns are NaN-filled before use; the numeric ones are not, so
# fail fast here with the offending columns instead of emitting broken vectors.
missing_counts = df[numeric_cols].isna().sum()
if missing_counts.any():
    raise ValueError(
        "Missing values in numeric feature columns: "
        + str(missing_counts[missing_counts > 0].to_dict())
    )

# Rescale every column to the [0, 1] range (MinMaxScaler's default feature_range).
scaler = MinMaxScaler()
numeric_features = scaler.fit_transform(df[numeric_cols])

# ***Generating Vector Embeddings***

In [16]:
# Sentence-BERT checkpoint used to produce the text embeddings.
model_name = 'all-MiniLM-L6-v2'
model = SentenceTransformer(model_name)

# Report the device situation; when PyTorch sees a CUDA device, move the
# model onto it, otherwise leave the model where it was loaded.
if not torch.cuda.is_available():
    print("No GPU found — running on CPU")
else:
    model.to('cuda')
    print("Using GPU:", torch.cuda.get_device_name())


🧠 Loading model 'all-MiniLM-L6-v2'...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Using GPU: Tesla T4


In [17]:
print("\n🧬 Generating SBERT embeddings...")

# Materialise the combined text column as a plain list of strings, then encode
# it in batches of 128, returning a NumPy array and showing a progress bar.
texts_to_encode = df['combined_text'].tolist()
embeddings = model.encode(
    texts_to_encode,
    show_progress_bar=True,
    convert_to_numpy=True,
    batch_size=128,
)


🧬 Generating SBERT embeddings...


Batches:   0%|          | 0/287 [00:00<?, ?it/s]

In [18]:
# Append the scaled numeric features to the text embedding, column-wise, so
# each row becomes one vector per song.
#
# np.hstack promotes to the widest input dtype. The scaler output is float64
# for ordinary pandas numeric columns (assumption — TODO confirm against the
# dataset), which would widen the whole matrix and inflate memory and the
# saved .npy file. Cast the numeric block to the embeddings' dtype first so
# the result keeps the encoder's precision.
final_embeddings = np.hstack(
    [embeddings, numeric_features.astype(embeddings.dtype, copy=False)]
)
print("📌 Final embedding shape:", final_embeddings.shape)

📌 Final embedding shape: (36671, 394)


In [21]:
# Metadata columns kept alongside each vector.
PAYLOAD_COLS = [
    'Artist(s)',
    'song',
    'Length',
    'Genre',
    'Album',
    'Release_Year',
]

# Select just those columns (all rows, original order) into a separate frame.
payload_df = df.loc[:, PAYLOAD_COLS]

In [24]:
# Persist both artifacts to the working directory: the vector matrix as a
# NumPy binary and the matching metadata as CSV without the index column.
np.save(file="final_embeddings.npy", arr=final_embeddings)
payload_df.to_csv(path_or_buf="song_metadata.csv", index=False)



# **Ensuring both have the same number of rows (axis=0)**

In [26]:
# Display (rows, columns) of the metadata frame; the row count is expected to
# equal the row count of final_embeddings (one metadata row per vector).
payload_df.shape

(36671, 6)

In [27]:
# Display (rows, vector_dim) of the stacked embedding matrix; the row count is
# expected to equal the row count of payload_df.
final_embeddings.shape

(36671, 394)