In [None]:
import pandas as pd
import numpy as np
import gdown
# Source file on Google Drive:
# https://drive.google.com/file/d/1JhMQl_ZKg7PQjB-I_I41ngH1fLgFPlz_/view?usp=drive_link
file_id = "1JhMQl_ZKg7PQjB-I_I41ngH1fLgFPlz_"
# Direct-download URL built from the same file id as the share link above.
download_url = f"https://drive.google.com/uc?id={file_id}"

# Local path the downloaded dataset is written to.
output = "data.csv"
gdown.download(download_url, output, quiet=False)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# NOTE(review): the path is repeated as a literal here instead of reusing `output`.
df = pd.read_csv("data.csv", encoding='ISO-8859-1')

In [None]:
# Load the downloaded CSV (read as ISO-8859-1) and preview the first rows.
df = pd.read_csv("data.csv", encoding='ISO-8859-1')
df.head()

In [None]:
# df = pd.read_csv("data.csv", encoding='cp1252')

In [None]:
# Drop every row with a missing value in any column, modifying df in place.
# The index is not reset, so row labels may no longer equal row positions.
df.dropna(inplace=True)

In [None]:
# Peek at the first raw value of the `artists` column before cleaning it.
a=df["artists"].head(1)
a

In [None]:
def clean_artists_column(artist_string):
    """Turn a comma-separated artist string into a list of unique tokens.

    Each comma-separated item is stripped of surrounding whitespace and of any
    leading Spotify URL/URI prefix. Items equal to "artist" (case-insensitive)
    are dropped, and duplicates are removed while keeping first-seen order.

    Args:
        artist_string: Raw cell value; a string, or a missing value (NaN/None).

    Returns:
        list[str]: Cleaned tokens, or an empty list for a missing value.
    """
    if pd.isna(artist_string):
        return []

    prefixes_to_remove = (
        "https://api.spotify.com/v1/artists/",
        "spotify:artist:",
        "https://open.spotify.com/artist/",
    )

    cleaned_items = []
    for raw_item in artist_string.split(","):
        item = raw_item.strip()
        for prefix in prefixes_to_remove:
            if item.startswith(prefix):
                # Slice off only the leading prefix; str.replace would also
                # delete later occurrences of the same text inside the item.
                item = item[len(prefix):]
        if item.lower() != "artist":
            cleaned_items.append(item)

    # dict keys keep insertion order, so this de-duplicates order-stably.
    return list(dict.fromkeys(cleaned_items))
# Store the cleaned per-row artist list in a new column; `artists` itself is left untouched here.
df['updated_artists'] = df['artists'].apply(clean_artists_column)

In [None]:
# Split the ', '-separated market string into a list of market codes per row.
# This overwrites `available_markets` and widens `df`, so the cell is meant to
# be run once per fresh load of the data.
df['available_markets'] = df['available_markets'].str.split(', ')
from sklearn.preprocessing import MultiLabelBinarizer

# One 0/1 indicator column per distinct market code, aligned on df's index.
mlb = MultiLabelBinarizer()
market_dummies = pd.DataFrame(mlb.fit_transform(df['available_markets']),
                              columns=mlb.classes_,
                              index=df.index)
df = df.join(market_dummies)

In [None]:
# NOTE(review): this bare expression is not the last line of the cell, so nothing is displayed.
df.columns
# Drop the listed URI/href/URL-style columns together with the raw `artists`
# and `available_markets` columns (their processed forms were added earlier).
df.drop(columns=["artists","uri.x","is_local","type.x","href","album.artists","album.href","album.images","album.uri","album.external_urls.spotify","external_urls.spotify","uri.y","track_href","analysis_url","preview_url","album.available_markets","available_markets"],inplace=True)

In [None]:
# Preview the frame after the column drop.
df.head()

In [None]:
# Count how many rows fall under each release-date precision value.
df["album.release_date_precision"].value_counts()

In [None]:
# Second pass: drop any row that still has a missing value. As before, the
# index is not reset, so labels and row positions can differ from here on.
df.dropna(inplace=True)

In [None]:
# for col in df_encoded.columns:
#     print(f"Unique values in column '{col}': {df_encoded[col].nunique()}")


In [None]:
# # for col in df_encoded.columns:
#     print(f"NAN values in column '{col}': {df_encoded[col].isna().sum()}")

In [None]:
# Numeric audio-feature columns fed to the scaler and the autoencoder.
features = [
    'danceability', 'energy', 'valence', 'tempo',
    'acousticness', 'instrumentalness', 'liveness', 'speechiness'
]

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# # Load your data
# df = pd.read_csv('your_dataset.csv')  # or however you have it

# Work on a copy of just the columns listed in `features`.
X = df[features].copy()

# Standardise each feature column (zero mean, unit variance) before training.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
!pip install tensorflow
import tensorflow as tf
from tensorflow.keras import layers, models

# Seed Python, NumPy and TensorFlow in one call so weight initialisation and
# batch shuffling are repeatable; without it every run yields different
# embeddings and therefore different recommendations.
tf.keras.utils.set_random_seed(42)

# Width of the input and output layers: one unit per scaled feature column.
input_dim = X_scaled.shape[1]

# Encoder: input_dim -> 64 -> 32 -> 16. The named 16-unit layer is the
# embedding that is extracted after training.
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(64, activation='relu')(input_layer)
encoded = layers.Dense(32, activation='relu')(encoded)
embedding = layers.Dense(16, activation='relu', name='embedding_layer')(encoded)

# Decoder: mirror of the encoder, with a linear output because the targets
# are standardised values that can be negative.
decoded = layers.Dense(32, activation='relu')(embedding)
decoded = layers.Dense(64, activation='relu')(decoded)
output_layer = layers.Dense(input_dim, activation='linear')(decoded)

autoencoder = models.Model(inputs=input_layer, outputs=output_layer)

autoencoder.compile(optimizer='adam', loss='mse')

# Train the network to reconstruct its own input (input and target are both X_scaled).
autoencoder.fit(X_scaled, X_scaled, epochs=30, batch_size=32, verbose=1)

In [None]:
# Reuse the trained encoder half: a model from the autoencoder's input to the
# output of the layer named 'embedding_layer'.
embedding_model = models.Model(inputs=autoencoder.input,
                               outputs=autoencoder.get_layer('embedding_layer').output)

# One embedding row per row of X_scaled, in the same order.
song_embeddings = embedding_model.predict(X_scaled)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_similar_songs(song_index, top_n=5):
    """Return the `top_n` songs whose embeddings are closest to one song.

    Reads the notebook-level `song_embeddings` and `df` at call time; row i of
    `song_embeddings` is taken to describe row position i of `df`.

    Args:
        song_index: Row position (not index label) of the query song.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with `name`, `updated_artists` and `album.name` of the
        matches, most similar first.

    Raises:
        IndexError: If `song_index` is out of range.
    """
    # Normalise negative positions (and reject out-of-range ones) so the query
    # row can be compared against argsort's non-negative positions.
    query_pos = range(len(song_embeddings))[song_index]

    similarities = cosine_similarity(
        [song_embeddings[query_pos]],
        song_embeddings
    )[0]

    # Drop the query row explicitly: with duplicate or tied embeddings it is
    # not guaranteed to be ranked first, so skipping rank 0 could return it.
    ranked = np.argsort(similarities)[::-1]
    similar_indices = ranked[ranked != query_pos][:top_n]
    return df.iloc[similar_indices][['name', 'updated_artists', 'album.name']]

In [None]:
def recommend_similar_songs(song_index, top_n=5):
    """Return the `top_n` songs whose embeddings are closest to one song.

    Reads the notebook-level `song_embeddings` and `df` at call time; row i of
    `song_embeddings` is taken to describe row position i of `df`.

    Args:
        song_index: Row position (not index label) of the query song.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with `name`, `updated_artists` and `album.name` of the
        matches, most similar first.

    Raises:
        IndexError: If `song_index` is out of range.
    """
    # Normalise negative positions (and reject out-of-range ones) so the query
    # row can be compared against argsort's non-negative positions.
    query_pos = range(len(song_embeddings))[song_index]

    similarities = cosine_similarity(
        [song_embeddings[query_pos]],
        song_embeddings
    )[0]

    # Drop the query row explicitly: with duplicate or tied embeddings it is
    # not guaranteed to be ranked first, so skipping rank 0 could return it.
    ranked = np.argsort(similarities)[::-1]
    similar_indices = ranked[ranked != query_pos][:top_n]
    return df.iloc[similar_indices][['name', 'updated_artists', 'album.name']]

# Example: recommend songs similar to the track at row position 0.
recommend_similar_songs(0)


In [None]:
def recommend_by_name(song_name, top_n=5):
    """Recommend songs similar to the first track whose name equals `song_name`.

    Args:
        song_name: Exact (case-sensitive) track name to look up in `df['name']`.
        top_n: Number of recommendations to return.

    Returns:
        Whatever `recommend_similar_songs` returns for the matched row.

    Raises:
        IndexError: If no track has that name.
    """
    # Use the row *position* of the match, not its index label: the embeddings
    # and df.iloc are positional, and labels stop matching positions once rows
    # have been dropped without resetting the index.
    match_positions = np.flatnonzero((df['name'] == song_name).to_numpy())
    song_index = int(match_positions[0])
    return recommend_similar_songs(song_index, top_n=top_n)

# Example: look up a track by its exact title and show its neighbours.
recommend_by_name("Still Got Time (feat. PARTYNEXTDOOR)")

In [None]:
def recommend_by_name(song_name, top_n=5):
    """Recommend songs similar to the first track matching `song_name`.

    The name comparison is case-insensitive.

    Args:
        song_name: Track name to look up in `df['name']`.
        top_n: Number of recommendations to return.

    Returns:
        Whatever `recommend_similar_songs` returns for the matched row, or a
        "not found" message string when no track matches.
    """
    is_match = (df['name'].str.lower() == song_name.lower()).to_numpy()
    # Row *positions* of the matches, not index labels: the embeddings and
    # df.iloc are positional, and labels stop matching positions once rows
    # have been dropped without resetting the index.
    match_positions = np.flatnonzero(is_match)
    if match_positions.size == 0:
        return f"Song '{song_name}' not found."
    song_index = int(match_positions[0])
    return recommend_similar_songs(song_index, top_n=top_n)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the song embeddings to 2-D for plotting; random_state fixes the layout.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
reduced = tsne.fit_transform(song_embeddings)

# One translucent point per song.
plt.figure(figsize=(12, 8))
plt.scatter(reduced[:, 0], reduced[:, 1], alpha=0.5)
plt.title("t-SNE visualization of song embeddings")
plt.show()

In [None]:
# Reconstruction loss measured on the same data the autoencoder was trained on
# (there is no held-out split in this notebook).
loss = autoencoder.evaluate(X_scaled, X_scaled)
print(f"Autoencoder reconstruction loss: {loss:.4f}")

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

# Cluster the embeddings into 10 groups (fixed seed) to gauge how separable they are.
kmeans = KMeans(n_clusters=10, random_state=42)
labels = kmeans.fit_predict(song_embeddings)

# Silhouette score of that clustering, computed on the same embeddings.
score = silhouette_score(song_embeddings, labels)
print(f"Silhouette score: {score:.4f}")

In [None]:
# Colour the existing t-SNE layout by popularity; values that cannot be parsed become NaN.
colors = pd.to_numeric(df['popularity'], errors='coerce')  # converts strings to numbers
plt.figure(figsize=(12, 8))
plt.scatter(reduced[:, 0], reduced[:, 1], c=colors, cmap='viridis', alpha=0.5)
plt.title("t-SNE colored by Popularity")
plt.colorbar(label='Popularity')
plt.show()

In [None]:
# Build one free-text description per song (title, artists and three rounded
# audio features) to feed the sentence-embedding model.
df['description'] = (
    df['name'] + ' by ' + df['updated_artists'].apply(lambda x: ', '.join(x)) +
    '. Energetic: ' + df['energy'].round(2).astype(str) +
    ', Danceable: ' + df['danceability'].round(2).astype(str) +
    ', Mood: ' + df['valence'].round(2).astype(str)
)


In [None]:
!pip install sentence_transformers
!pip install tf-keras
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight, effective

# Encode every description. This rebinds `song_embeddings` (previously the
# autoencoder embeddings), so functions reading that global use the text
# embeddings from here on.
song_embeddings = model.encode(df['description'].tolist(), show_progress_bar=True)

In [None]:
def recommend_by_prompt(prompt, top_n=5):
    """Return the `top_n` songs whose description embedding best matches `prompt`.

    Uses the notebook-level `model`, `song_embeddings` and `df` at call time.

    Args:
        prompt: Free-text query.
        top_n: Number of rows to return.

    Returns:
        DataFrame with `name`, `updated_artists` and `album.name`, best match first.
    """
    result_columns = ['name', 'updated_artists', 'album.name']

    # Embed the prompt and score it against every song.
    prompt_vector = model.encode([prompt])[0]
    scores = cosine_similarity([prompt_vector], song_embeddings)[0]

    # Ascending argsort reversed gives best-first row positions.
    best_first = scores.argsort()[::-1]
    chosen = best_first[:top_n]
    return df.iloc[chosen][result_columns]

In [None]:
# Alternative prompts kept for manual experimentation:
# recommend_by_prompt("happy upbeat party song", top_n=5)
# # recommend_by_prompt("something mellow and sad", top_n=5)
# # recommend_by_prompt("romantic slow dance", top_n=5)
# recommend_by_prompt("Indian sad song", top_n=5)
# Prompt currently being tried:
recommend_by_prompt("Indian happy  song", top_n=5)

In [None]:
# Query text used by the step-by-step country-bias cells that follow.
prompt = "Indian sad song"  # or any test phrase

In [None]:
# Embed the prompt and compute its cosine similarity to every song embedding.
query_embedding = model.encode([prompt])[0]
sims = cosine_similarity([query_embedding], song_embeddings)[0]

In [None]:
# Add a fixed bonus to the similarity of every song whose 'IN' market indicator is set.
country_bias = 0.15  # Strength of boost; tune as needed
bias = df['IN'] * country_bias  # 1 if in India, else 0
biased_scores = sims + bias

In [None]:
top_n = 5  # or however many results you want
# Row positions of the highest biased scores, best first.
top_indices = biased_scores.argsort()[::-1][:top_n]
results = df.iloc[top_indices][['name', 'updated_artists', 'album.name', 'description']]

In [None]:
def recommend_biased_by_country(prompt, country_code='IN', country_boost=0.15, top_n=5):
    """Rank songs by prompt similarity plus a bonus for one market.

    Uses the notebook-level `model`, `song_embeddings` and `df` at call time.

    Args:
        prompt: Free-text query.
        country_code: Name of the market indicator column in `df` to favour.
        country_boost: Amount added to the score of songs flagged for that market.
        top_n: Number of rows to return.

    Returns:
        DataFrame with `name`, `updated_artists`, `album.name` and
        `description`, highest score first.
    """
    result_columns = ['name', 'updated_artists', 'album.name', 'description']

    prompt_vector = model.encode([prompt])[0]
    similarity = cosine_similarity([prompt_vector], song_embeddings)[0]

    # Soft preference: flagged songs get a constant bonus, others get nothing.
    market_bonus = df[country_code] * country_boost
    boosted = similarity + market_bonus

    best_first = boosted.argsort()[::-1]
    return df.iloc[best_first[:top_n]][result_columns]

In [None]:
# Example: an artist-name prompt with a strong boost for the 'IN' market column.
recommend_biased_by_country("Arijit Singh", country_code='IN', country_boost=0.9, top_n=5)

In [None]:
pip install langdetect

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from langdetect import detect, DetectorFactory

# Fix langdetect's seed so repeated detections of the same text agree.
DetectorFactory.seed = 0  # Make results consistent

In [None]:
# Maps a detected language code to the market column name used for boosting.
# Several languages share 'IN'; codes not listed here are handled by each
# caller's own fallback.
language_to_country = {
    'hi': 'IN',
    'pa': 'IN',
    'ta': 'IN',
    'te': 'IN',
    'en': 'US',
    'es': 'ES',
    'fr': 'FR',
    'de': 'DE',
    'ja': 'JP',
    'ko': 'KR'
}

In [None]:
def detect_language(title):
    """Return the detected language code for `title`, or 'unknown' on failure.

    Args:
        title: Text to classify (a track title in this notebook).

    Returns:
        str: Whatever `detect` returns, or the literal 'unknown' when
        detection raises an ordinary exception.
    """
    try:
        return detect(title)
    except Exception:
        # Best-effort detection: undetectable titles are labelled instead of
        # aborting the whole column. Catching Exception rather than using a
        # bare except lets KeyboardInterrupt/SystemExit propagate, so a long
        # row-wise apply can still be interrupted.
        return 'unknown'

# Detect the language of every track title, one detect_language call per row.
df['detected_language'] = df['name'].apply(detect_language)
# Map language to market code; unmapped languages, including 'unknown', fall back to 'IN'.
df['country_code'] = df['detected_language'].apply(lambda lang: language_to_country.get(lang, 'IN'))  # fallback: IN

In [None]:
# Rebuild the per-song description, now including the detected language code,
# overwriting the earlier `description` column.
df['description'] = (
    df['name'] + ' by ' + df['updated_artists'].apply(lambda x: ', '.join(x)) +
    '. Language: ' + df['detected_language'] +
    '. Energy: ' + df['energy'].round(2).astype(str) +
    ', Danceability: ' + df['danceability'].round(2).astype(str) +
    ', Valence: ' + df['valence'].round(2).astype(str)
)

In [None]:
# Re-create the sentence model and re-encode the updated descriptions,
# replacing the previous contents of `song_embeddings`.
model = SentenceTransformer('all-MiniLM-L6-v2')
song_embeddings = model.encode(df['description'].tolist(), show_progress_bar=True)

In [None]:
def smart_recommend(prompt, country_boost=0.2, top_n=5):
    """Rank songs by prompt similarity, favouring the market implied by the prompt's language.

    Uses the notebook-level `model`, `song_embeddings`, `df` and
    `language_to_country` at call time.

    Args:
        prompt: Free-text query; its detected language selects the market.
        country_boost: Amount added to the score of songs flagged for that market.
        top_n: Number of rows to return.

    Returns:
        DataFrame with `name`, `updated_artists`, `album.name`, `description`
        and `detected_language`, highest score first.
    """
    query_embedding = model.encode([prompt])[0]
    sims = cosine_similarity([query_embedding], song_embeddings)[0]

    # Detect the prompt language. Detection is best-effort, but only ordinary
    # errors are swallowed: a bare except would also trap KeyboardInterrupt
    # and SystemExit.
    try:
        prompt_lang = detect(prompt)
    except Exception:
        prompt_lang = 'unknown'

    # Unmapped or undetected languages fall back to the 'IN' market.
    prompt_country = language_to_country.get(prompt_lang, 'IN')

    # Boost songs flagged for that market; no boost when df has no such column.
    # The indicator is converted to a NumPy array so scoring stays purely
    # positional, matching the row order of `song_embeddings`.
    if prompt_country in df.columns:
        bias = df[prompt_country].to_numpy() * country_boost
    else:
        bias = 0
    biased_scores = sims + bias

    top_indices = biased_scores.argsort()[::-1][:top_n]
    return df.iloc[top_indices][['name', 'updated_artists', 'album.name', 'description', 'detected_language']]

In [None]:
# Example: prompt combining a mood, a language name and an artist name.
smart_recommend("happy hindi song Atif Aslam")

In [None]:
# Example: short genre-style prompt.
smart_recommend("Punjabi party track")
