In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

In [39]:
df = pd.read_csv('preprocessed_data.csv')
df.columns

Index(['track_id', 'artists', 'album_name', 'track_name', 'popularity',
       'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness',
       'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'track_genre', 'combined_name'],
      dtype='object')

In [40]:
df.drop('track_id' , axis = 1, inplace=True)

In [55]:
# Define which columns to apply transformations to
numeric_columns = ['popularity', 'duration_ms', 'danceability', 'energy', 'key', 
                   'loudness', 'speechiness', 'acousticness', 'instrumentalness', 
                   'liveness', 'valence', 'tempo']

categorical_columns = ['artists', 'album_name', 'track_name', 'track_genre']

In [56]:
data = df[numeric_columns + categorical_columns]

In [57]:
data

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,artists,album_name,track_name,track_genre
0,73,230666,0.676,0.4610,1.0,-6.746,0.1430,0.03220,0.000001,0.3580,0.715,87.917,Gen Hoshino,comedy,comedy,acoustic
1,55,149610,0.420,0.1660,1.0,-17.235,0.0763,0.92400,0.000006,0.1010,0.267,77.489,Ben Woodward,ghost,ghost acoustic,acoustic
2,57,210826,0.438,0.3590,0.0,-9.734,0.0557,0.21000,0.000000,0.1170,0.120,76.332,Ingrid Michaelson;ZAYN,begin,begin,acoustic
3,71,201933,0.266,0.0596,0.0,-18.515,0.0363,0.90500,0.000071,0.1320,0.143,181.740,Kina Grannis,crazy rich asians,cant help falling love,acoustic
4,82,198853,0.618,0.4430,2.0,-9.681,0.0526,0.46900,0.000000,0.0829,0.167,119.949,Chord Overstreet,hold,hold,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45390,38,462397,0.296,0.7620,2.0,-5.696,0.0548,0.07240,0.000003,0.3740,0.146,139.051,Planetshakers,greater,stay live,world-music
45391,32,250629,0.387,0.5310,8.0,-4.788,0.0290,0.00305,0.000000,0.2010,0.153,146.003,Chris Tomlin,ultimate playlist,cross,world-music
45392,38,312566,0.475,0.8600,10.0,-4.722,0.0421,0.00650,0.000002,0.2460,0.427,113.949,Jesus Culture,revelation songs,love never fails,world-music
45393,39,256026,0.505,0.6870,10.0,-4.375,0.0287,0.08410,0.000000,0.1880,0.382,104.083,Chris Tomlin,see morning,keep singing,world-music


In [58]:
# Normalize numerical features and one-hot encode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),  # Normalize numerical features
        ('cat', OneHotEncoder(), categorical_columns)  # One-Hot Encode categorical features
    ])

# Apply transformations and combine features into a single feature vector
X = preprocessor.fit_transform(data)


In [60]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
# Reduce dimensions
pca = PCA(n_components=3)  # 95% variance explained
X_reduced = pca.fit_transform(X)

In [61]:
# Add reduced dimensions to the original dataframe (just for visualization or inspection)
df['pca1'] = X_reduced[:, 0]
df['pca2'] = X_reduced[:, 1]

# Now apply KNN to find similar songs based on PCA components
knn = NearestNeighbors(n_neighbors=3, algorithm='auto', metric='euclidean')
knn.fit(X_reduced)

In [62]:
from pymongo import MongoClient

# MongoDB connection setup
client = MongoClient("mongodb://localhost:27017/")  # Replace with your MongoDB URI
db = client['MusicRecommender']  # Replace with your database name
collection = db['songs']  # Replace with your collection name

# Fetch all songs data from MongoDB
dataset = list(collection.find())
# Convert MongoDB data to DataFrame
df = pd.DataFrame(dataset)


In [63]:
song = collection.find_one({'track_name': "crazy"}, { '_id': 0})
song

{'artists': 'Seal',
 'album_name': 'best 1991 2004',
 'track_name': 'crazy',
 'popularity': 65,
 'duration_ms': 356520,
 'explicit': False,
 'danceability': 0.633,
 'energy': 0.858,
 'key': 11.0,
 'loudness': -7.42,
 'mode': 0,
 'speechiness': 0.0473,
 'acousticness': 0.208,
 'instrumentalness': 0.0112,
 'liveness': 0.0655,
 'valence': 0.724,
 'tempo': 102.561,
 'track_genre': 'british'}

In [77]:
def recommend_song_from_db(song_name, n_neighbors=11):
    # Query MongoDB to find the song by name
    song = collection.find_one({'track_name': song_name})
    
    if not song:
        print(f"Song with name '{song_name}' not found in the database.")
        return
    
    # Extract song features for the queried song
    song_features = {
        'popularity': song['popularity'],
        'duration_ms': song['duration_ms'],
        'danceability': song['danceability'],
        'energy': song['energy'],
        'key': song['key'],
        'loudness': song['loudness'],
        'speechiness': song['speechiness'],
        'acousticness': song['acousticness'],
        'instrumentalness': song['instrumentalness'],
        'liveness': song['liveness'],
        'valence': song['valence'],
        'tempo': song['tempo'],
        'artists': song['artists'],
        'album_name': song['album_name'],
        'track_name': song['track_name'],
        'track_genre': song['track_genre']
    }
    
    # Convert the song features into a DataFrame to apply the preprocessor
    song_df = pd.DataFrame([song_features])
    
    # Preprocess and scale the song features
    song_processed = preprocessor.transform(song_df)  # This will work because song_df is now a DataFrame
    
    # Apply PCA to the song's features
    song_pca = pca.transform(song_processed)
    
    # Find nearest neighbors using KNN
    distances, indices = knn.kneighbors(song_pca, n_neighbors=n_neighbors)
    
    print(f"Recommendations for '{song_name}':")
    
    data = []
    # Retrieve and print the recommended songs from MongoDB based on indices
    for idx in indices[0]:
        if idx != indices[0][0]:
            recommended_song = df.iloc[idx]
            # Query MongoDB again to get the full details of the recommended song
            recommended_song_data = collection.find_one({'track_name': recommended_song['track_name']}, {'_id': 1})
            
            data.append(recommended_song_data)
            
    return data


In [78]:
# Example: Search for a song by name and get recommendations
search_song = "crazy"  # Replace with the song you want to search for
recommend_song_from_db(search_song, n_neighbors=11)

Recommendations for 'crazy':


[{'_id': ObjectId('679f329ba71d6530e5b50900')},
 {'_id': ObjectId('679f329fa71d6530e5b59352')},
 {'_id': ObjectId('679f329aa71d6530e5b4f2e9')},
 {'_id': ObjectId('679f329ba71d6530e5b50b7b')},
 {'_id': ObjectId('679f329aa71d6530e5b4f156')},
 {'_id': ObjectId('679f329aa71d6530e5b4f136')},
 {'_id': ObjectId('679f329ba71d6530e5b51035')},
 {'_id': ObjectId('679f329ba71d6530e5b516a5')},
 {'_id': ObjectId('679f329aa71d6530e5b4f14f')},
 {'_id': ObjectId('679f329fa71d6530e5b59095')}]

we can query the database using these song ids now