<a href="https://colab.research.google.com/github/Pawan-Pokhrel/Music-Playlist-Recommender-System/blob/main/Hybrid_Music_Recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Environment Setup

In [12]:
!pip install pandas numpy scikit-learn



In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD

## 2. Load Dataset

In [14]:
df = pd.read_csv('output_playlists_tracks.csv')
df.head()

Unnamed: 0,playlist_id,playlist_name,playlist_num_tracks,playlist_num_albums,playlist_num_followers,playlist_modified_at,track_pos,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms,album_name
0,0,Throwbacks,52,47,1,1493424000,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook
1,0,Throwbacks,52,47,1,1493424000,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone
2,0,Throwbacks,52,47,1,1493424000,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit)
3,0,Throwbacks,52,47,1,1493424000,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified
4,0,Throwbacks,52,47,1,1493424000,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot


## 3. Preprocessing

In [15]:
# Remove duplicate track entries per playlist
df = df.drop_duplicates(subset=['playlist_id', 'track_uri'])

# Handle missing values in critical columns
df = df.dropna(subset=['track_name', 'artist_name', 'album_name'])

# Fill optional numerical fields
df['duration_ms'] = df['duration_ms'].fillna(df['duration_ms'].median())
df['track_pos'] = df['track_pos'].fillna(df['track_pos'].median())
df.reset_index(drop=True, inplace=True)
df.shape

(66721, 14)

## 4. TF-IDF Vectorization for song name, artist name and album name

In [16]:
df['track_text'] = (
    df['track_name'] + ' ' + df['artist_name'] + ' ' + df['album_name']
)

tfidf_tracks = TfidfVectorizer(stop_words='english', max_features=5000)
track_tfidf = tfidf_tracks.fit_transform(df['track_text'])

## 5. TF-IDF Vectorization for playlist_title

In [17]:
tfidf_titles = TfidfVectorizer(stop_words='english', max_features=2000)
playlist_title_tfidf = tfidf_titles.fit_transform(df['playlist_name'])

## 6. Collaborative Filtering (Matrix Factorization – SVD)

In [18]:
from scipy.sparse import csr_matrix

# 1. Create a mapping for unique playlist_id values to integer indices
df['playlist_idx'] = df['playlist_id'].astype('category').cat.codes

# 2. Create a mapping for unique track_uri values to integer indices
df['track_idx'] = df['track_uri'].astype('category').cat.codes

# 3. Determine the number of unique playlists and tracks
num_playlists = df['playlist_idx'].nunique()
num_tracks = df['track_idx'].nunique()

# 4. Create a sparse interaction matrix
# Rows: playlist_idx, Columns: track_idx, Data: 1 (implicit rating)
interaction_matrix = csr_matrix(
    (np.ones(len(df)), (df['playlist_idx'], df['track_idx'])),
    shape=(num_playlists, num_tracks)
)

# 5. Initialize TruncatedSVD model
svd_model = TruncatedSVD(n_components=100, random_state=42)

# 6. Fit the TruncatedSVD model to the sparse interaction matrix
svd_model.fit(interaction_matrix)

# 7. Reconstruct the rating matrix
reconstructed_matrix = svd_model.inverse_transform(
    svd_model.transform(interaction_matrix)
)

print("TruncatedSVD model fitted and rating matrix reconstructed.")
print(f"Original interaction matrix shape: {interaction_matrix.shape}")
print(f"Reconstructed matrix shape: {reconstructed_matrix.shape}")

TruncatedSVD model fitted and rating matrix reconstructed.
Original interaction matrix shape: (1000, 34443)
Reconstructed matrix shape: (1000, 34443)


## 7. Feature Engineering for Supervised Learning

In [19]:
df['label'] = 1

# --- Generate Negative Samples ---
all_unique_tracks = df['track_uri'].unique()
negative_data_list = []

# Pre-compute playlist information for faster lookup
playlist_info_map = df[['playlist_id', 'playlist_name', 'playlist_num_tracks',
                        'playlist_num_albums', 'playlist_num_followers',
                        'playlist_modified_at', 'playlist_idx']].drop_duplicates('playlist_id').set_index('playlist_id')

# Pre-compute track information for faster lookup
track_details_map = df[['track_uri', 'artist_name', 'artist_uri', 'track_name',
                         'album_uri', 'album_name', 'duration_ms', 'track_idx']].drop_duplicates('track_uri').set_index('track_uri')

for playlist_id in df['playlist_id'].unique():
    current_playlist_tracks = df[df['playlist_id'] == playlist_id]['track_uri'].unique()
    num_positive_tracks = len(current_playlist_tracks)

    potential_negative_tracks = np.setdiff1d(all_unique_tracks, current_playlist_tracks)

    if len(potential_negative_tracks) > 0:
        num_neg_to_sample = min(num_positive_tracks, len(potential_negative_tracks))
        selected_negative_tracks = np.random.choice(potential_negative_tracks, num_neg_to_sample, replace=False)

        playlist_current_info = playlist_info_map.loc[playlist_id]

        for neg_track_uri in selected_negative_tracks:
            track_info_row = track_details_map.loc[neg_track_uri]

            negative_data_list.append({
                'playlist_id': playlist_id,
                'playlist_name': playlist_current_info['playlist_name'],
                'playlist_num_tracks': playlist_current_info['playlist_num_tracks'],
                'playlist_num_albums': playlist_current_info['playlist_num_albums'],
                'playlist_num_followers': playlist_current_info['playlist_num_followers'],
                'playlist_modified_at': playlist_current_info['playlist_modified_at'],
                'track_pos': 0,
                'artist_name': track_info_row['artist_name'],
                'track_uri': neg_track_uri,
                'artist_uri': track_info_row['artist_uri'],
                'track_name': track_info_row['track_name'],
                'album_uri': track_info_row['album_uri'],
                'album_name': track_info_row['album_name'],
                'duration_ms': track_info_row['duration_ms'],
                'playlist_idx': playlist_current_info['playlist_idx'],
                'track_idx': track_info_row['track_idx'],
                'label': 0
            })

# Concatenate original positive samples with newly generated negative samples
if negative_data_list:
    negative_df = pd.DataFrame(negative_data_list)
    # Ensure consistent columns before concatenation (important if 'track_text' etc. were not present)
    all_cols = list(df.columns) # Capture all current columns
    for col in all_cols:
        if col not in negative_df.columns:
            negative_df[col] = np.nan
    df = pd.concat([df, negative_df], ignore_index=True)

# Re-calculate 'track_text' for the expanded DataFrame (required for TFIDF)
df['track_text'] = (
    df['track_name'] + ' ' + df['artist_name'] + ' ' + df['album_name']
).fillna('') # Fill NaN with empty string to avoid errors in TFIDF

# Re-fit TFIDF vectorizers on the expanded data
# This is crucial because `track_tfidf` and `playlist_title_tfidf` were based on the original df
tfidf_tracks = TfidfVectorizer(stop_words='english', max_features=5000)
track_tfidf = tfidf_tracks.fit_transform(df['track_text'])

tfidf_titles = TfidfVectorizer(stop_words='english', max_features=2000)
playlist_title_tfidf = tfidf_titles.fit_transform(df['playlist_name'].fillna(''))

# Replace content_score calculation with TruncatedSVD
svd_content = TruncatedSVD(n_components=1, random_state=42)
df['content_score'] = svd_content.fit_transform(track_tfidf)

# Replace nlp_score calculation with TruncatedSVD
svd_nlp = TruncatedSVD(n_components=1, random_state=42)
df['nlp_score'] = svd_nlp.fit_transform(playlist_title_tfidf)

# Collaborative filtering score (OPTIMIZED: Using direct NumPy indexing instead of df.apply)
df['cf_score'] = reconstructed_matrix[df['playlist_idx'], df['track_idx']]

## 8. Supervised Relevance Scoring Model

In [20]:
feature_cols = ['content_score', 'nlp_score', 'cf_score', 'duration_ms', 'track_pos']
X = df[feature_cols]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

df['supervised_score'] = model.predict_proba(X)[:, 1]

## 9. Hybrid Recommendation Fusion

In [21]:
scaler = MinMaxScaler()
score_cols = ['content_score', 'nlp_score', 'cf_score', 'supervised_score']
df[score_cols] = scaler.fit_transform(df[score_cols])

# Equal-weight fusion (can be tuned)
df['final_score'] = df[score_cols].mean(axis=1)

## 10. Generate Recommendations

In [23]:
def recommend_tracks(playlist_id, top_n=10):
    subset = df[df['playlist_id'] == playlist_id]
    subset = subset.sort_values('final_score', ascending=False)
    return subset[[
        'track_name', 'artist_name', 'album_name', 'final_score'
    ]].head(top_n)

sample_playlist_id = df['playlist_id'].iloc[0]
recommend_tracks(sample_playlist_id)

Unnamed: 0,track_name,artist_name,album_name,final_score
2,Crazy In Love,Beyoncé,Dangerously In Love (Alben für die Ewigkeit),0.432835
21,Where Is The Love?,The Black Eyed Peas,Elephunk,0.408541
34,Your Love Is My Drug,Kesha,Animal,0.395043
9,Hey Ya! - Radio Mix / Club Mix,OutKast,Speakerboxxx/The Love Below,0.390778
41,Somebody To Love,Justin Bieber,My Worlds,0.384826
30,Party In The U.S.A.,Miley Cyrus,The Time Of Our Lives,0.352126
5,Yeah!,Usher,Confessions,0.34997
28,Whatcha Say,Jason Derulo,Jason Derulo,0.34262
32,Replay,Iyaz,Replay,0.334977
36,One Less Lonely Girl,Justin Bieber,My World,0.333426


## 11. Conclusion
This notebook demonstrates a complete hybrid AI-based music playlist recommendation system combining content-based filtering, collaborative filtering, NLP, supervised learning, and hybrid fusion, aligned with academic AI module requirements.