In [22]:
##Week 9 - Building off of Week 8 (Combining all data into one + adding in lightly weighted genres) 

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
df_test_encoded = pd.read_csv(r'week_5_test_encoded.csv')

df_train_encoded = pd.read_csv(r'week_5_train_encoded.csv')

df_val_encoded = pd.read_csv(r'week_5_validation_encoded.csv')

In [3]:
combined_df = pd.concat([df_train_encoded, df_test_encoded, df_val_encoded], ignore_index=True)

In [4]:
combined_df.columns

Index(['Song', 'Artist', 'Popularity', 'BPM', 'Dance', 'Energy', 'Acoustic',
       'Happy', 'Loud', 'Clean_Lyrics', 'Power', 'sqrt_Acoustic', 'sqrt_BPM',
       'camelot_sin', 'camelot_cos', 'Genre_alternative', 'Genre_country',
       'Genre_dance', 'Genre_electronic', 'Genre_folk', 'Genre_hip hop',
       'Genre_indie', 'Genre_pop', 'Genre_rnb', 'Genre_rock', 'Genre_soul',
       'Subgenre_alternative', 'Subgenre_alternative rnb',
       'Subgenre_alternative rock', 'Subgenre_country', 'Subgenre_country pop',
       'Subgenre_dance', 'Subgenre_dance electronic', 'Subgenre_dance pop',
       'Subgenre_electronic', 'Subgenre_electronic dance',
       'Subgenre_electronic pop', 'Subgenre_folk', 'Subgenre_folk rock',
       'Subgenre_hip hop', 'Subgenre_hip hop 90s', 'Subgenre_hip hop rnb',
       'Subgenre_indie', 'Subgenre_indie alternative', 'Subgenre_indie folk',
       'Subgenre_indie pop', 'Subgenre_indie rock', 'Subgenre_pop',
       'Subgenre_pop 60s', 'Subgenre_pop 70s', 'Subge

In [5]:
print(combined_df.select_dtypes(include=['number']).columns)


Index(['Popularity', 'BPM', 'Dance', 'Energy', 'Acoustic', 'Happy', 'Loud',
       'Power', 'sqrt_Acoustic', 'sqrt_BPM', 'camelot_sin', 'camelot_cos',
       'Genre_alternative', 'Genre_country', 'Genre_dance', 'Genre_electronic',
       'Genre_folk', 'Genre_hip hop', 'Genre_indie', 'Genre_pop', 'Genre_rnb',
       'Genre_rock', 'Genre_soul', 'Subgenre_alternative',
       'Subgenre_alternative rnb', 'Subgenre_alternative rock',
       'Subgenre_country', 'Subgenre_country pop', 'Subgenre_dance',
       'Subgenre_dance electronic', 'Subgenre_dance pop',
       'Subgenre_electronic', 'Subgenre_electronic dance',
       'Subgenre_electronic pop', 'Subgenre_folk', 'Subgenre_folk rock',
       'Subgenre_hip hop', 'Subgenre_hip hop 90s', 'Subgenre_hip hop rnb',
       'Subgenre_indie', 'Subgenre_indie alternative', 'Subgenre_indie folk',
       'Subgenre_indie pop', 'Subgenre_indie rock', 'Subgenre_pop',
       'Subgenre_pop 60s', 'Subgenre_pop 70s', 'Subgenre_pop 80s',
       'Subgenre_pop

Looking at the numerical columns for our dataset we see that most of them come from the genre one-hot encoding. For now we will continue to omit these columns from the model. 

Last week's model that incorporated the musical numerical features performed slightly better than the model's using just embedding. However, it was noticed that the best model incorporated BPM, sqrt_BPM, Acoustic, and aqrt_Acoustic. This is not good becuase those metrics were double weighted in the model. This week, part of our updated model is addressing that which we will do when creating the model. 

Our model will continue to use the Glove-Twitter (Word2Vec) embeddings as they provided a better accuracy score compared to the original embedding method.


In [28]:
#Using word2vec this week (from week 7 notebook)

import gensim.downloader as api
model = api.load("glove-twitter-200")

In [29]:
#Function that converts a list of tokenized words into a single (fixed length) vector using the glove-twitter-200 model
def get_embedding_twitter(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [30]:
#Creates a column called 'Twitter_Embeddings" in all three of the dataframes/datasets by applying the get_embedding_twitter() function to the tokens list

combined_df['Twitter_Embeddings'] = combined_df['Tokens'].apply(lambda tokens: get_embedding_twitter(tokens, model))

In [31]:
#Allows us to work with the vector numerically
def str_to_embedding(s):
    # Remove [ and ] characters
    s = s.strip('[]')
    # Split by spaces and convert each to float
    return np.array([float(x) for x in s.split()])

# Example usage:
embedding_str = "[-5.49920015e-02 -4.24173214e-02 1.20421372e-02]"
embedding_vector = str_to_embedding(embedding_str)
print(embedding_vector)

[-0.054992   -0.04241732  0.01204214]


In [32]:
#Converts the string in 'Embeddings' column into numpy array using the above function
combined_df['Embeddings'] = combined_df['Embeddings'].apply(str_to_embedding)

In [33]:
combined_df.head()

Unnamed: 0,Song,Artist,Popularity,BPM,Dance,Energy,Acoustic,Happy,Loud,Clean_Lyrics,...,Subgenre_rock alternative,Subgenre_soul,Subgenre_soul 60s,Subgenre_soul 70s,Subgenre_soul 80s,Subgenre_soul disco,Subgenre_soul rnb,Tokens,Embeddings,Twitter_Embeddings
0,Stretch You Out (feat. A Boogie wit da Hoodie),"Summer Walker,A Boogie Wit da Hoodie",0.5,0.317647,0.623529,0.494737,0.252525,0.231579,0.76,get london da track niggas insecure claim enou...,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,"['get', 'london', 'da', 'track', 'niggas', 'in...","[-0.0549920015, -0.0424173214, 0.120421372, -0...","[0.073948905, -0.1129652, -0.046067417, 0.0734..."
1,On Melancholy Hill,Gorillaz,0.8,0.417647,0.682353,0.726316,0.0,0.578947,0.8,melancholy hill good side consumerism like muc...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"['melancholy', 'hill', 'good', 'side', 'consum...","[-0.0140985716, -0.0445414186, 0.111462869, 0....","[0.10808624, -0.1391475, -0.055444743, 0.04700..."
2,WHO CARES?,Rex Orange County,0.35,0.217647,0.835294,0.242105,0.59596,0.589474,0.8,mmmm mmmm mmmm mmmm first time try free doubt ...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"['mmmm', 'mmmm', 'mmmm', 'mmmm', 'first', 'tim...","[-0.00185818132, -0.0364763662, 0.147965446, 0...","[0.07740225, -0.12858482, -0.05046654, 0.07772..."
3,Solid,Ashford & Simpson,0.45,0.305882,0.823529,0.442105,0.272727,0.978947,0.56,love sake mistake oh forgave soon learn trust ...,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,"['love', 'sake', 'mistake', 'oh', 'forgave', '...","[-0.0136662908, -0.0114702322, 0.119016811, 0....","[0.053561937, -0.07968545, -0.08123524, 0.0626..."
4,BREAK MY SOUL,Beyoncé,0.55,0.388235,0.682353,0.884211,0.060606,0.863158,0.84,'bout explode take load bend bust open ya make...,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[""'bout"", 'explode', 'take', 'load', 'bend', '...","[-0.0563979559, -0.0503409989, 0.103356943, -0...","[0.06484171, -0.08521436, -0.048548214, 0.1225..."


In [34]:
print(type(combined_df['Embeddings'].iloc[0]))
print(type(combined_df['Twitter_Embeddings'].iloc[0]))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [35]:
#Converts the "Embeddings" column into a 2D numpy array (matrix) where each row represents a song
X = np.stack(combined_df['Embeddings'].values)

Fitting the KNN model - NEW MODEL
1. Excluding 'Acoustic' and 'BPM' as the sqrt transformations of those metrics give a more normal distrubution
2. Weighing lyric embeddings and music features the same -- last week embeddings were weighed equal to any other metric such as BPM. We think lyrics are a significant part of music (half) and therefore in this model the lyric embeddings will be weighed 50% of the total distance and the remaining numerical metadata will collectively weigh the other 50% of the distance. 
3. Now including genres slightly - hopefully to get country songs better aligned

In [36]:
#Omit subgenre, BPM, and Acoustic, and embeddings (will add them later)

columns_to_exclude = [col for col in combined_df.columns 
                      if col.startswith('Subgenre_') or 
                      col in ['Song', 'Artist', 'Clean_Lyrics', 'Tokens', 'Embeddings', 
                              'Twitter_Embeddings', 'BPM', 'Acoustic']]


In [37]:
# Get numeric metadata features - numerical data with only the columns we want (no embeddings)
X_meta = combined_df.drop(columns=columns_to_exclude).select_dtypes(include=[np.number])

# Extract Twitter embeddings
X_embed = np.stack(combined_df['Twitter_Embeddings'].values)

# Extract genre features (one-hot encoded)
X_genre = combined_df.filter(like='Genre_')

# Apply weighting so that each block contributes 50% to the distance
m = X_meta.shape[1]   # number of metadata features
g = X_genre.shape[1]
e = X_embed.shape[1]  # number of embedding dimensions

# Weighting: ensure total sums to 1.0
meta_weight = np.sqrt(0.45 / m)
genre_weight = np.sqrt(0.1 / g)
embed_weight = np.sqrt(0.45 / e)

# Weighted feature arrays
X_full = np.hstack([
    X_meta * meta_weight,
    X_genre * genre_weight,
    X_embed * embed_weight
])


In [38]:
# Fit the KNN_Weighted model
KNN_Weighted = NearestNeighbors(n_neighbors=6, metric='cosine')
KNN_Weighted.fit(X_full)

0,1,2
,n_neighbors,6
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [39]:
# Step: Example recommendation for one song from the combined dataset
test_song_idx = 110  # Index of the song to base recommendations on
distances, indices = KNN_Weighted.kneighbors([X_full[test_song_idx]])

print(f"\nRecommendations for song: {combined_df['Song'].iloc[test_song_idx]} by {combined_df['Artist'].iloc[test_song_idx]}")

# Skip the first match (which will be the song itself)
for rank, (idx, dist) in enumerate(zip(indices[0][1:], distances[0][1:]), start=1):
    song = combined_df['Song'].iloc[idx]
    artist = combined_df['Artist'].iloc[idx]
    print(f"{rank}. {song} by {artist} (distance: {dist:.4f})")



Recommendations for song: Night Shift by Jon Pardi
1. Save a Horse (Ride a Cowboy) by Big & Rich (distance: 0.0067)
2. I Wrote The Book by Morgan Wallen (distance: 0.0104)
3. It's Five O'Clock Somewhere by Alan Jackson,Jimmy Buffett (distance: 0.0105)
4. T-Shirt by Thomas Rhett (distance: 0.0132)
5. What My World Spins Around by Jordan Davis (distance: 0.0140)


In [40]:
# Get top 2 neighbors for each song (so we can skip the self-match)
full_distances, full_indices = KNN_Weighted.kneighbors(X_full, n_neighbors=2)

full_recommendations = []

# Generate recommendations for the first 100 songs
for i in range(100):
    test_song_title = combined_df.iloc[i]['Song']
    test_artist = combined_df.iloc[i]['Artist']
    
    rec_idx = full_indices[i][1]  # [1] skips the self-match at [0]
    rec_song_title = combined_df.iloc[rec_idx]['Song']
    rec_artist = combined_df.iloc[rec_idx]['Artist']
    
    full_recommendations.append({
        'Test Song Title': test_song_title,
        'Test Artist': test_artist,
        'Recommended Song Title': rec_song_title,
        'Recommended Artist': rec_artist
    })

# Save to CSV
full_rec_df = pd.DataFrame(full_recommendations)
full_rec_df.to_csv('top_100_knn_weighted_w_genre_recommendations.csv', index=False)

print("Saved top 100 KNN_Weighted recommendations to 'top_100_knn_weighted_w_genre_recommendations.csv'")


Saved top 100 KNN_Weighted recommendations to 'top_100_knn_weighted_w_genre_recommendations.csv'


In [41]:
import os
import pickle

# Set Downloads folder as working directory
downloads_path = os.path.join(os.environ['USERPROFILE'], 'Downloads')
os.chdir(downloads_path)

# Save the model
with open('knn_weighted_model.pkl', 'wb') as f:
    pickle.dump(KNN_Weighted, f)

print(f"✅ Model saved to: {os.path.join(downloads_path, 'knn_weighted_model.pkl')}")


✅ Model saved to: C:\Users\Ryan\Downloads\knn_weighted_model.pkl


In [42]:
np.save(os.path.expanduser('~/Downloads/X_full.npy'), X_full)
combined_df.to_csv('~/Downloads/combined_df.csv', index=False)


In [6]:
combined_df.to_csv('~/Downloads/combined_df.csv', index=False)