In [2]:
import numpy as np
import pandas as pd

import time

import matplotlib.pyplot as plt

import spotipy
# from spotipy.oauth2 import SpotifyClientCredentials 
# spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET))

#uncomment above and replace CLIENT_ID and CLIENT_SECRET with the id and secret you get from creating your app with spotify (going through the "Getting Started" instructions in the Spotify Web API documentation)

from sklearn.model_selection import train_test_split
import math



# Spotify Integration

In [None]:
discussions_df = pd.read_csv('data/Discussions.csv', header=0, encoding='utf-8')
ratings_df = pd.read_csv('data/Ratings.csv', header=0, encoding='utf-8')

# creating new columns and intializing each value to 0
discussions_df['EnergyAvg'] = 0
discussions_df['ValenceAvg'] = 0
discussions_df['AcousticnessAvg'] = 0
discussions_df['InstrumentalnessAvg'] = 0

display(discussions_df)

In [None]:
for ind in range(len(discussions_df)):
    results = spotify.album_tracks(discussions_df['SpotifyID'][ind]) # getting tracks from current album via SpotiPy

    # filtering track ids into list
    ids = []
    for track in results['items']:
        ids.append(track['id'])
    numTracks = len(results['items'])

    # getting features from each track in ids via SpotiPy
    features = spotify.audio_features(ids)
    for feature in features['audio_features']:
        # calculating and storing average in df
        discussions_df['EnergyAvg'][ind] += (feature['energy'] / numTracks)
        discussions_df['ValenceAvg'][ind] += (feature['valence'] / numTracks)
        discussions_df['AcousticnessAvg'][ind] += (feature['acousticness'] / numTracks)
        discussions_df['InstrumentalnessAvg'][ind] += (feature['instrumentalness'] / numTracks)
    time.sleep(3) # sleep with arbitrary time of 3 seconds for circumventing rate limit
    
display(discussions_df)

In [None]:
discussions_df.to_csv("data/Discussions_Audio_Features.csv")

In [None]:
discussionsAF_df = pd.read_csv('data/Discussions_Audio_Features.csv', header=0, encoding='utf-8')

ratings_df['WeightedEnergyRating'] = 0
ratings_df['WeightedValenceRating'] = 0
ratings_df['WeightedAcousticnessRating'] = 0
ratings_df['WeightedInstrumentalnessRating'] = 0

display(ratings_df)

In [None]:
merged_df = pd.merge(discussionsAF_df, ratings_df, on='DiscussionID')
merged_df = merged_df.drop(['AlbumName', 'ArtistName', 'Date', 'AvgRating', 'Stdev', 'Attendance', 'RotationGenre', 'OtherGenre','Subgenres','ReleaseYear','FavoriteTrack','Popularity','Tracks','SpotifyID','Image', 'Unnamed: 0', 'FavoriteTrack1', 'FavoriteTrack2', 'FavoriteTrack3'], axis='columns') #cleaning up df

merged_df['WeightedEnergyRating'] = merged_df['Rating'] * merged_df['EnergyAvg']
merged_df['WeightedValenceRating'] = merged_df['Rating'] * merged_df['ValenceAvg']
merged_df['WeightedAcousticnessRating'] = merged_df['Rating'] * merged_df['AcousticnessAvg']
merged_df['WeightedInstrumentalnessRating'] = merged_df['Rating'] * merged_df['InstrumentalnessAvg']
display(merged_df)

# Baseline Estimate with weighted ratings

In [None]:
# Standard mu + b_i + b_u
for AF in ['WeightedEnergyRating', 'WeightedValenceRating', 'WeightedAcousticnessRating', 'WeightedInstrumentalnessRating']:
    X = merged_df.drop(columns=[AF])
    y = merged_df[AF]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train[AF] = y_train

    mu = np.mean(X_train[AF])
    y_pred = []
    for idx, row in X_test.iterrows():
        b_u = np.mean(X_train[X_train['MemberID'] == row['MemberID']][AF]) - mu
        b_u = 0 if math.isnan(b_u) else b_u
        b_i = np.mean(X_train[X_train['DiscussionID'] == row['DiscussionID']][AF]) - mu
        b_i = 0 if math.isnan(b_i) else b_i
        estimate = max(min(int(np.round(mu - b_u - b_i)), 10), 1)
        y_pred.append(estimate)
    y_pred = np.array(y_pred)
    rmse = np.sqrt(np.mean((y_pred - y_test)**2))
    print(AF + " rmse: " + str(rmse))

# User-User Collaborative Filtering with weighted Audio Feature Ratings

In [48]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    retValue = dot_product / (norm_vec1 * norm_vec2) if not math.isnan(dot_product / (norm_vec1 * norm_vec2)) else 0
    return retValue

def count_non_zeros(arr):
    return sum(1 for num in arr if num != 0)

In [49]:
y_pred = []

X = merged_df.drop(columns=['WeightedInstrumentalnessRating'])
y = merged_df['WeightedInstrumentalnessRating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train['WeightedInstrumentalnessRating'] = y_train

pivot_df = X_train.pivot(index='DiscussionID', columns='MemberID', values='WeightedInstrumentalnessRating')
pivot_df = pivot_df.fillna(0)

In [None]:
user = 341
album = 68
user_sims = {}

for member_id in pivot_df.columns:
    if member_id != user:
        print('User:', member_id)
        print('Cosine Sim:', cosine_similarity(pivot_df[user], pivot_df[member_id]))
        user_sims[member_id] = cosine_similarity(pivot_df[user], pivot_df[member_id])

In [53]:
for idx, row in X_test.iterrows():
    user = row['MemberID']
    album = row['DiscussionID']
    user_sims = {}
    
    if user not in pivot_df.columns:
        y_pred.append(int(np.round(X_train['WeightedInstrumentalnessRating'].mean(),0)))
        continue
    if album not in pivot_df.index:
        y_pred.append(int(np.round(X_train['WeightedInstrumentalnessRating'].mean(),0)))
        continue

    for member_id in pivot_df.columns:
        if member_id != user:
            user_sims[member_id] = cosine_similarity(pivot_df[user], pivot_df[member_id])

    rated_users = []
    for i in pivot_df.columns:
        if pivot_df[i][album] != 0 and i != user:
            rated_users.append(i)
    rated_user_sims = []
    for u in rated_users:
        rated_user_sims.append(user_sims[u])

    top5_rated_users = []
    top5_sims = []
    sorted_pairs = sorted(zip(rated_users, rated_user_sims), key=lambda x: x[1], reverse=True)
    for top5_user, sim in sorted_pairs[:5]:
        top5_rated_users.append(top5_user)
        top5_sims.append(sim)
    normalized_top5_sims = [x * sum(top5_sims) for x in normalized_top5_sims]

    pred_r = 0
    ind = 0
    # # predict the rating with the weighted avg
    for u in top5_rated_users:
        pred_r += normalized_top5_sims[ind] * pivot_df[u][album]
        ind += 1

    y_pred.append(int(np.round(pred_r,0)))
    
y_pred = np.array(y_pred)
rmse = np.sqrt(np.mean((y_pred - y_test)**2))
print("RMSE:", rmse)

  retValue = dot_product / (norm_vec1 * norm_vec2) if not math.isnan(dot_product / (norm_vec1 * norm_vec2)) else 0


RMSE: 2.236919177526273


# Your Turn
Fill out the the database with audio features (watch out for 429 errors!) and run some of the filtering techniques we've covered on it.

Content filtering is a great fit here

also be cognisant of the fact that audio features are numerical values, and thus can be leveraged in weighted sums, numerical analysis, and any number of data science techniques.

and like mentioned in the content filtering slides, feel free to go above and beyond with any external resoruces (like RateYourMusic, AlbumOfTheYear, etc.) you'd like!

In [44]:
# TODO: integrate Spotify with your filtering code
discussions_df['length'] = 0
for ind in range(len(discussions_df)):
    try:
        results = spotify.album_tracks(discussions_df['SpotifyID'][ind]) # getting tracks from current album via SpotiPy

        # filtering track ids into list
        ids = []
        for track in results['items']:
            ids.append(track['id'])
        numTracks = len(ids)
        features = spotify.audio_features(ids)
        for feature in features:
            # calculating and storing average in df
        
            discussions_df['length'][ind] += round(feature['duration_ms'] / 60000, 2)
    except:
        pass
    
lengthy = discussions_df[['DiscussionID','AlbumName', 'length', 'AvgRating']]
lengthy.set_index('AlbumName')
lengthy = lengthy.drop_duplicates(subset=['AlbumName'])
lengthy = lengthy[lengthy['length'] != 0]
lengthy = lengthy[lengthy['AvgRating'] != 0]
lengthyRating = lengthy.sort_values('AvgRating', ascending= False)
lengthyLength = lengthy.sort_values('length', ascending= False)
lengthyLength = lengthyLength.drop_duplicates(subset=['AlbumName'])
display(lengthyRating.iloc[:10])



X = ratings_df.drop(columns=['Rating'])
y = ratings_df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train['Rating'] = y_train

merged_df = pd.merge(lengthy, X_train, on='DiscussionID')
display(merged_df[["DiscussionID", "AlbumName", "length", "Rating"]])
y_pred = []
for idx, row in X_test.iterrows():
    # Get the decade of the current album
    album_decade = discussions_df.loc[discussions_df['DiscussionID'] == row['DiscussionID'], 'length'].iloc[0]
    
    # Filter merged_df to only include discussions attended by the user and in the same decade
    smaller_df = merged_df[(merged_df['MemberID'] == row['MemberID']) & (merged_df['length'] == album_decade)]
    
    # Calculate the average rating for discussions in the same decade
    avg_rating = smaller_df['Rating'].mean() if not smaller_df.empty else X_train['Rating'].mean()
    y_pred.append(int(np.round(avg_rating, 0)))

# Calculate RMSE
y_pred = np.array(y_pred)
rmse = np.sqrt(np.mean((y_pred - y_test)**2))
print("RMSE:", rmse)
