### Creating Baseline Model

In [2]:
import pandas as pd
import json

#import missingno as msno

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [3]:
# Importing dataset

# Load JSON data from file.
# The encoding is pinned explicitly (assumes the file is UTF-8, as JSON interchange
# files normally are) so that non-ASCII track, artist and album names decode the same
# way on every platform instead of depending on the locale default used by open().
with open('data/challenge_set.json', 'r', encoding='utf-8') as file: # Replace with local dataset path
    data = json.load(file)

# Initialize an empty list to collect all track data (one dict per playlist/track pair)
all_tracks = []

# Loop through each playlist in the dataset.
# A playlist whose 'tracks' list is empty adds no records, so it will be absent
# from the resulting DataFrame.
for playlist in data['playlists']:
    for track in playlist['tracks']:
        # Add playlist-level information to each track record.
        # 'name' is read with a fallback of 'Unknown'; every other key is required
        # and raises KeyError if it is missing.
        track_info = {
            'playlist_name': playlist.get('name', 'Unknown'),
            'playlist_pid': playlist['pid'],
            'playlist_num_tracks': playlist['num_tracks'],
            'track_pos': track['pos'],
            'artist_name': track['artist_name'],
            'track_uri': track['track_uri'],
            'artist_uri': track['artist_uri'],
            'track_name': track['track_name'],
            'album_uri': track['album_uri'],
            'duration_ms': track['duration_ms'],
            'album_name': track['album_name']
        }
        all_tracks.append(track_info)

# Convert the list of track dictionaries to a DataFrame (one row per playlist/track pair).
df_spotify = pd.DataFrame(all_tracks)
# `spotify` is a second name for the same DataFrame object, not a copy.
spotify = df_spotify

# Print the (rows, columns) shape of the DataFrame to verify the load
print(df_spotify.shape)

(281000, 11)


In [4]:
# Rank every track URI by how many rows it appears in (most frequent first),
# then keep the URIs of the first 500 entries as a plain list.
track_uri_counts = spotify['track_uri'].value_counts()
top_500_songs = track_uri_counts.index[:500].tolist()

# Persist the ranked list as a single-column CSV so it can be reused later
top_500_frame = pd.DataFrame({'track_uri': top_500_songs})
top_500_frame.to_csv('top_500_songs.csv', index=False)

#### Recommending top 500 songs to each user playlist

In [7]:
# Build a lookup of playlist_pid -> list of the track URIs found in that playlist
track_uris_by_pid = spotify.groupby('playlist_pid')['track_uri']
playlists = track_uris_by_pid.apply(list).to_dict()

def recommend_songs(playlist_songs, recommendations):
    """Return the recommended songs that the playlist does not already contain.

    Args:
        playlist_songs: Iterable of hashable song identifiers already in the playlist.
        recommendations: Iterable of candidate songs, in ranked order.

    Returns:
        list: The candidates that are absent from ``playlist_songs``, in their
        original order. Repeated candidates are kept as-is; only songs already
        in the playlist are filtered out.
    """
    # Build the set once so each membership check is O(1) instead of a scan
    # of the whole playlist for every candidate.
    already_in_playlist = set(playlist_songs)
    return [song for song in recommendations if song not in already_in_playlist]

# For every playlist, keep only the top-500 songs it does not already contain
recommendations = {
    pid: recommend_songs(existing_uris, top_500_songs)
    for pid, existing_uris in playlists.items()
}

# Flatten the mapping into (playlist, song) pairs: one row per recommended song
recommendation_rows = [
    (pid, suggested_uri)
    for pid, suggested_uris in recommendations.items()
    for suggested_uri in suggested_uris
]
recommendations_df = pd.DataFrame(
    recommendation_rows,
    columns=['playlist_pid', 'recommended_song'],
)

# Write the long-format recommendations table to disk without the index column
recommendations_df.to_csv('playlist_recommendations.csv', index=False)