In [None]:
import json
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Wrapped 2023 export, relative to the notebook's working directory.
wrapped_file_path = 'datasets/Wrapped2023.json'

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding (JSON interchange files are UTF-8).
with open(wrapped_file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Show only the top-level section names: dumping the whole payload bloats the
# notebook and leaves personal listening data in the saved output.
list(data)

# Top Artists

In [None]:
def process_top_artists(data):
    """Flatten the nested top-artist stream counts into a long-format DataFrame.

    Walks ``data['topArtists']['topArtists']`` and emits one row for every
    (artist, month, biweekly period) combination found there.

    Parameters
    ----------
    data : dict
        Parsed Wrapped payload. Each artist entry must provide ``artistUri``
        and ``monthlyStreams``; each month must provide ``month`` and
        ``biweeklyStreams``; each period must provide ``period`` and
        ``numStreams``.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist_uri``, ``month``, ``period`` and ``streams``, in that
        order. The columns are present even when there are no rows, so
        downstream ``groupby('artist_uri')`` calls keep working on empty input.

    Raises
    ------
    KeyError
        If any of the keys listed above is missing from a visited entry.
    """
    columns = ['artist_uri', 'month', 'period', 'streams']
    rows = [
        {
            'artist_uri': artist['artistUri'],
            'month': month['month'],
            'period': period['period'],
            'streams': period['numStreams'],
        }
        for artist in data['topArtists']['topArtists']
        for month in artist['monthlyStreams']
        for period in month['biweeklyStreams']
    ]
    # Passing `columns` fixes the schema; without it an empty `rows` list would
    # produce a frame with no columns at all.
    return pd.DataFrame(rows, columns=columns)


# Flatten the loaded payload into one row per artist / month / biweekly period.
df_artists = process_top_artists(data)

In [None]:
# Preview the first rows of the flattened frame.
df_artists.head()

In [None]:
# Total streams per artist, most-streamed first.
artist_total_streams = (
    df_artists.groupby('artist_uri')['streams']
    .sum()
    .sort_values(ascending=False)
)

# Take the top five once so the saved list and the chart cannot disagree.
top_5_streams = artist_total_streams.head(5)
top_5_artists = top_5_streams.index.tolist()

# Save top 5 artist URIs to a file
with open('datasets/top_5_artists.json', 'w') as f:
    json.dump(top_5_artists, f)

# Bar chart of the same five artists; the title states the number actually drawn.
fig, ax = plt.subplots(figsize=(12, 6))
top_5_streams.plot(kind='bar', ax=ax)
ax.set_title('Total Streams for Top 5 Artists')
ax.set_xlabel('Artist URI')
ax.set_ylabel('Total Streams')
# Tilt the tick labels and right-align them so each one ends under its bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Month-by-month stream totals, drawn as one line per top-5 artist.
fig, ax = plt.subplots(figsize=(12, 6))
for artist_uri in top_5_artists:
    rows_for_artist = df_artists.loc[df_artists['artist_uri'] == artist_uri]
    monthly_streams = rows_for_artist.groupby('month')['streams'].sum()
    ax.plot(
        monthly_streams.index,
        monthly_streams.values,
        marker='o',
        label=artist_uri,
    )
ax.set_title('Monthly Streams for Top 5 Artists')
ax.set_xlabel('Month')
ax.set_ylabel('Number of Streams')
# The legend is anchored just outside the right edge of the axes.
ax.legend(title='Artist URI', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Share of streams among the five most-streamed artists.
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    artist_total_streams.head(),  # head() defaults to the first five rows
    labels=top_5_artists,
    autopct='%1.1f%%',
    startangle=90,
)
ax.set_title('Stream Distribution Among Top 5 Artists')
ax.axis('equal')  # equal axis scaling so the pie is drawn as a circle
plt.show()

In [None]:
top_artists_summary = data['topArtists']
# The source value is in milliseconds; 60000 ms make one minute.
total_listening_time_in_mins = top_artists_summary['topArtistMilliseconds'] / 60000

# Ranked list of the five most-streamed artists.
print("Top 5 Artists by Total Streams:")
top_five = artist_total_streams.head()
for rank, (artist_uri, stream_count) in enumerate(top_five.items(), start=1):
    print(f"{rank}. {artist_uri}: {stream_count} streams")

# Summary figures read straight from the payload.
print("\nTotal Number of Unique Artists:", top_artists_summary['numUniqueArtists'])
print(f"Total Listening Time for Top Artists: {round(total_listening_time_in_mins)} minutes")
print("Top Artist Fan Percentage:", top_artists_summary['topArtistFanPercentage'])

In [None]:
# Report, for each top-5 artist, the month with the highest summed stream count.
for artist_uri in top_5_artists:
    is_this_artist = df_artists['artist_uri'] == artist_uri
    streams_by_month = df_artists.loc[is_this_artist].groupby('month')['streams'].sum()
    best_month, best_count = streams_by_month.idxmax(), streams_by_month.max()
    print(f"\nPeak month for {artist_uri}: Month {best_month} with {best_count} streams")

# Top Genres, Podcasts, Tracks

In [None]:
# Genre names on one comma-separated line, followed by the reported genre count.
print("Top Genres:")
genre_info = data['topGenres']
genre_line = ", ".join(genre_info['topGenres'])
print(genre_line)
print(f"Total number of genres: {genre_info['numberGenres']}")

In [None]:
# Podcast URIs, one per line, followed by the listening totals.
print("\nTop Podcasts:")
podcast_info = data['topPodcasts']
for podcast_uri in podcast_info['topPodcastsUri']:
    print(podcast_uri)

podcast_minutes = podcast_info['totalPodcastMilliseconds'] / 60000  # ms -> minutes
print(f"Total podcast listening time: {podcast_minutes:.2f} minutes")

# NOTE(review): the * 100 assumes the stored value is a 0-1 fraction; confirm against the export.
podcast_pct = podcast_info['topPodcastPercentage'] * 100
print(f"Podcast listening percentage: {podcast_pct:.2f}%")

In [None]:
# Pull the top-track URIs out of the payload and save them as JSON.
track_info = data['topTracks']
top_tracks_uris = track_info['topTracks']
with open('datasets/top_tracks_uris.json', 'w') as out_file:
    json.dump(top_tracks_uris, out_file)

# Track URIs, one per line, followed by the summary fields.
print("\nTop Tracks:")
for track_uri in top_tracks_uris:
    print(track_uri)
play_count = track_info['topTrackPlayCount']
print(f"Top track play count: {play_count}")
distinct_count = track_info['distinctTracksPlayed']
print(f"Distinct tracks played: {distinct_count}")
first_played = track_info['topTrackFirstPlayedDate']
print(f"Top track first played date: {first_played}")

# Yearly Metrics

In [None]:
# Year-level summary figures from the payload's `yearlyMetrics` section.
print("\nYearly Metrics:")
yearly = data['yearlyMetrics']

hours_listened = yearly['totalMsListened'] / 3600000  # 3,600,000 ms per hour
print(f"Total listening time: {hours_listened:.2f} hours")

busiest_day = yearly['mostListenedDay']
print(f"Most listened day: {busiest_day}")

busiest_day_minutes = yearly['mostListenedDayMinutes']
print(f"Minutes listened on most active day: {busiest_day_minutes:.2f}")

worldwide_percentile = yearly['percentGreaterThanWorldwideUsers']
print(f"Percentile among users: {worldwide_percentile:.2f}%")