In [13]:
import os
import json
import pandas as pd
import numpy as np

import plotly.express as px

## Import the spotify data from the JSON file

__[Interesting discussion about the difference between json.load and json.loads](https://stackoverflow.com/questions/39719689/what-is-the-difference-between-json-load-and-json-loads-functions)__

In [14]:
#Initially, didn't know how to iterate through the files so did it with only one.

#with open("MyData/StreamingHistory1.json") as json_file:
    #streamingHistory1 = json.load(json_file)

In [15]:
#Load every streaming history file of the Spotify export and collect them in one list.
#The files should all have the same structure, so each one is loaded the same way.

#First, we initiate an empty list; it receives one entry (the list returned by json.load) per file
streamingHistory_list = []
#Folder that holds the export, relative to the notebook's working directory.
#The same location is now used both to list and to open the files
#(previously an absolute, machine-specific path was listed while a relative one was opened).
json_files_location = 'MyData'
#os.listdir returns the names in arbitrary order, so we sort them to load the files deterministically
for json_file in sorted(os.listdir(json_files_location)):
    #We don't want to check all our files so we restrict the search to the StreamingHistory JSON files
    if 'StreamingHistory' in json_file and json_file.endswith('.json'):
        #"with" ensures that each file we iterate through is closed, even if reading it fails.
        #"open" returns a file object; the encoding is given explicitly so that non-ASCII
        #artist/track names do not depend on the platform default (JSON text is normally UTF-8).
        with open(os.path.join(json_files_location, json_file), encoding='utf-8') as f:
            #the load function returns a list, we store it and add it to our initially empty list
            streamingHistory = json.load(f)
            streamingHistory_list.append(streamingHistory)

#NOTE: every matching file IS added to streamingHistory_list (one entry per file).
#The variable streamingHistory, however, only holds the content of the file loaded last,
#so anything built from streamingHistory alone sees a single file.

In [16]:
#Display the current working directory; relative paths (such as the MyData folder) are resolved against it
os.getcwd()

'/Users/kogimandias/DataScience/Spotifylex'

## Working with DataFrames

We have a list of dictionaries, but in order to analyze the data, let's transform it into a DataFrame.

In [17]:
#Build the DataFrame from ALL loaded files, not only from the file read last.
#streamingHistory_list holds one list of records per file, so we flatten it into
#a single list of records before handing it to pandas (one row per record).
all_streams = [stream for file_streams in streamingHistory_list for stream in file_streams]
streamingHistory = pd.DataFrame(all_streams)

In [18]:
#Inspect the dtype that pandas inferred for each column
streamingHistory.dtypes

endTime       object
artistName    object
trackName     object
msPlayed       int64
dtype: object

Three of the four features are of type *object*, but from domain knowledge we know that *trackName* and *artistName* should be **strings** while *endTime* should be **datetime**.

In [19]:
#Give each column its proper dtype: the two name columns become pandas strings,
#and endTime is parsed into datetimes.
streamingHistory = (
    streamingHistory
    .astype({'artistName': 'string', 'trackName': 'string'})
    .assign(endTime=lambda frame: pd.to_datetime(frame['endTime']))
)


We don't really need the number of milliseconds played for each song; the number of seconds is more indicative.

In [20]:
#Convert the play time from milliseconds to whole seconds: 1 s = 1000 ms.
#(Dividing by 60, as before, yields neither seconds nor minutes.)
#NOTE: any cut-off on sPlayed that was tuned against the old ms/60 values is ~16.7x
#too large for real seconds and has to be rescaled.
#The guard makes the cell safe to re-run once msPlayed has already been replaced.
if 'msPlayed' in streamingHistory.columns:
    streamingHistory['sPlayed'] = (streamingHistory['msPlayed'] / 1000).round().astype('int')
    #msPlayed is now redundant, so we drop it
    streamingHistory = streamingHistory.drop(columns='msPlayed')

# Exploratory Data Analysis

In [21]:
#Total play time per track, kept as a one-column DataFrame.
#Only sPlayed is selected for the sum: since pandas 2.0, GroupBy.sum no longer drops
#non-numeric columns by default, so summing the whole frame would also try to add up
#artistName and endTime.
#NOTE(review): grouping by trackName alone merges same-titled tracks from different
#artists - confirm that this is intended.
songs_played = streamingHistory.groupby('trackName')[['sPlayed']].sum()

In [22]:
#to transform a univariate dataframe into a series, we can use the "squeeze()" function.
#We squeeze along the column axis only: a bare squeeze() would also collapse the rows and
#return a single scalar when there is just one track. The isinstance check lets the cell
#be re-run after songs_played has already become a Series.
if isinstance(songs_played, pd.DataFrame):
    songs_played = songs_played.squeeze(axis='columns')

In [23]:
#Check the dtype of the summed play times
songs_played.dtype

dtype('int64')

In [24]:
#We don't want to plot every single song, only the most played.
#A rank-based cut-off (top N) is used instead of a fixed play-time threshold: it does not
#depend on the unit of sPlayed and always yields a readable, non-empty chart.
TOP_N_SONGS = 25
#nlargest picks the N highest totals; sort_values puts them in ascending order for the plot
most_played_songs = songs_played.nlargest(TOP_N_SONGS).sort_values()
test = px.bar(most_played_songs)
#A figure should stand alone: give it a title and axis labels, and hide the
#single-entry legend that plotly adds for a Series
test.update_layout(
    title=f"Top {len(most_played_songs)} most played songs",
    xaxis_title="Track",
    yaxis_title="Total time played (sPlayed)",
    showlegend=False,
)
test.show()