In [22]:
import numpy as np
import pandas as pd
import json
import os

import matplotlib.pyplot as plt
import seaborn as sns

### Convert JSONL files to JSON format

In [38]:
def get_list_of_jsons(filepath):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document.

    Parameters
    ----------
    filepath : str or path-like
        Path of the JSON Lines file to read.

    Returns
    -------
    list
        The parsed value of every non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform default.
    with open(filepath, "r", encoding="utf-8") as f:
        # Iterate the handle lazily instead of materialising every line with
        # readlines(), and skip blank lines (e.g. a trailing empty line) that
        # would otherwise make json.loads raise.
        return [json.loads(line) for line in f if line.strip()]

def save_json(list_of_jsons, file_name):
    """Serialise ``list_of_jsons`` as a single JSON document in ``file_name``.

    The target file is created if it does not exist and truncated otherwise.
    Nothing is returned.
    """
    with open(file_name, "w+") as out_file:
        out_file.write(json.dumps(list_of_jsons))

In [39]:
# Convert every JSON Lines file in ../data/ into a single JSON array file
# stored next to it (foo.jsonl -> foo.json).
for file_name in os.listdir("../data/"):
    # Only *.jsonl inputs are converted. This skips the *.json files written by
    # a previous run (and any unrelated entries), so the cell is safe to re-run.
    if not file_name.endswith(".jsonl"):
        continue
    file_path = os.path.join("../data/", file_name)
    data = get_list_of_jsons(file_path)
    # Swap the extension explicitly rather than chopping the last character.
    new_file_path = os.path.splitext(file_path)[0] + ".json"
    save_json(data, new_file_path)
    print(file_path)

../data/artists.jsonl
../data/sessions.jsonl
../data/tracks.jsonl
../data/track_storage.jsonl
../data/users.jsonl


### Read data

In [161]:
# Load the converted artists file into a DataFrame.
artists = pd.read_json("../data/artists.json")
# The remaining tables are not loaded here; their loads are left disabled.
# sessions = pd.read_json("../data/sessions.json")
# tracks = pd.read_json("../data/tracks.json")
# track_storage = pd.read_json("../data/track_storage.json")
# users = pd.read_json("../data/users.json")

### Search for NaNs or wrong values

In [162]:
# Stringify every id and tally the resulting lengths to spot malformed ids.
artists["id"].map(str).map(len).value_counts()

22    1596
2       71
Name: id, dtype: int64

In [163]:
# Most frequent id values first; a count above 1 flags a repeated id.
artists["id"].value_counts().head()

-1                        71
1fa0cOhromAZdq2xRA4vv8     1
6W5uA6CNMf3hd2j4a2XWCx     1
20JZFwl6HVl6yg8a4H3ZqK     1
1vppDmG3i5sXf3DJzrK4T1     1
Name: id, dtype: int64

In [164]:
# An id of -1 suggests a wrong value (the other ids are 22-character
# strings), so mark it as missing.
# Assign the result back instead of calling replace(..., inplace=True) on
# `artists.id`: that chained in-place call acts on a temporary Series and does
# not update `artists` under pandas Copy-on-Write.
artists["id"] = artists["id"].replace(-1, np.nan)

In [165]:
# True if at least one artist has a missing name.
artists["name"].isnull().any()

False

In [166]:
# Keep only the rows that have no missing value in any column.
artists = artists.dropna(axis=0, how="any")

In [167]:
# Preview the first rows of the frame after the rows with missing values were removed.
artists.head()

Unnamed: 0,id,name,genres
1,0xRXCcSX89eobfrshSVdyu,MEDUZA,"[dance pop, edm, pop dance, pop house, tropica..."
3,4f7KfxeHq9BiylGmyXepGt,Tanishk Bagchi,"[desi pop, filmi, modern bollywood]"
4,56mfhUDKa1vec6rSLZV5Eg,Jawsh 685,[nz pop]
5,523y9KSneKh6APd1hKxLuF,Master KG,[south african house]
6,25UNJbwGZSQKvz5cPLWlv3,Los Dos Carnales,"[corrido, norteno, nuevo regional mexicano, re..."


### One-hot encode genres

https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list

In [168]:
from sklearn.preprocessing import MultiLabelBinarizer

# Take the list-valued `genres` column out of the frame; it is replaced below
# by one indicator column per distinct genre.
genre_lists = artists.pop('genres')

# sparse_output=True makes fit_transform return a SciPy sparse matrix.
mlb = MultiLabelBinarizer(sparse_output=True)
genre_matrix = mlb.fit_transform(genre_lists)

# Wrap the sparse matrix in a DataFrame aligned on the artists index, with one
# column per genre (mlb.classes_), then attach it to the artists frame.
genre_indicators = pd.DataFrame.sparse.from_spmatrix(
    genre_matrix,
    index=artists.index,
    columns=mlb.classes_,
)
artists = artists.join(genre_indicators)

In [170]:
# Preview the first rows of the frame with the per-genre indicator columns attached.
artists.head()

Unnamed: 0,id,name,a cappella,acid rock,acoustic pop,adult standards,afro dancehall,afrofuturism,afrofuturismo brasileiro,afropop,...,welsh metal,west coast rap,women's music,world,world worship,worship,wrestling,yacht rock,zhongguo feng,zolo
1,0xRXCcSX89eobfrshSVdyu,MEDUZA,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4f7KfxeHq9BiylGmyXepGt,Tanishk Bagchi,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,56mfhUDKa1vec6rSLZV5Eg,Jawsh 685,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,523y9KSneKh6APd1hKxLuF,Master KG,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,25UNJbwGZSQKvz5cPLWlv3,Los Dos Carnales,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
