In [1]:
import itertools
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

## Extract JSONs to CSVs

In [None]:
def clean_data(raw, z_threshold=2.5, min_listen_time=20):
    """
    Drops entries with abnormal listening_time (z-score too high or time too low).
    Also removes entries with unknown origin.

    The input frame is never modified: rows are selected with a positional boolean
    mask, so the function is idempotent and behaves correctly even when the index
    of ``raw`` contains duplicate labels.

    :param raw: raw data to be processed; must have "listening_time" and "origin" columns
    :param z_threshold: maximum absolute z-score of listening_time allowed in the dataset;
        a value <= 0 disables the z-score filter
    :param min_listen_time: entries where the user has listened for less than this (s) are dropped
    :return: clean dataset (pd.DataFrame), a filtered copy of ``raw``
    """
    listening_time = raw["listening_time"]
    # Start by keeping every row, then knock out rows filter by filter.
    keep = np.ones(len(raw), dtype=bool)

    if z_threshold > 0:
        print("Computing z-score...")
        # The z-score is computed over the full, unfiltered column.
        z = np.abs(np.asarray(stats.zscore(listening_time)))
        # Written as "not above" so that NaN z-scores are kept rather than dropped.
        keep &= ~(z > z_threshold)

    print("Cleaning...")
    # Instant-skips: listening_time below the minimum (NaN compares False, so it is kept).
    keep &= ~(listening_time < min_listen_time).to_numpy()
    # Entries whose origin could not be determined.
    keep &= (raw["origin"] != "unknown").to_numpy()

    # Boolean-array indexing is positional and returns a new frame.
    return raw[keep]

In [None]:
# New dataset: load one raw JSON-lines streams file, clean it, and export a lighter CSV (kept commented out).
# test = pd.read_json("../../data/spring2020/cmb_2020-06-19_streams_12.json", lines=True)
# test = clean_data(test, z_threshold=0)
# Lighter csv export
# test.drop(["album_id", "listen_type", "d", "ts_listen", "listening_time"], axis=1).rename(columns={"anon_user_id": "user_id", "media_id": "sng_id"}).to_csv("../../data/spring2020/dec_simple.csv", index=False)

## Combinations

In [3]:
# Combine the twelve monthly "simple" CSV exports into a single file.
months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
files = [f"../../data/spring2020/{m}_simple.csv" for m in months]
# Read every month first and concatenate once: calling pd.concat inside a loop
# re-copies the accumulated frame on each iteration. Iterating over `files`
# also keeps the number of reads in sync with `months`.
# Row labels are left as read from each file (no ignore_index), so they repeat across months.
data = pd.concat([pd.read_csv(path) for path in files])

# index=False: the repeated per-file row labels are not written out.
data.to_csv("../../data/spring2020/all_simple.csv", index=False)
data

Unnamed: 0,user_id,sng_id,origin,artist_id
0,49d2e98bcb58048af766d54872159ebb9849973f,675801,page_artist,611
1,948e06ab49e2af86f67afd19778060049d770a13,6626876,page_artist,611
2,753b6c11ec65e8fb293e9f080a2494eb10f82f81,4288159,page_artist,611
3,948e06ab49e2af86f67afd19778060049d770a13,368596071,page_artist,611
4,a643a25dfac987b543948c02e9de9872414e2fa4,352060551,page_artist,891
...,...,...,...,...
3907320,2707be884a98f7daf233ef9e814f4a7a72cc5d6f,124603268,page_playlist,246791
3907321,2707be884a98f7daf233ef9e814f4a7a72cc5d6f,467988092,page_playlist,4907510
3907322,2707be884a98f7daf233ef9e814f4a7a72cc5d6f,654827842,page_playlist,4907510
3907323,2707be884a98f7daf233ef9e814f4a7a72cc5d6f,356902671,page_playlist,616943
