In [1]:
import math
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import skops.io as sio
from copy import deepcopy
from sklearn.preprocessing import QuantileTransformer

# import os
# print(os.getcwd())

In [2]:
# Seed value used for the random-number setup below.
RND_SEED: int = 12345
# Seed NumPy's legacy global random generator.
np.random.seed(RND_SEED)
# NOTE(review): the value returned by this call is not stored; it only appears as the cell's
# displayed result (a RandomState object). Presumably it does not seed any global state --
# confirm whether this line is needed.
pd.core.common.random_state(RND_SEED)

RandomState(MT19937) at 0x12727F340

In [3]:
# Read the combined 2023 dataset; the first CSV column becomes the index.
df: pd.DataFrame = pd.read_csv(
    "/Users/nattamon/Documents/Spotify_Project_203/Combined-2023.csv",
    encoding="utf-8",
    index_col=[0],
)
tr_feature: str = "streams"


def _streams_in_millions(raw: str) -> float:
    """Convert a digits-only string to millions; any other string becomes NaN."""
    if not raw.isdigit():
        return np.nan
    return float(raw) / 1e6


df["streams"] = df["streams"].astype(str).apply(_streams_in_millions)
# From here on `df` holds only the selected feature column, not the whole table.
df = df[tr_feature]

In [6]:
# Ascending-sorted copy of the non-missing values of `df`.
# np.sort returns a new array instead of sorting in place, so no buffer shared with the
# pandas object is mutated and the statement still works when to_numpy() hands back a
# read-only view (pandas Copy-on-Write).
arr = np.sort(df.dropna().to_numpy())

def percentile(data: np.ndarray, p: float) -> float:
    """Return the ``p``-quantile of ``data``, which must be sorted in ascending order.

    The 1-based rank is ``n * p + 0.5``. A whole-number rank selects that sample directly;
    otherwise the result is linearly interpolated between the two neighbouring samples.

    Args:
        data: One-dimensional, ascending-sorted samples.
        p: Quantile expressed as a fraction in ``[0, 1]``.

    Returns:
        The quantile value. Ranks that fall before the first or after the last sample are
        clamped to the smallest / largest sample.

    Raises:
        ValueError: If ``data`` is empty or ``p`` lies outside ``[0, 1]``.
    """
    n: int = len(data)
    if n == 0:
        raise ValueError("percentile() requires at least one sample")
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"p must be within [0, 1], got {p!r}")
    # Clamp the rank to [1, n]. Without this, a rank below 1 indexes data[-1] (silently
    # wrapping around to the largest sample) and a rank above n indexes past the end.
    x: float = min(max(float(n * p + 0.5), 1.0), float(n))
    if x.is_integer():
        return data[int(x - 1)]
    x1, x2 = math.floor(x), math.ceil(x)
    y1, y2 = data[x1 - 1], data[x2 - 1]
    return np.interp(x=x, xp=[x1, x2], fp=[y1, y2])

# Location and spread of the sorted sample (`std` uses ddof=1).
min_: float = arr.min()
max_: float = arr.max()
std: float = arr.std(ddof=1)
mean: float = arr.mean()

# Quartiles and the inter-quartile range.
q1 = percentile(arr, 0.25)
median = percentile(arr, 0.5)
q3 = percentile(arr, 0.75)
iqr: float = q3 - q1

# Limits sit 1.5 * IQR beyond the quartiles; each fence is the most extreme observation
# that still lies within its limit.
lw_limit: float = q1 - 1.5 * iqr
lw_fence: float = arr[arr >= lw_limit].min()
up_limit: float = q3 + 1.5 * iqr
up_fence: float = arr[arr <= up_limit].max()

# Observations strictly beyond a fence, ignoring values within 1e-6 of the fence itself.
l_outlier = np.array([
    value for value in arr
    if value < lw_fence and not math.isclose(value, lw_fence, abs_tol=1e-6)
])
u_outlier = np.array([
    value for value in arr
    if value > up_fence and not math.isclose(value, up_fence, abs_tol=1e-6)
])
print(l_outlier)
print(u_outlier)

# Clip the sample to the fences and stack the raw / clipped values in long format.
arr_clean = arr.clip(lw_fence, up_fence)
df_out = pd.DataFrame({"raw": arr, "clean": arr_clean}).melt(var_name="stage")

[]
[1472.799873 1479.115056 1479.264469 1481.349984 1553.497987 1555.511105
 1575.467011 1591.223784 1592.909789 1593.270737 1605.224506 1606.986953
 1608.045237 1608.164312 1624.165576 1641.426668 1647.990401 1661.187319
 1687.664027 1690.192927 1692.897992 1695.71202  1699.402402 1714.490998
 1735.441776 1755.214421 1759.567999 1763.363713 1788.326445 1791.00057
 1802.514301 1806.617704 1813.673666 1814.349763 1829.992958 1840.364617
 1858.144199 1887.039593 1897.517891 1929.770265 1947.371785 1953.533826
 1970.673297 2009.094673 2011.464183 2086.124197 2123.309722 2132.335812
 2135.158446 2159.346687 2197.010679 2204.080728 2236.667932 2280.566092
 2282.771485 2288.695111 2303.033973 2322.580122 2355.719893 2420.461338
 2484.812918 2513.188493 2557.975762 2559.529074 2565.529693 2591.224264
 2594.040133 2665.343922 2713.92235  2808.09655  2864.791672 2887.241814
 3562.54389  3703.895074]


In [7]:
# One annotation per summary statistic: an identifier, the y position and the label text.
annots = [
    {"name": name, "y": y, "text": text}
    for name, y, text in (
        ("min", min_, f'Min: {min_:5.2f}'),
        ("lower", lw_fence, f'Lower: {lw_fence:5.2f}'),
        ("q1", q1, f'Q1: {q1:5.2f}'),
        ("mean", mean, f'Mean±σ: {mean:5.2f}±{std:5.2f}'),
        ("median", median, f'Median: {median:5.2f}'),
        ("q3", q3, f'Q3: {q3:5.2f}'),
        ("up", up_fence, f'Upper: {up_fence:5.2f}'),
        ("max", max_, f'Max: {max_:5.2f}'),
    )
]

# Start from the layout Plotly Express generates for a box plot, then add the box trace
# manually so its styling is fully controlled here.
box_layout = px.box(df, y=tr_feature).layout
fig = go.Figure(layout=box_layout)
streams_box = go.Box(
    y=df.to_numpy(),
    name="Streams",
    boxpoints="outliers",
    boxmean=True,
    whiskerwidth=0.75,
    marker=dict(
        color="rgba(90, 188, 110, 127)",
        line=dict(color="red", width=3),
    ),
)
fig.add_trace(streams_box)

# Settings shared by every label; the background reuses the box trace's marker colour.
common_annotation_params = {
    "font": {"size": 18, "color": "#ffffff"},
    "showarrow": False,
    "bgcolor": fig.data[0].marker.color,
    "xref": "x",  # position relative to the x axis of the plot
    "x": 0.275,
    "xanchor": "left",  # by default a label starts at x and extends to the right
}
# Labels listed here are mirrored to x = -0.275 and right-anchored instead.
anchor_right = {"min", "q1", "mean"}
for annot in annots:
    annot = {**annot, **common_annotation_params}
    if annot["name"] in anchor_right:
        annot.update(x=-0.275, xanchor="right")
    fig.add_annotation(annot)

fig.update_layout(
    showlegend=False,
    yaxis={"title_text": "Million Views"},
    font={"size": 18},
)
fig.show()
# fig.write_image("./../../images/2023/box-stream.svg", width=1366, height=768, scale=1.0)
# fig.write_image("./../../images/2023/box-stream.png", width=1366, height=768, scale=1.0)

In [8]:
# Keep only the observations within [lw_limit, up_limit] (both bounds inclusive); missing
# values fail the comparison and are therefore dropped as well.
df = df[df.between(lw_limit, up_limit)]

In [9]:
def plot_hist(file_name: str, title: str) -> None:
    """Show a density histogram of the global ``df`` with a line through the bin densities.

    The bin layout is read from the figure Plotly Express builds for ``df``. The same edges
    are then passed to ``np.histogram`` so the "PDF" line is evaluated on exactly the bins
    the bars use. The densities are printed before the figure is shown.

    Args:
        file_name: Base name for the exported image; only referenced by the commented-out
            ``write_image`` calls at the end of the function.
        title: Label for the x axis.
    """
    fig = px.histogram(df, x=tr_feature, histnorm="probability density")
    # The "full" figure carries the attribute values Plotly computed itself, which is
    # where the automatically chosen bin start / end / size are read from.
    fig_data = fig.full_figure_for_development(warn=False)
    trace = fig_data.data[0]
    xbins = trace.xbins
    st, sz = xbins["start"], xbins["size"]
    n_edge = math.ceil((xbins["end"] - st) / sz)
    ed = st + n_edge * sz
    # Build the edges from an integer count in float64: a float-stepped np.arange can yield
    # one edge too many through rounding, and float32 edges lose precision against the data.
    np_bins = st + sz * np.arange(n_edge + 1, dtype=np.float64)
    pdf, _ = np.histogram(trace.x, bins=np_bins, density=True)
    print(pdf)

    # Midpoint of every bin, used as the x position of the density line.
    bin_center = (np_bins[:-1] + np_bins[1:]) / 2
    fig = go.Figure(data=[
        go.Histogram(
            x=trace.x,
            xbins={
                "start": st,
                "end": ed,
                "size": sz,
            },
            histnorm="probability density",
            marker={
                "color": "rgba(79, 189, 156, 127)",
                "line": {
                    "color": "rgba(16, 105, 103, 255)",
                    "width": 1,
                },
            },
            name="Histogram"
        ),
        go.Scatter(
            x=bin_center,
            y=pdf,
            name="PDF",
            mode="lines",
            marker={
                "color": "rgba(255, 10, 10, 255)",
            }
        ),
    ], layout=fig.layout)  # reuse the layout of the Plotly Express figure
    fig.update_layout(dict(
        xaxis_title=title,
        yaxis_title="Probability Density",
        font=dict(
            size=18
        )
    ))
    fig.show()
    # fig.write_image(f"./../../images/2023/{file_name}.svg", width=1366, height=768, scale=1.0)
    # fig.write_image(f"./../../images/2023/{file_name}.png", width=1366, height=768, scale=1.0)
# Histogram of the values currently held in `df` (after the limit-based filter above).
plot_hist("hist_stream", "Streams (Million Views)")

[1.82232346e-04 2.09567198e-03 2.32346241e-03 2.16400911e-03
 2.07289294e-03 1.57175399e-03 1.34396355e-03 1.11617312e-03
 7.74487472e-04 6.83371298e-04 5.69476082e-04 4.32801822e-04
 6.37813212e-04 3.41685649e-04 5.01138952e-04 2.27790433e-04
 3.87243736e-04 2.96127563e-04 2.27790433e-04 1.82232346e-04
 1.82232346e-04 1.82232346e-04 1.82232346e-04 2.50569476e-04
 2.50569476e-04 2.27790433e-04 1.82232346e-04 1.59453303e-04
 6.83371298e-05 1.82232346e-04]


In [12]:
# Map the values in `df` to a normal output distribution through their empirical quantiles.
arr = df.to_numpy()[:, np.newaxis]  # column vector: one row per sample, one feature
# QuantileTransformer defaults to 1000 quantiles and warns when there are fewer samples,
# falling back to n_samples. Capping the value up front yields the same fit without the
# warning.
scaler = QuantileTransformer(
    n_quantiles=max(1, min(1000, arr.shape[0])),
    output_distribution="normal",
    random_state=RND_SEED,
)
# Overwrite the values in place so `df` keeps its index.
df[:] = scaler.fit_transform(arr)[:, 0]
# sio.dump(scaler, "./../../scalers/2023/streams.skops")
plot_hist("histnorm_stream", "Normalized Streams")

[0.00569475 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.00569475
 0.01138951 0.01138951 0.02277901 0.03416852 0.05694753 0.07972654
 0.1082003  0.14806357 0.19362159 0.24487436 0.29043239 0.33029566
 0.37015892 0.38724318 0.39863269 0.38724318 0.37015892 0.33029566
 0.29043239 0.24487436 0.19362159 0.14806357 0.1082003  0.07972654
 0.05694753 0.03416852 0.02277901 0.01138951 0.01138951 0.00569475
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.00569475 0.        ]



n_quantiles (1000) is greater than the total number of samples (878). n_quantiles is set to n_samples.

