In [1]:
from pathlib import Path
from dataclasses import dataclass, asdict

@dataclass
class CFG:
    train_path: Path = Path("../data/train.csv")
    test_path: Path = Path("../data/test.csv")
    sub_path: Path = Path("../data/sample_submission.csv")

    num_fold: int = 5
    dev_mode: bool = False

    # Model parameters
    n_iter: int = 10000
    max_depth: int = -1
    num_leaves: int = 1024
    colsample_bytree: float = 0.7
    learning_rate: float = 0.02

    objective: str = 'l2'
    metric: str = 'rmse'
    verbosity: int = -1
    max_bin: int = 1024
    
    random_state: int = 42
    shuffle: bool = True
    encoded_columns_start: int = -91
    log_eval: int = 100
    early_stopping: int = 200
    
cfg = CFG() 
asdict(cfg)

{'train_path': PosixPath('../data/train.csv'),
 'test_path': PosixPath('../data/test.csv'),
 'sub_path': PosixPath('../data/sample_submission.csv'),
 'num_fold': 5,
 'dev_mode': False,
 'n_iter': 10000,
 'max_depth': -1,
 'num_leaves': 1024,
 'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'objective': 'l2',
 'metric': 'rmse',
 'verbosity': -1,
 'max_bin': 1024,
 'random_state': 42,
 'shuffle': True,
 'encoded_columns_start': -91,
 'log_eval': 100,
 'early_stopping': 200}

In [None]:
import sys

sys.path.append("../src")

import altair as alt

alt.renderers.enable("jupyter", offline=True)
alt.data_transformers.disable_max_rows()

from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from config import cfg
from data.data_process import add_fold, preprocess, feature_eng
# from data.simple_feature_eng import 

# warnings.filterwarnings("ignore")
# warnings.simplefilter("ignore")

def calculate_rmse(actual, predicted):
    squared_diff = (actual - predicted) ** 2
    mean_squared_diff = squared_diff.mean()
    rmse = np.sqrt(mean_squared_diff)
    return rmse

cfg.train_path = Path("../data/train.csv")
cfg.test_path = Path("../data/test.csv")
cfg.pltpd_path = Path("../data/podcast_dataset.csv")



df_train = pl.read_csv(cfg.train_path)
df_train = df_train.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads", "Guest_Popularity_percentage"])

# df_train = df_train.drop("id")
df_train = add_fold(df_train)
df_train = preprocess(df_train)
df_train = feature_eng(df_train, df_train)

df_pltpd = pl.read_csv(cfg.pltpd_path)
df_pltpd = df_pltpd.drop_nulls(subset=["Listening_Time_minutes"])
df_pltpd = df_pltpd.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads", "Listening_Time_minutes", "Guest_Popularity_percentage"])
df_pltpd = df_pltpd.with_columns(pl.col("Number_of_Ads").cast(pl.Float64))
df_pltpd = add_fold(df_pltpd)
df_pltpd = preprocess(df_pltpd, df_train)
df_pltpd = df_pltpd.with_columns(pl.Series(range(1_000_000, 1_000_000 + len(df_pltpd))).alias("id"))
df_pltpd = feature_eng(df_pltpd, df_train)
# df_pltpd

df_test = pl.read_csv(cfg.test_path)
df_test = preprocess(df_test, df_train)
df_test = feature_eng(df_test, df_train)

# df_train = df_train.drop(["id", "fold"])
# df_pltpd = df_pltpd.drop(["id", "fold"])

df = df_train.clone()
df


id,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,fold,Episode_Num,Episode_Length_minutes_NaN,Guest_Popularity_percentage_NaN,Day_sin,Day_cos,Day_sin2,Day_cos2,Time_sin,Time_cos,Time_sin2,Time_cos2,Length_per_Ads,Length_per_Host,Length_per_Guest,ELen_Int,ELen_Dec,HPperc_Int,HPperc_Dec,Is_Positive_Sentiment,Sentiment_Multiplier,Episode_Length_squared,Episode_Length_squared2,Long_Term_Cycle_Sin,Long_Term_Cycle_Cos,Expected_Listening_Time_Sentiment,Diff_Squared
i64,cat,f64,cat,f64,cat,cat,f64,f64,cat,f64,cat,cat,cat,cat,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i8,f64,f64,f64,f64,f64,f64,f64
1,"""1""",119.8,"""1""",66.95,"""5""","""14""",75.95,2.0,"""0""",88.01241,"""Joke Junction_Episode 26_66.95…","""26""","""false""","""false""",-0.974928,-0.222521,0.433884,-0.900969,8.5725e-16,-1.0,0.866025,0.5,39.933333,1.763061,1.556855,119.0,0.8,66.0,0.95,0,0.717,14352.04,1.7194e6,0.998027,-0.062791,85.8966,3701.5993
2,"""2""",73.9,"""2""",69.97,"""1""","""17""",8.97,0.0,"""0""",44.92531,"""Study Sessions_Episode 16_69.9…","""16""","""false""","""false""",0.781831,0.62349,0.974928,-0.222521,1.0,-7.3541e-16,0.5,-0.866025,73.9,1.041285,7.412237,73.0,0.9,69.0,0.97,0,0.717,5461.21,403583.419,0.844328,0.535827,52.9863,2204.6169
3,"""3""",67.17,"""3""",57.22,"""0""","""10""",78.7,2.0,"""2""",46.27824,"""Digital Digest_Episode 45_57.2…","""45""","""false""","""false""",0.0,1.0,0.0,1.0,6.1232e-16,-1.0,-0.866025,0.5,22.39,1.153727,0.842785,67.0,0.17,57.0,0.22,1,0.75,4511.8089,303058.203813,0.309017,-0.951057,50.3775,648.1558
4,"""4""",110.51,"""4""",80.07,"""0""","""14""",58.68,3.0,"""1""",75.61031,"""Mind & Body_Episode 86_80.07_M…","""86""","""false""","""false""",0.0,1.0,0.0,1.0,8.5725e-16,-1.0,0.866025,0.5,27.6275,1.363143,1.851709,110.0,0.51,80.0,0.07,0,0.717,12212.4601,1.3496e6,-0.770513,0.637424,79.23567,2636.3905
6,"""6""",69.83,"""0""",35.82,"""6""","""21""",39.02,0.0,"""1""",64.75024,"""Criminal Minds_Episode 47_35.8…","""47""","""false""","""false""",-0.781831,0.62349,-0.974928,-0.222521,1.0,-4.9048e-16,-1.0,-4.2863e-16,69.83,1.896524,1.744878,69.0,0.83,35.0,0.82,0,0.717,4876.2289,340507.064087,0.187381,-0.982287,50.06811,825.0426
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
749992,"""46""",48.67,"""9""",88.62,"""2""","""17""",25.65,3.0,"""2""",42.08465,"""Fashion Forward_Episode 75_88.…","""75""","""false""","""false""",0.974928,-0.222521,-0.433884,-0.900969,1.0,-7.3541e-16,0.5,-0.866025,12.1675,0.543071,1.826266,48.0,0.67,88.0,0.62,1,0.75,2368.7689,115287.982363,-1.0,-1.8370e-16,36.5025,1845.3573
749993,"""28""",23.52,"""9""",38.14,"""1""","""17""",86.17,0.0,"""1""",19.71374,"""Style Guide_Episode 83_38.14_T…","""83""","""false""","""false""",0.781831,0.62349,0.974928,-0.222521,1.0,-7.3541e-16,0.5,-0.866025,23.52,0.60092,0.269818,23.0,0.52,38.0,0.14,0,0.717,553.1904,13011.038208,-0.876307,0.481754,16.86384,3138.8662
749997,"""37""",30.98,"""9""",78.58,"""3""","""10""",84.89,0.0,"""0""",15.26,"""Lifestyle Lounge_Episode 51_78…","""51""","""false""","""false""",0.433884,-0.900969,-0.781831,0.62349,6.1232e-16,-1.0,-0.866025,0.5,30.98,0.389294,0.360694,30.0,0.98,78.0,0.58,0,0.717,959.7604,29733.377192,-0.062791,-0.998027,22.21266,2401.3926
749998,"""28""",108.98,"""9""",45.39,"""3""","""10""",93.27,0.0,"""0""",100.72939,"""Style Guide_Episode 47_45.39_T…","""47""","""false""","""false""",0.433884,-0.900969,-0.781831,0.62349,6.1232e-16,-1.0,-0.866025,0.5,108.98,2.349213,1.156041,108.0,0.98,45.0,0.39,0,0.717,11876.6404,1.2943e6,0.187381,-0.982287,78.13866,3831.5353


In [21]:
df_pltpd.null_count()

Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,5246,0,0,0,0,5258,0,0,5395


In [17]:
import polars.selectors as cs

numeric_cols = df_pltpd.select(cs.numeric()).columns
inf_counts_by_col = df_pltpd.select([
    pl.col(col).is_infinite().sum().alias(f"{col}_inf_count")
    for col in numeric_cols
])

print("Infinity count for all columns:")
print(inf_counts_by_col)

# Check it is more than 0
inf_counts_by_col = inf_counts_by_col.to_dict()
inf_counts_by_col = {k: v[0] for k, v in inf_counts_by_col.items()}
# inf_counts_by_col = {k: v for k, v in inf_counts_by_col.items() if v > 0}
if len(inf_counts_by_col) > 0:
    print("Infinity count for all columns:")
    print(inf_counts_by_col)

Infinity count for all columns:
shape: (1, 29)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Episode_L ┆ Host_Popu ┆ Guest_Pop ┆ Number_of ┆ … ┆ Long_Term ┆ Long_Term ┆ Expected_ ┆ Diff_Squ │
│ ength_min ┆ larity_pe ┆ ularity_p ┆ _Ads_inf_ ┆   ┆ _Cycle_Si ┆ _Cycle_Co ┆ Listening ┆ ared_inf │
│ utes_inf_ ┆ rcentage_ ┆ ercentage ┆ count     ┆   ┆ n_inf_cou ┆ s_inf_cou ┆ _Time_Sen ┆ _count   │
│ cou…      ┆ inf…      ┆ _in…      ┆ ---       ┆   ┆ nt        ┆ nt        ┆ tim…      ┆ ---      │
│ ---       ┆ ---       ┆ ---       ┆ u32       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ u32      │
│ u32       ┆ u32       ┆ u32       ┆           ┆   ┆ u32       ┆ u32       ┆ u32       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 0         ┆ 0         ┆ 0         ┆ 0         ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0        │
└───────────┴───────────┴───────────┴───────

In [16]:
inf_counts_by_col

{}

In [3]:
# df_train, df_valid = df_train.random_split(0.8, seed=cfg.random_state)
df_valid = df.sample(fraction=0.2, seed=cfg.random_state)
df_train = df.filter(~pl.col("id").is_in(df_valid["id"]))

df_train = df_train.drop(["id", "fold"])
df_valid = df_valid.drop(["id", "fold"])

df_train, df_valid


(shape: (431_239, 36)
 ┌────────────┬────────────┬───────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
 │ Podcast_Na ┆ Episode_Le ┆ Genre ┆ Host_Popul ┆ … ┆ Long_Term ┆ Long_Term ┆ Expected_ ┆ Diff_Squa │
 │ me         ┆ ngth_minut ┆ ---   ┆ arity_perc ┆   ┆ _Cycle_Si ┆ _Cycle_Co ┆ Listening ┆ red       │
 │ ---        ┆ es         ┆ cat   ┆ entage     ┆   ┆ n         ┆ s         ┆ _Time_Sen ┆ ---       │
 │ cat        ┆ ---        ┆       ┆ ---        ┆   ┆ ---       ┆ ---       ┆ tim…      ┆ f32       │
 │            ┆ f32        ┆       ┆ f32        ┆   ┆ f64       ┆ f64       ┆ ---       ┆           │
 │            ┆            ┆       ┆            ┆   ┆           ┆           ┆ f32       ┆           │
 ╞════════════╪════════════╪═══════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
 │ 1          ┆ 119.800003 ┆ 1     ┆ 66.949997  ┆ … ┆ 0.998027  ┆ -0.062791 ┆ 85.896606 ┆ 3701.5993 │
 │            ┆            ┆       ┆            ┆   ┆       

In [9]:
from itertools import combinations
from tqdm import tqdm
import polars.selectors as cs

cols = df.columns
cols = [item for item in cols if not "listening_time_minutes" in item.lower()]
# cols = [item for item in cols if not "sin" in item.lower() and not "cos" in item.lower() ]
# cols = [item for item in cols if not "pte" in item.lower()]


cols = [
    # 'Listening_Time_minutes',
    
    # 'Podcast_Name',
    # 'Episode_Length_minutes',
    # 'Genre',
    # 'Host_Popularity_percentage',
    # 'Publication_Day',
    # 'Publication_Time',
    # 'Guest_Popularity_percentage',
    # 'Number_of_Ads',
    # 'Episode_Sentiment',
    # 'Episode_Num',
    # 'Episode_Length_minutes_NaN',
    # 'Guest_Popularity_percentage_NaN',
    # 'Day_sin',
    # 'Day_cos',
    # 'Day_sin2',
    # 'Day_cos2',
    # 'Time_sin',
    # 'Time_cos',
    # 'Time_sin2',
    # 'Time_cos2',
    'Length_per_Ads',
    'Length_per_Host',
    'Length_per_Guest',
    'ELen_Int',
    'ELen_Dec',
    'HPperc_Int',
    'HPperc_Dec',
    # 'Is_Positive_Sentiment',
    # 'Sentiment_Multiplier',
    # 'Episode_Length_squared',
    # 'Episode_Length_squared2',
    # 'Long_Term_Cycle_Sin',
    # 'Long_Term_Cycle_Cos',
    # 'Expected_Listening_Time_Sentiment',
    
    # 'Episode_Length_squared',
    # 'Episode_Length_squared2',
    # 'pte-Episode_Length_minutes',
    # 'pte-Host_Popularity_percentage',
    # 'pte-Guest_Popularity_percentage',
    # 'pte-Number_of_Ads',
    # 'pte-Length_per_Ads',
    # 'pte-Length_per_Host',
    # 'pte-Length_per_Guest',
    # 'pte-ELen_Int',
    # 'pte-ELen_Dec',
    # 'pte-HPperc_Int',
    # 'pte-HPperc_Dec',
    # 'pte-Is_Positive_Sentiment',
    # 'pte-Sentiment_Multiplier',
    # 'pte-Episode_Length_squared',
    # 'pte-Episode_Length_squared2',
]

combs = []
combs += list(combinations(cols, 1))
combs += list(combinations(cols, 2))
combs += list(combinations(cols, 3))
# combs += list(combinations(cols, 4))

combs += [('Podcast_Name', 'Host_Popularity_percentage', 'Guest_Popularity_percentage')]

round_num = 4

std_distributes = []
for comb in tqdm(combs):
    # concat_expr = pl.col(comb[0]).cast(pl.Utf8)

    if comb[0] in df.select(cs.numeric()).columns:
        concat_expr =  pl.col(comb[0]).round(round_num).cast(pl.Utf8)
    else:
        concat_expr =  pl.col(comb[0]).cast(pl.Utf8)
    
    for col_name in comb[1:]:
        if col_name in df.select(cs.numeric()).columns:
            concat_expr = concat_expr + "_" + pl.col(col_name).round(round_num).cast(pl.Utf8)
        else:
            concat_expr = concat_expr + "_" + pl.col(col_name).cast(pl.Utf8)
    
    df_train = df_train.with_columns(
        concat_expr.alias("group").cast(pl.Categorical)
    )

    df_group = df_train.group_by("group").agg(
        pl.col("Listening_Time_minutes").count().alias("count"),
        pl.col("Listening_Time_minutes").std().alias("std_listening_time"),
        pl.col("Listening_Time_minutes").mean().alias("mean_listening_time"),
        pl.col("Episode_Length_minutes").mean().alias("mean_episode_length"),
        
    )
    df_group = df_group.filter(~pl.col("std_listening_time").is_null())
    if len(df_group) == 0:
        print(f"Skip {comb}")
        continue

    df_valid = df_valid.with_columns(
        concat_expr.alias("group").cast(pl.Categorical)
    )
    if "mean_listening_time" in df_valid.columns:
        df_valid = df_valid.drop("mean_listening_time")
    df_valid = df_valid.join(
        df_group[["group", "mean_listening_time"]],
        on="group",
        how="left",
    )

    std_distributes.append({
        "group": "-".join(comb),
        "cover_rate": df_group["count"].sum() / df_train["Listening_Time_minutes"].count(),
        "len": len(df_group),
        "mean_count": df_group["count"].mean(),
        "median_count": df_group["count"].median(),
        "std_count": df_group["count"].std(),
        "mean_std_listening_time": df_group["std_listening_time"].mean(),
        "rmse": calculate_rmse(df_valid["Listening_Time_minutes"], df_valid["mean_listening_time"]),
    })

    # break

std_distributes = pl.DataFrame(std_distributes)
std_distributes = std_distributes.sort("rmse")
# std_distributes.write_csv("std_distributes.csv")
std_distributes

  df_valid = df_valid.join(
100%|██████████| 64/64 [00:22<00:00,  2.83it/s]


group,cover_rate,len,mean_count,median_count,std_count,mean_std_listening_time,rmse
str,f64,i64,f64,f64,f64,f64,f64
"""Length_per_Ads-Length_per_Host…",0.020896,4087,2.204796,2.0,0.608631,0.001181,0.000014
"""Length_per_Host-Length_per_Gue…",0.025814,4984,2.233547,2.0,0.674371,0.012236,0.000022
"""Length_per_Host-Length_per_Gue…",0.025749,4970,2.234205,2.0,0.675206,0.003751,0.000022
"""Length_per_Host-Length_per_Gue…",0.025763,4973,2.234064,2.0,0.675027,0.010693,0.000022
"""Length_per_Host-Length_per_Gue…",0.025506,4926,2.232846,2.0,0.674996,0.000979,0.000022
…,…,…,…,…,…,…,…
"""ELen_Dec-HPperc_Int""",0.998639,8080,53.298515,52.0,16.841065,26.393375,27.085489
"""HPperc_Int""",0.999984,85,5073.317647,5317.0,1516.027957,25.816846,27.098182
"""HPperc_Dec""",0.999986,101,4269.633663,4254.0,736.999046,26.911094,27.166574
"""ELen_Dec-HPperc_Dec""",0.998534,10056,42.820903,41.0,14.934361,26.611374,27.247709


In [6]:
std_distributes.filter(pl.col("group")=="Podcast_Name-Host_Popularity_percentage-Guest_Popularity_percentage")

group,cover_rate,len,mean_count,median_count,std_count,mean_std_listening_time,rmse
str,f64,i64,f64,f64,f64,f64,f64
"""Podcast_Name-Host_Popularity_p…",0.030097,5827,2.22739,2.0,0.585866,0.246775,1.934649


In [7]:
std_distributes.plot.point(
    x="cover_rate",
    y="rmse",
    color="group",
).properties(
    width=600,
    height=400,
).show()

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [8]:
top_10 = std_distributes.head(10)
display(top_10)

top_10.plot.point(
    x="cover_rate",
    y="rmse",
    color="group",
).properties(
    width=600,
    height=400,
).show()

group,cover_rate,len,mean_count,median_count,std_count,mean_std_listening_time,rmse
str,f64,i64,f64,f64,f64,f64,f64
"""Podcast_Name-Length_per_Host-L…",0.010716,2220,2.081532,2.0,0.331766,0.009282,1e-06
"""Podcast_Name-Guest_Popularity_…",0.010665,2210,2.080995,2.0,0.32996,0.068237,1e-06
"""Podcast_Name-Host_Popularity_p…",0.010706,2220,2.07973,2.0,0.328109,0.03519,1e-06
"""Genre-Host_Popularity_percenta…",0.013134,2709,2.090808,2.0,0.339248,0.222042,1e-06
"""Genre-Guest_Popularity_percent…",0.013086,2698,2.091549,2.0,0.340335,0.247364,1e-06
"""Episode_Num-Length_per_Host-Le…",0.013918,2766,2.16992,2.0,0.568662,0.00787,4e-06
"""Host_Popularity_percentage-Epi…",0.013865,2757,2.168662,2.0,0.567176,0.029669,4e-06
"""Guest_Popularity_percentage-Ep…",0.013844,2751,2.17012,2.0,0.565166,0.005113,4e-06
"""Length_per_Ads-Length_per_Host…",0.020896,4087,2.204796,2.0,0.608631,0.001181,1.4e-05
"""Guest_Popularity_percentage-Le…",0.020659,4041,2.204652,2.0,0.608063,0.001194,1.4e-05


JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [9]:
filter_cod = (
    (pl.col("rmse") < 12.4)
    # & (pl.col("cover_rate") > 0.2)
)

print(len(std_distributes.filter(filter_cod)))

std_distributes.filter(filter_cod).plot.point(
    x="cover_rate",
    y="rmse",
    color="group",
).properties(
    width=600,
    height=400,
).show()

588


JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [10]:
filter_cod = (
    (pl.col("rmse") < 12.4)
    & (pl.col("cover_rate") > 0.2)
)

print(len(std_distributes.filter(filter_cod)))

std_distributes.filter(filter_cod).plot.point(
    x="cover_rate",
    y="rmse",
    color="group",
).properties(
    width=600,
    height=400,
).show()

235


JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [26]:
import altair as alt
import polars as pl


def plot_highlighted_groups(highlight_groups):
    global std_distributes

    plot_df = std_distributes.with_columns(
        pl.when(pl.col("group").is_in(highlight_groups))
        .then(True)
        .otherwise(False)
        .alias("highlighted")
    )

    chart = alt.Chart(plot_df).mark_point().encode(
        x='cover_rate',
        y='rmse',
        color='group',
        size=alt.condition(
            alt.datum.highlighted == True,
            alt.value(50),
            alt.value(10)
        ),
        opacity=alt.condition(
            alt.datum.highlighted == True,
            alt.value(1.0),
            alt.value(0.3)
        ),
        tooltip=['group', 'cover_rate', 'rmse', 'len', 'mean_std_listening_time']
    ).properties(
        width=600,
        height=400,
        title='Highlighted Groups in Std Distributes'
    )

    chart.show()
    print(highlight_groups)

def qcut_least_std(df, n_bins=5):
    with_bins = df.with_columns(
        pl.col("cover_rate").log10().round(4).alias("target_log"),
    )
    with_bins = with_bins.unique(subset=["target_log"], keep="first")
    with_bins = with_bins.with_columns(
        pl.col("target_log").qcut(n_bins, allow_duplicates=True).alias("target_bin")
    )
    print(with_bins["target_bin"].unique().to_list())
    
    result = with_bins.group_by("target_bin").agg(
        pl.col("group").filter(pl.col("rmse") == pl.col("rmse").min())
    ).sort("target_bin")
    
    print(result)

    least_groups = [item[0] for item in result["group"].to_list() if len(item) > 0]
    return least_groups

least_groups = qcut_least_std(std_distributes, n_bins=20)
print("Length of least groups:", len(least_groups))
plot_highlighted_groups(least_groups)

['(-inf, -1.55543]', '(-1.55543, -1.38008]', '(-1.38008, -1.26916]', '(-1.26916, -1.16492]', '(-1.16492, -1.1000999999999999]', '(-1.1000999999999999, -1.02968]', '(-1.02968, -0.879735]', '(-0.879735, -0.76662]', '(-0.76662, -0.6718499999999998]', '(-0.6718499999999998, -0.63415]', '(-0.63415, -0.58807]', '(-0.58807, -0.52002]', '(-0.52002, -0.43141999999999997]', '(-0.43141999999999997, -0.36074]', '(-0.36074, -0.242375]', '(-0.242375, -0.15996]', '(-0.15996, -0.08053000000000007]', '(-0.08053000000000007, -0.031759999999999976]', '(-0.031759999999999976, -0.007120000000000027]', '(-0.007120000000000027, inf]']
shape: (20, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ target_bin                      ┆ group                           │
│ ---                             ┆ ---                             │
│ cat                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ (-inf, -1.555

ValueError: Unable to determine data type for the field "mean_std_listening_time"; verify that the field name is not misspelled. If you are referencing a field from a transform, also confirm that the data type is specified correctly.

alt.Chart(...)

['Episode_Num-Length_per_Host-Length_per_Guest', 'Episode_Num-Length_per_Ads-Length_per_Host', 'Episode_Length_minutes-Episode_Num-Length_per_Host', 'Publication_Time-Length_per_Ads-Length_per_Host', 'Episode_Length_minutes-Host_Popularity_percentage-Number_of_Ads', 'Episode_Length_minutes-Episode_Num-HPperc_Int', 'Length_per_Host-ELen_Int-HPperc_Dec', 'Genre-Host_Popularity_percentage-ELen_Int', 'Publication_Time-Length_per_Host-ELen_Int', 'Episode_Sentiment-Length_per_Host-ELen_Int', 'Episode_Length_minutes-Publication_Time-Episode_Num', 'Number_of_Ads-Episode_Num-Length_per_Ads', 'Guest_Popularity_percentage_NaN-Length_per_Host-ELen_Int', 'Length_per_Host-ELen_Int', 'Episode_Length_minutes-Guest_Popularity_percentage_NaN-HPperc_Int', 'Episode_Length_minutes-HPperc_Int', 'Genre-Number_of_Ads-Length_per_Ads', 'Publication_Time-Number_of_Ads-Length_per_Ads', 'Number_of_Ads-Guest_Popularity_percentage_NaN-Length_per_Ads', 'Number_of_Ads-Episode_Sentiment-ELen_Int']
