In [1]:
import sys

sys.path.append("../src")

import altair as alt

alt.renderers.enable("jupyter", offline=True)
alt.data_transformers.disable_max_rows()

from pathlib import Path

import altair as alt
import numpy as np
import polars as pl
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from config import cfg
from data.data_process import add_fold
from data.simple_feature_eng import preprocess

# warnings.filterwarnings("ignore")
# warnings.simplefilter("ignore")


def plot_point(df: pl.DataFrame):
    transforms = [pl.col(col).cast(pl.Float64) for col in ["Episode_Length_minutes", "Listening_Time_minutes"]]
    df = df.with_columns(transforms)

    print("Length:", len(df))
    df.plot.point(
        x="Episode_Length_minutes",
        y="Listening_Time_minutes",
    ).properties(
        width=800,
        height=400,
    ).show()


def plot_linear_regression(df):
    X = df.select("Episode_Length_minutes").to_numpy()
    y = df.select("Listening_Time_minutes").to_numpy()

    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y, predictions))

    print(f"Coefficient: {model.coef_[0][0]:.4f}")
    print(f"Intercept: {model.intercept_[0]:.4f}")
    print(f"RMSE: {rmse:.4f}")

    df = df.with_columns(pl.Series("Predicted", predictions.flatten()))
    if len(df) > 10000:
        df = df.sample(10000)

    scatter_plot = (
        alt.Chart(df.to_pandas())
        .mark_circle()
        .encode(
            x=alt.X("Episode_Length_minutes", title="Episode Length (minutes)"),
            y=alt.Y("Listening_Time_minutes", title="Listening Time (minutes)"),
            tooltip=["Episode_Length_minutes", "Listening_Time_minutes"],
        )
    )

    regression_line = alt.Chart(df.to_pandas()).mark_line(color="red").encode(x="Episode_Length_minutes", y="Predicted")

    chart = (scatter_plot + regression_line).properties(
        width=800,
        height=400,
    )

    return chart.show()


cfg.train_path = Path("../data/train.csv")
cfg.test_path = Path("../data/test.csv")
cfg.pltpd_path = Path("../data/podcast_dataset.csv")


df_test = pl.read_csv(cfg.test_path)

df_train = pl.read_csv(cfg.train_path)
df_train = df_train.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads"])


# df_train = df_train.drop("id")
df_train = add_fold(df_train)
df_train = preprocess(df_train)

df_pltpd = pl.read_csv(cfg.pltpd_path)
df_pltpd = df_pltpd.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads", "Listening_Time_minutes"])
df_pltpd = df_pltpd.with_columns(pl.col("Number_of_Ads").cast(pl.Float64))
df_pltpd = add_fold(df_pltpd)
df_pltpd = preprocess(df_pltpd)
df_pltpd = df_pltpd.with_columns(pl.Series(range(1_000_000, 1_000_000 + len(df_pltpd))).alias("id"))
# df_pltpd

df_train = df_train.drop(["id", "fold"])
df_pltpd = df_pltpd.drop(["id", "fold"])

df = df_train.clone()
df


Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Episode_Length_minutes_NaN,Guest_Popularity_percentage_NaN,Episode_Num_Cat
str,f64,str,f64,str,str,f64,f64,str,f64,i32,cat,cat,cat
"""1""",119.8,"""1""",66.95,"""5""","""14""",75.95,2.0,"""0""",88.01241,26,"""false""","""false""","""26"""
"""2""",73.9,"""2""",69.97,"""1""","""17""",8.97,0.0,"""0""",44.92531,16,"""false""","""false""","""16"""
"""3""",67.17,"""3""",57.22,"""0""","""10""",78.7,2.0,"""2""",46.27824,45,"""false""","""false""","""45"""
"""4""",110.51,"""4""",80.07,"""0""","""14""",58.68,3.0,"""1""",75.61031,86,"""false""","""false""","""86"""
"""5""",26.54,"""4""",48.96,"""5""","""14""",53.63,3.0,"""2""",22.77047,19,"""false""","""true""","""19"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""36""",75.66,"""2""",69.36,"""5""","""10""",53.63,0.0,"""0""",56.87058,25,"""false""","""true""","""25"""
"""19""",75.75,"""8""",35.21,"""5""","""21""",53.63,2.0,"""1""",45.46242,21,"""false""","""true""","""21"""
"""37""",30.98,"""9""",78.58,"""3""","""10""",84.89,0.0,"""0""",15.26,51,"""false""","""false""","""51"""
"""28""",108.98,"""9""",45.39,"""3""","""10""",93.27,0.0,"""0""",100.72939,47,"""false""","""false""","""47"""


In [2]:
pl_i_type = pl.Int32
pl_f_type = pl.Float32

def feature_eng(df: pl.DataFrame, df_train: pl.DataFrame) -> pl.DataFrame:
    global selected
    # Cyclical features for day and time
    df = df.with_columns(
        # Day features
        pl.col("Publication_Day").cast(pl_f_type).mul(2 * np.pi / 7).sin().alias("Day_sin"),
        pl.col("Publication_Day").cast(pl_f_type).mul(2 * np.pi / 7).cos().alias("Day_cos"),
        pl.col("Publication_Day").cast(pl_f_type).mul(4 * np.pi / 7).sin().alias("Day_sin2"),
        pl.col("Publication_Day").cast(pl_f_type).mul(4 * np.pi / 7).cos().alias("Day_cos2"),
        # Time features
        pl.col("Publication_Time").cast(pl_f_type).mul(2 * np.pi / 4).sin().alias("Time_sin"),
        pl.col("Publication_Time").cast(pl_f_type).mul(2 * np.pi / 4).cos().alias("Time_cos"),
        pl.col("Publication_Time").cast(pl_f_type).mul(4 * np.pi / 24).sin().alias("Time_sin2"),
        pl.col("Publication_Time").cast(pl_f_type).mul(4 * np.pi / 24).cos().alias("Time_cos2"),
        # Ratio features
        (pl.col("Episode_Length_minutes") / (pl.col("Number_of_Ads") + 1)).fill_null(0).alias("Length_per_Ads"),
        (pl.col("Episode_Length_minutes") / (pl.col("Host_Popularity_percentage") + 1)).fill_null(0).alias("Length_per_Host"),
        (pl.col("Episode_Length_minutes") / (pl.col("Guest_Popularity_percentage") + 1)).fill_null(0).alias("Length_per_Guest"),
        # Episode length features
        pl.col("Episode_Length_minutes").floor().alias("ELen_Int"),
        (pl.col("Episode_Length_minutes") - pl.col("Episode_Length_minutes").floor()).alias("ELen_Dec"),
        pl.col("Host_Popularity_percentage").floor().alias("HPperc_Int"),
        (pl.col("Host_Popularity_percentage") - pl.col("Host_Popularity_percentage").floor()).alias("HPperc_Dec"),
        # Sentiment features
        (pl.col("Episode_Sentiment") == "2").cast(pl.Int8).alias("Is_Positive_Sentiment"),
        pl.when(pl.col("Episode_Sentiment") == "2").then(0.75).otherwise(0.717).cast(pl_f_type).alias("Sentiment_Multiplier"),
        # Squared features
        (pl.col("Episode_Length_minutes") ** 2).alias("Episode_Length_squared"),
        (pl.col("Episode_Length_minutes") ** 3).alias("Episode_Length_squared2"),
    )

    df = df.with_columns(
        (np.sin(2 * np.pi * pl.col("Episode_Num") / 100)).alias("Long_Term_Cycle_Sin"),
        (np.cos(2 * np.pi * pl.col("Episode_Num") / 100)).alias("Long_Term_Cycle_Cos"),
        (pl.col("Episode_Length_minutes") * pl.col("Sentiment_Multiplier")).alias("Expected_Listening_Time_Sentiment"),
    )

    # Convert columns to categorical
    df = df.with_columns(pl.col("Episode_Num").cast(pl_i_type).alias("Episode_Num_Int"))
    for col in ["Podcast_Name", "Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment", "Episode_Num"]:
        df = df.with_columns(pl.col(col).cast(pl.Utf8).cast(pl.Categorical))

    return df

df_train = df.clone()
df_train = feature_eng(df_train, df_train)
df_train

Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Episode_Length_minutes_NaN,Guest_Popularity_percentage_NaN,Episode_Num_Cat,Day_sin,Day_cos,Day_sin2,Day_cos2,Time_sin,Time_cos,Time_sin2,Time_cos2,Length_per_Ads,Length_per_Host,Length_per_Guest,ELen_Int,ELen_Dec,HPperc_Int,HPperc_Dec,Is_Positive_Sentiment,Sentiment_Multiplier,Episode_Length_squared,Episode_Length_squared2,Long_Term_Cycle_Sin,Long_Term_Cycle_Cos,Expected_Listening_Time_Sentiment,Episode_Num_Int
cat,f64,cat,f64,cat,cat,f64,f64,cat,f64,cat,cat,cat,cat,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,i8,f32,f64,f64,f64,f64,f64,i32
"""1""",119.8,"""1""",66.95,"""5""","""14""",75.95,2.0,"""0""",88.01241,"""26""","""false""","""false""","""26""",-0.974928,-0.222521,0.433884,-0.900969,-0.000001,-1.0,0.866026,0.5,39.933333,1.763061,1.556855,119.0,0.8,66.0,0.95,0,0.717,14352.04,1.7194e6,0.998027,-0.062791,85.896601,26
"""2""",73.9,"""2""",69.97,"""1""","""17""",8.97,0.0,"""0""",44.92531,"""16""","""false""","""false""","""16""",0.781832,0.62349,0.974928,-0.222521,1.0,-0.000001,0.5,-0.866025,73.9,1.041285,7.412237,73.0,0.9,69.0,0.97,0,0.717,5461.21,403583.419,0.844328,0.535827,52.986301,16
"""3""",67.17,"""3""",57.22,"""0""","""10""",78.7,2.0,"""2""",46.27824,"""45""","""false""","""false""","""45""",0.0,1.0,0.0,1.0,-6.7553e-7,-1.0,-0.866025,0.5,22.39,1.153727,0.842785,67.0,0.17,57.0,0.22,1,0.75,4511.8089,303058.203813,0.309017,-0.951057,50.3775,45
"""4""",110.51,"""4""",80.07,"""0""","""14""",58.68,3.0,"""1""",75.61031,"""86""","""false""","""false""","""86""",0.0,1.0,0.0,1.0,-0.000001,-1.0,0.866026,0.5,27.6275,1.363143,1.851709,110.0,0.51,80.0,0.07,0,0.717,12212.4601,1.3496e6,-0.770513,0.637424,79.235671,86
"""5""",26.54,"""4""",48.96,"""5""","""14""",53.63,3.0,"""2""",22.77047,"""19""","""false""","""true""","""19""",-0.974928,-0.222521,0.433884,-0.900969,-0.000001,-1.0,0.866026,0.5,6.635,0.531225,0.485814,26.0,0.54,48.0,0.96,1,0.75,704.3716,18694.022264,0.929776,0.368125,19.905,19
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""36""",75.66,"""2""",69.36,"""5""","""10""",53.63,0.0,"""0""",56.87058,"""25""","""false""","""true""","""25""",-0.974928,-0.222521,0.433884,-0.900969,-6.7553e-7,-1.0,-0.866025,0.5,75.66,1.075327,1.384953,75.0,0.66,69.0,0.36,0,0.717,5724.4356,433110.797496,1.0,6.1232e-17,54.248221,25
"""19""",75.75,"""8""",35.21,"""5""","""21""",53.63,2.0,"""1""",45.46242,"""21""","""false""","""true""","""21""",-0.974928,-0.222521,0.433884,-0.900969,1.0,-0.000002,-1.0,6.6361e-7,25.25,2.091964,1.386601,75.0,0.75,35.0,0.21,0,0.717,5738.0625,434658.234375,0.968583,0.24869,54.312751,21
"""37""",30.98,"""9""",78.58,"""3""","""10""",84.89,0.0,"""0""",15.26,"""51""","""false""","""false""","""51""",0.433884,-0.900969,-0.781831,0.62349,-6.7553e-7,-1.0,-0.866025,0.5,30.98,0.389294,0.360694,30.0,0.98,78.0,0.58,0,0.717,959.7604,29733.377192,-0.062791,-0.998027,22.21266,51
"""28""",108.98,"""9""",45.39,"""3""","""10""",93.27,0.0,"""0""",100.72939,"""47""","""false""","""false""","""47""",0.433884,-0.900969,-0.781831,0.62349,-6.7553e-7,-1.0,-0.866025,0.5,108.98,2.349213,1.156041,108.0,0.98,45.0,0.39,0,0.717,11876.6404,1.2943e6,0.187381,-0.982287,78.138661,47


In [3]:
alt.Chart(df_train.sample(10000)).mark_bar().encode(
    x=alt.X('Listening_Time_minutes', bin=True),
    y=alt.Y('count()')
)

JupyterChart(spec={'config': {'view': {'continuousWidth': 300, 'continuousHeight': 300}}, 'data': {'name': 'da…

In [66]:
df_train.filter(pl.col("Episode_Length_minutes_round_3_dec") != 0)

Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Episode_Length_minutes_NaN,Guest_Popularity_percentage_NaN,Episode_Num_Cat,Day_sin,Day_cos,Day_sin2,Day_cos2,Time_sin,Time_cos,Time_sin2,Time_cos2,Length_per_Ads,Length_per_Host,Length_per_Guest,Is_Positive_Sentiment,Sentiment_Multiplier,Episode_Length_squared,Episode_Length_squared2,Episode_Length_minutes_int,Episode_Length_minutes_dec,Episode_Length_minutes_round_0,Episode_Length_minutes_round_0_dec,Episode_Length_minutes_round_1,Episode_Length_minutes_round_1_dec,Episode_Length_minutes_round_2,Episode_Length_minutes_round_2_dec,Episode_Length_minutes_round_3,Episode_Length_minutes_round_3_dec,Host_Popularity_percentage_int,Host_Popularity_percentage_dec,Host_Popularity_percentage_round_0,Host_Popularity_percentage_round_0_dec,Host_Popularity_percentage_round_1,Host_Popularity_percentage_round_1_dec,Host_Popularity_percentage_round_2,Host_Popularity_percentage_round_2_dec,Host_Popularity_percentage_round_3,Host_Popularity_percentage_round_3_dec,Long_Term_Cycle_Sin,Long_Term_Cycle_Cos,Expected_Listening_Time_Sentiment
cat,f64,cat,f64,cat,cat,f64,f64,cat,f64,cat,cat,cat,cat,f32,f32,f32,f32,f32,f32,f32,f32,f64,f64,f64,i8,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""27""",33.23,"""4""",43.35,"""2""","""21""",0.62,1.0,"""1""",30.12822,"""25""","""false""","""false""","""25""",0.974928,-0.222521,-0.433884,-0.900969,1.0,-0.000002,-1.0,6.6361e-7,16.615,0.749267,20.512346,0,0.717,1104.2329,36693.659267,33.0,0.23,33.0,3.0,33.2,2.0,33.23,2.0,33.23,10.0,43.0,0.35,43.0,3.0,43.4,3.0,43.35,5.0,43.35,0.0,1.0,6.1232e-17,23.82591
"""26""",19.56,"""7""",93.77,"""5""","""21""",93.09,1.0,"""2""",15.12164,"""79""","""false""","""false""","""79""",-0.974928,-0.222521,0.433884,-0.900969,1.0,-0.000002,-1.0,6.6361e-7,9.78,0.206394,0.207886,1,0.75,382.5936,7483.530816,19.0,0.56,20.0,9.0,19.6,5.0,19.56,5.0,19.56,10.0,93.0,0.77,94.0,3.0,93.8,7.0,93.77,7.0,93.77,0.0,-0.968583,0.24869,14.67
"""37""",72.32,"""9""",78.69,"""3""","""17""",39.11,3.0,"""0""",37.28,"""36""","""false""","""false""","""36""",0.433884,-0.900969,-0.781831,0.62349,1.0,-0.000001,0.5,-0.866025,18.08,0.907517,1.803042,0,0.717,5230.1824,378246.791168,72.0,0.32,72.0,2.0,72.3,3.0,72.32,1.0,72.32,10.0,78.0,0.69,79.0,8.0,78.7,6.0,78.69,9.0,78.69,0.0,0.770513,-0.637424,51.853441
"""42""",77.6,"""8""",79.63,"""3""","""10""",47.93,1.0,"""0""",71.11,"""23""","""false""","""false""","""23""",0.433884,-0.900969,-0.781831,0.62349,-6.7553e-7,-1.0,-0.866025,0.5,38.8,0.962421,1.585939,0,0.717,6021.76,467288.576,77.0,0.6,78.0,7.0,77.6,6.0,77.6,-1.0,77.6,10.0,79.0,0.63,80.0,9.0,79.6,6.0,79.63,3.0,79.63,0.0,0.992115,0.125333,55.639201
"""9""",64.57,"""6""",84.75,"""3""","""10""",53.63,1.0,"""1""",53.49181,"""59""","""false""","""true""","""59""",0.433884,-0.900969,-0.781831,0.62349,-6.7553e-7,-1.0,-0.866025,0.5,32.285,0.753003,1.181951,0,0.717,4169.2849,269210.725993,64.0,0.57,65.0,4.0,64.6,5.0,64.57,6.0,64.57,9.0,84.0,0.75,85.0,4.0,84.8,7.0,84.75,5.0,84.75,0.0,-0.535827,-0.844328,46.29669
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""6""",78.24,"""0""",57.37,"""2""","""17""",60.71,1.0,"""1""",68.60912,"""56""","""false""","""false""","""56""",0.974928,-0.222521,-0.433884,-0.900969,1.0,-0.000001,0.5,-0.866025,39.12,1.340415,1.267866,0,0.717,6121.4976,478945.972224,78.0,0.24,78.0,8.0,78.2,2.0,78.24,3.0,78.24,10.0,57.0,0.37,57.0,7.0,57.4,3.0,57.37,7.0,57.37,0.0,-0.368125,-0.929776,56.098081
"""31""",67.32,"""8""",98.59,"""6""","""10""",53.63,2.0,"""2""",42.6,"""68""","""false""","""true""","""68""",-0.781831,0.62349,-0.974928,-0.22252,-6.7553e-7,-1.0,-0.866025,0.5,22.44,0.675971,1.23229,1,0.75,4531.9824,305093.055168,67.0,0.32,67.0,7.0,67.3,3.0,67.32,1.0,67.32,10.0,98.0,0.59,99.0,8.0,98.6,5.0,98.59,9.0,98.59,0.0,-0.904827,-0.425779,50.49
"""17""",32.62,"""1""",73.07,"""6""","""10""",63.77,2.0,"""0""",16.36275,"""74""","""false""","""false""","""74""",-0.781831,0.62349,-0.974928,-0.22252,-6.7553e-7,-1.0,-0.866025,0.5,10.873333,0.440394,0.503628,0,0.717,1064.0644,34709.780728,32.0,0.62,33.0,2.0,32.6,6.0,32.62,1.0,32.62,9.0,73.0,0.07,73.0,3.0,73.1,0.0,73.07,6.0,73.07,10.0,-0.998027,-0.062791,23.38854
"""11""",16.4,"""6""",61.49,"""1""","""14""",41.34,0.0,"""1""",6.17,"""36""","""false""","""false""","""36""",0.781832,0.62349,0.974928,-0.222521,-0.000001,-1.0,0.866026,0.5,16.4,0.262442,0.387341,0,0.717,268.96,4410.944,16.0,0.4,16.0,6.0,16.4,4.0,16.4,-1.0,16.4,10.0,61.0,0.49,61.0,1.0,61.5,4.0,61.49,9.0,61.49,0.0,0.770513,-0.637424,11.7588


In [None]:
import polars as pl
import numpy as np
from sklearn.manifold import TSNE
import altair as alt

df_train = df_train.with_columns(
    pl.col("Genre").cast(pl.Int32),
    pl.col("Publication_Day").cast(pl.Int32),
    pl.col("Publication_Time").cast(pl.Int32),
    pl.col("Episode_Sentiment").cast(pl.Int32),
)
df_train = df_train.sample(1000)

features = df_train.select([
    "Episode_Length_minutes",
    "Listening_Time_minutes",
    "Genre",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
    "Publication_Day",
    "Publication_Time",
    "Number_of_Ads",
    "Episode_Sentiment",
    "Episode_Num",
]).to_numpy()

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
tsne_results = tsne.fit_transform(features)

df = df.with_columns([
    pl.Series("tsne_1", tsne_results[:, 0]),
    pl.Series("tsne_2", tsne_results[:, 1])
])

chart = alt.Chart(df).mark_circle(size=60).encode(
    x='tsne_1:Q',
    y='tsne_2:Q',
    color=alt.Color('label:N', scale=alt.Scale(scheme='Listening_Time_minutes')),
    tooltip=['label:N']
).properties(
    width=600,
    height=400,
    title='t-SNE Visualization of High-Dimensional Data'
).interactive()

chart.show()


[['Guest_Popularity_percentage_NaN', 'ELen_Int'],
 ['Episode_Sentiment', 'ELen_Int'],
 ['Number_of_Ads', 'ELen_Int'],
 ['Number_of_Ads', 'Episode_Length_minutes_NaN', 'ELen_Int'],
 ['ELen_Dec'],
 ['Episode_Sentiment', 'Guest_Popularity_percentage_NaN', 'ELen_Int'],
 ['Number_of_Ads', 'Guest_Popularity_percentage_NaN', 'ELen_Int'],
 ['Publication_Time', 'Guest_Popularity_percentage_NaN', 'ELen_Int'],
 ['Number_of_Ads', 'Episode_Sentiment', 'ELen_Int'],
 ['Publication_Time', 'Number_of_Ads', 'ELen_Int'],
 ['Genre', 'Guest_Popularity_percentage_NaN', 'ELen_Int'],
 ['Publication_Day', 'Number_of_Ads', 'ELen_Int'],
 ['Genre', 'Number_of_Ads', 'ELen_Int'],
 ['Podcast_Name', 'ELen_Int'],
 ['Podcast_Name', 'Genre', 'ELen_Int'],
 ['ELen_Int', 'HPperc_Int'],
 ['ELen_Int', 'ELen_Dec'],
 ['Episode_Length_minutes_NaN', 'ELen_Int', 'ELen_Dec'],
 ['Episode_Num', 'Episode_Length_minutes_NaN', 'ELen_Int'],
 ['Host_Popularity_percentage', 'Episode_Num', 'Length_per_Guest'],
 ['Podcast_Name', 'Host_Popul

In [20]:
np.array([len(selected.split("-")) for selected in selecteds])

array([2, 2, 2, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 2, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 2, 3, 3, 3, 3, 3, 3,
       3, 3])

In [13]:
df_train = pl.read_csv(cfg.train_path, infer_schema_length=0).with_columns(pl.all().cast(pl.String, strict=False))
df_pltpd = pl.read_csv(cfg.pltpd_path, infer_schema_length=0).with_columns(pl.all().cast(pl.String, strict=False))
df_test = pl.read_csv(cfg.test_path, infer_schema_length=0).with_columns(pl.all().cast(pl.String, strict=False))

# df_train = df_train.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads", "Guest_Popularity_percentage", "Listening_Time_minutes"])
# df_pltpd = df_pltpd.drop_nulls(subset=["Episode_Length_minutes", "Number_of_Ads", "Guest_Popularity_percentage", "Listening_Time_minutes"])
# df_test = df_test.drop_nulls(subset=["Episode_Length_minutes", "Guest_Popularity_percentage"])

df_pltpd = df_pltpd.with_columns(pl.col("Number_of_Ads").cast(pl.Float64).cast(pl.String))

transforms = []
for col in ["Episode_Length_minutes", "Host_Popularity_percentage", "Guest_Popularity_percentage", "Number_of_Ads", "Listening_Time_minutes"]:
    transforms += [
        pl.when(pl.col(col).str.contains("\\."))
            .then(
                pl.col(col).str.extract(r"\.(\d+)$").str.len_chars().cast(pl.Int64)
            )
            .otherwise(pl.lit(0))
            .alias(f"{col}_Decimal_Len"),
        
        pl.when(pl.col(col).str.contains("\\."))
            .then(
                pl.col(col).str.extract(r"\.(\d+)$").cast(pl.Int64)
            )
            .otherwise(pl.lit(0))
            .alias(f"{col}_Decimal"),
        
        pl.col(col).cast(pl.Float64)
    ]

df_train = df_train.with_columns(transforms)
df_pltpd = df_pltpd.with_columns(transforms)
df_test = df_test.with_columns(transforms[:-3])

display(df_train)

id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Length_minutes_Decimal_Len,Episode_Length_minutes_Decimal,Host_Popularity_percentage_Decimal_Len,Host_Popularity_percentage_Decimal,Guest_Popularity_percentage_Decimal_Len,Guest_Popularity_percentage_Decimal,Number_of_Ads_Decimal_Len,Number_of_Ads_Decimal,Listening_Time_minutes_Decimal_Len,Listening_Time_minutes_Decimal
str,str,str,f64,str,f64,str,str,f64,f64,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""0""","""Mystery Matters""","""Episode 98""",,"""True Crime""",74.81,"""Thursday""","""Night""",,0.0,"""Positive""",31.41998,0,0,2,81,0,0,1,0,5,41998
"""1""","""Joke Junction""","""Episode 26""",119.8,"""Comedy""",66.95,"""Saturday""","""Afternoon""",75.95,2.0,"""Negative""",88.01241,1,8,2,95,2,95,1,0,5,1241
"""2""","""Study Sessions""","""Episode 16""",73.9,"""Education""",69.97,"""Tuesday""","""Evening""",8.97,0.0,"""Negative""",44.92531,1,9,2,97,2,97,1,0,5,92531
"""3""","""Digital Digest""","""Episode 45""",67.17,"""Technology""",57.22,"""Monday""","""Morning""",78.7,2.0,"""Positive""",46.27824,2,17,2,22,1,7,1,0,5,27824
"""4""","""Mind & Body""","""Episode 86""",110.51,"""Health""",80.07,"""Monday""","""Afternoon""",58.68,3.0,"""Neutral""",75.61031,2,51,2,7,2,68,1,0,5,61031
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""749995""","""Learning Lab""","""Episode 25""",75.66,"""Education""",69.36,"""Saturday""","""Morning""",,0.0,"""Negative""",56.87058,2,66,2,36,0,0,1,0,5,87058
"""749996""","""Business Briefs""","""Episode 21""",75.75,"""Business""",35.21,"""Saturday""","""Night""",,2.0,"""Neutral""",45.46242,2,75,2,21,0,0,1,0,5,46242
"""749997""","""Lifestyle Lounge""","""Episode 51""",30.98,"""Lifestyle""",78.58,"""Thursday""","""Morning""",84.89,0.0,"""Negative""",15.26,2,98,2,58,2,89,1,0,2,26
"""749998""","""Style Guide""","""Episode 47""",108.98,"""Lifestyle""",45.39,"""Thursday""","""Morning""",93.27,0.0,"""Negative""",100.72939,2,98,2,39,2,27,1,0,5,72939


In [14]:
df = df_train.filter(pl.col("Episode_Length_minutes_Decimal_Len") > 2)
df

id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Length_minutes_Decimal_Len,Episode_Length_minutes_Decimal,Host_Popularity_percentage_Decimal_Len,Host_Popularity_percentage_Decimal,Guest_Popularity_percentage_Decimal_Len,Guest_Popularity_percentage_Decimal,Number_of_Ads_Decimal_Len,Number_of_Ads_Decimal,Listening_Time_minutes_Decimal_Len,Listening_Time_minutes_Decimal
str,str,str,f64,str,f64,str,str,f64,f64,str,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
"""948""","""Educational Nuggets""","""Episode 59""",96.10732,"""Education""",85.95,"""Tuesday""","""Evening""",90.83,2.0,"""Neutral""",96.10741,5,10732,2,95,2,83,1,0,5,10741
"""1745""","""Gadget Geek""","""Episode 21""",54.468002,"""Technology""",34.57,"""Sunday""","""Morning""",92.15,3.0,"""Neutral""",54.468,7,4680016,2,57,2,15,1,0,3,468
"""3098""","""Market Masters""","""Episode 62""",6.598,"""Business""",54.23,"""Sunday""","""Morning""",33.22,3.0,"""Positive""",6.59814,3,598,2,23,2,22,1,0,5,59814
"""3448""","""Home & Living""","""Episode 48""",13.000003,"""Lifestyle""",42.47,"""Sunday""","""Night""",96.87,1.0,"""Neutral""",13.0,6,3,2,47,2,87,1,0,1,0
"""3618""","""Health Hour""","""Episode 67""",54.468002,"""Health""",21.14,"""Thursday""","""Morning""",,2.0,"""Neutral""",54.468,7,4680016,2,14,0,0,1,0,3,468
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""747943""","""Business Insights""","""Episode 90""",29.378789,"""Business""",83.2,"""Tuesday""","""Afternoon""",,2.0,"""Negative""",29.0,8,37878924,1,2,0,0,1,0,1,0
"""748343""","""Tech Trends""","""Episode 47""",112.002,"""Technology""",87.0,"""Sunday""","""Morning""",66.51,0.0,"""Positive""",112.0,3,2,1,0,2,51,1,0,1,0
"""748407""","""Wellness Wave""","""Episode 64""",6.712923,"""Health""",53.72,"""Wednesday""","""Night""",64.78,1.0,"""Neutral""",6.71281,8,71292308,2,72,2,78,1,0,5,71281
"""749034""","""Tune Time""","""Episode 89""",8.328313,"""Music""",70.64,"""Sunday""","""Morning""",18.55,2.0,"""Negative""",8.32831,8,32831309,2,64,2,55,1,0,5,32831


In [15]:
import numpy as np

def calculate_rmse(actual, predicted):
    squared_diff = (actual - predicted) ** 2
    mean_squared_diff = squared_diff.mean()
    rmse = np.sqrt(mean_squared_diff)
    return rmse

def optimize_scaling_factor(input_data, target_data, n_searches=20):
    initial_x = target_data.mean() / input_data.mean()
    
    # Define search range
    lower_bound = initial_x * 0.5
    upper_bound = initial_x * 1.5
    
    # Perform n binary searches
    for _ in range(n_searches):
        mid_point = (lower_bound + upper_bound) / 2
        
        delta = (upper_bound - lower_bound) * 0.1
        
        lower_x = mid_point - delta
        upper_x = mid_point + delta
        
        lower_rmse = calculate_rmse(target_data, input_data * lower_x)
        upper_rmse = calculate_rmse(target_data, input_data * upper_x)
        
        if lower_rmse < upper_rmse:
            upper_bound = mid_point
        else:
            lower_bound = mid_point
    
    best_x = (lower_bound + upper_bound) / 2
    best_rmse = calculate_rmse(target_data, input_data * best_x)
    
    return best_x, best_rmse

import time
for i in range(0, 100, 1):
    start_time = time.time()
    x_optimal = optimize_scaling_factor(df["Episode_Length_minutes"], df["Listening_Time_minutes"], n_searches=i)
    print(i, "\tCoefficient:", x_optimal[0], "\tRMSE:", x_optimal[1], "\tTime taken: ", time.time() - start_time)

0 	Coefficient: 0.9598565775851021 	RMSE: 6.2916710331445485 	Time taken:  0.0009746551513671875
1 	Coefficient: 0.7198924331888266 	RMSE: 14.23471322374616 	Time taken:  0.0006189346313476562
2 	Coefficient: 0.8398745053869643 	RMSE: 8.918088931623851 	Time taken:  0.0019731521606445312
3 	Coefficient: 0.8998655414860333 	RMSE: 7.011795185255232 	Time taken:  0.00131988525390625
4 	Coefficient: 0.9298610595355676 	RMSE: 6.463500341295242 	Time taken:  0.0008881092071533203
5 	Coefficient: 0.9448588185603348 	RMSE: 6.327036445085333 	Time taken:  0.0013899803161621094
6 	Coefficient: 0.9523576980727184 	RMSE: 6.29649582675391 	Time taken:  0.0013933181762695312
7 	Coefficient: 0.9561071378289103 	RMSE: 6.290857862490507 	Time taken:  0.034173011779785156
8 	Coefficient: 0.9579818577070063 	RMSE: 6.290457747135269 	Time taken:  0.0019578933715820312
9 	Coefficient: 0.9570444977679583 	RMSE: 6.290456119788039 	Time taken:  0.0017893314361572266
10 	Coefficient: 0.9575131777374823 	RMSE: 

In [16]:
calculate_rmse(df["Episode_Length_minutes"] * 0.9575093955930962, df["Listening_Time_minutes"])

6.290406507123719