In [1]:
from pathlib import Path
from dataclasses import dataclass, asdict

@dataclass
class CFG:
    train_path: Path = Path("../data/train.csv")
    test_path: Path = Path("../data/test.csv")
    sub_path: Path = Path("../data/sample_submission.csv")
    pltpd_path: Path = Path("../data/podcast_dataset.csv")

    num_fold: int = 5
    dev_mode: bool = False

    # Model parameters
    n_iter: int = 10000
    max_depth: int = -1
    num_leaves: int = 1024
    colsample_bytree: float = 0.7
    learning_rate: float = 0.02

    objective: str = 'l2'
    metric: str = 'rmse'
    verbosity: int = -1
    max_bin: int = 1024
    
    random_state: int = 42
    shuffle: bool = True
    encoded_columns_start: int = -91
    log_eval: int = 100
    early_stopping: int = 200
    
cfg = CFG() 
asdict(cfg)

{'train_path': PosixPath('../data/train.csv'),
 'test_path': PosixPath('../data/test.csv'),
 'sub_path': PosixPath('../data/sample_submission.csv'),
 'pltpd_path': PosixPath('../data/podcast_dataset.csv'),
 'num_fold': 5,
 'dev_mode': False,
 'n_iter': 10000,
 'max_depth': -1,
 'num_leaves': 1024,
 'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'objective': 'l2',
 'metric': 'rmse',
 'verbosity': -1,
 'max_bin': 1024,
 'random_state': 42,
 'shuffle': True,
 'encoded_columns_start': -91,
 'log_eval': 100,
 'early_stopping': 200}

In [2]:
from IPython.display import display
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

re_dict = {}
re_dict['podc_dict'] = {
    'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 
    'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 
    'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 
    'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 
    'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 
    'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 
    'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 
    'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 
    'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 
    'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 
    'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 
    'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 
    'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47
}
re_dict['genr_dict'] = {'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
re_dict['week_dict'] = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
re_dict['time_dict'] = {'Morning': 10, 'Afternoon': 14, 'Evening': 17, 'Night': 21}
re_dict['sent_dict'] = {'Negative': 0, 'Neutral': 1, 'Positive': 2}


def preprocess_df(df):
    df['Episode_Num'] = df['Episode_Title'].str[8:].astype(int)  # Convert to int before log transform
    df = df.drop(columns=['Episode_Title'])

    # Convert categorical variables
    df['Genre'] = df['Genre'].replace(re_dict["genr_dict"])
    df['Podcast_Name'] = df['Podcast_Name'].replace(re_dict["podc_dict"])
    df['Publication_Day'] = df['Publication_Day'].replace(re_dict["week_dict"])
    df['Publication_Time'] = df['Publication_Time'].replace(re_dict["time_dict"])
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(re_dict["sent_dict"])

    df.loc[df['Episode_Length_minutes']>121.0, 'Episode_Length_minutes'] = 121.0

    df['Host_Guest_Diff'] = df['Host_Popularity_percentage'] - df['Guest_Popularity_percentage']
    df['Host_Guest_Ratio'] = (df['Host_Popularity_percentage'] / df['Guest_Popularity_percentage']).replace([float('inf'), -float('inf')], pd.NA)

    if "Listening_Time_minutes" in df.columns:
        df['Listening_Episode_Diff'] = df['Episode_Length_minutes'] - df['Listening_Time_minutes']
        df['Listening_Episode_Ratio'] = (df['Episode_Length_minutes'] / df['Listening_Time_minutes']).replace([float('inf'), -float('inf')], pd.NA)

    return df


df_train = pd.read_csv(cfg.train_path, index_col='id')
df_test = pd.read_csv(cfg.test_path, index_col='id')
df_sub = pd.read_csv(cfg.sub_path, index_col='id')

df_pltpd = pd.read_csv(cfg.pltpd_path)
df_pltpd = df_pltpd.dropna(subset=['Listening_Time_minutes'])
df_pltpd = df_pltpd.reset_index(drop=True)
df_pltpd.index = df_pltpd.index + 1000000

df_train = pd.concat([df_train, df_pltpd], axis=0)
df_train["id"] = df_train.index

# is_dev_mode = False
# # is_dev_mode = True
# if is_dev_mode:
#     df_train = df_train.sample(10000, random_state=42)
#     df_test = df_test[:10]
#     df_sub = df_sub[:10]
    
df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)

# target_col = "Listening_Time_minutes"
# y_train = df_train[target_col].copy()
# df_train = df_train.drop(columns=[target_col])

display(df_train)
display(df_train.describe())
display(df_train.isna().sum())

Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,id,Episode_Num,Host_Guest_Diff,Host_Guest_Ratio,Listening_Episode_Diff,Listening_Episode_Ratio
0,0,,0,74.81,3,21,,0.0,2,31.419980,0,98,,,,
1,1,119.80,1,66.95,5,14,75.95,2.0,0,88.012410,1,26,-9.00,0.881501,31.787590,1.361172
2,2,73.90,2,69.97,1,17,8.97,0.0,0,44.925310,2,16,61.00,7.800446,28.974690,1.644952
3,3,67.17,3,57.22,0,10,78.70,2.0,2,46.278240,3,45,-21.48,0.727065,20.891760,1.451438
4,4,110.51,4,80.07,0,14,58.68,3.0,1,75.610310,4,86,21.39,1.364519,34.899690,1.461573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047100,33,24.81,9,66.15,0,17,98.63,1.0,1,20.573795,1047100,17,-32.48,0.670688,4.236205,1.205903
1047101,11,92.15,6,89.61,5,21,25.82,2.0,0,76.198459,1047101,9,63.79,3.470565,15.951541,1.209342
1047102,23,112.27,1,26.33,5,21,55.29,0.0,1,107.602135,1047102,24,-28.96,0.476216,4.667865,1.043381
1047103,19,,8,41.47,2,14,33.58,0.0,1,17.220998,1047103,85,7.89,1.234961,,


Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,id,Episode_Num,Host_Guest_Diff,Listening_Episode_Diff
count,797105.0,705317.0,797105.0,797105.0,797105.0,797105.0,646356.0,797104.0,797105.0,797105.0,797105.0,797105.0,646356.0,705317.0
mean,23.540988,64.408705,4.554814,59.877839,3.028731,15.663856,52.095246,1.357792,0.998145,45.444668,413325.8,51.378954,7.627507,18.679341
std,13.911304,32.981409,2.962341,22.88988,2.022848,4.027274,28.483819,1.149681,0.815531,27.140915,259814.6,28.131239,36.15216,13.566281
min,0.0,0.0,0.0,1.3,0.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,-80.17,-115.54
25%,12.0,35.67,2.0,39.45,1.0,14.0,28.1,0.0,0.0,23.18422,199276.0,28.0,-18.28,8.13
50%,23.0,63.77,5.0,60.06,3.0,17.0,53.35,1.0,1.0,43.39227,398552.0,52.0,6.64,15.64375
75%,36.0,94.0,7.0,79.56,5.0,21.0,76.49,2.0,2.0,64.81462,597828.0,75.0,33.0,26.68309
max,47.0,121.0,9.0,119.46,6.0,21.0,119.91,103.91,2.0,119.97,1047104.0,100.0,113.55,103.22044


Podcast_Name                        0
Episode_Length_minutes          91788
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    150749
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
id                                  0
Episode_Num                         0
Host_Guest_Diff                150749
Host_Guest_Ratio               150752
Listening_Episode_Diff          91788
Listening_Episode_Ratio        100172
dtype: int64

In [3]:
from sklearn.metrics import mean_squared_error

def calc_rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse

calc_rmse([89.693310], [69.530000])

20.163309999999996

In [None]:
cols_to_compare = ['Podcast_Name', 'Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage']

df_dup = df_train.copy()
df_dup = df_dup.dropna(subset=['Guest_Popularity_percentage'])
df_dup = df_dup[df_dup.duplicated(subset=cols_to_compare, keep=False)]
df_dup = df_dup.sort_values(cols_to_compare + ["Listening_Time_minutes"])
df_dup

Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,id,Episode_Num,Host_Guest_Diff,Host_Guest_Ratio,Listening_Episode_Diff,Listening_Episode_Ratio
246881,40,20.66,9,20.78,6,14,87.61,1.0,1,18.808360,246881,1,-66.83,0.237188,1.851640,1.098448
283318,40,20.08,9,20.78,6,14,87.61,1.0,1,18.808360,283318,1,-66.83,0.237188,1.271640,1.06761
541077,40,20.89,9,20.78,6,14,87.61,3.0,1,18.808360,541077,1,-66.83,0.237188,2.081640,1.110676
1006010,40,20.66,9,20.78,6,14,87.61,3.0,1,18.808362,1006010,1,-66.83,0.237188,1.851638,1.098448
373925,17,29.81,1,21.46,3,21,39.46,0.0,2,28.280210,373925,1,-18.00,0.543842,1.529790,1.054094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687822,10,,7,97.88,4,14,73.99,0.0,2,28.798980,687822,100,23.89,1.322881,,
1044663,39,33.13,0,98.43,3,17,7.35,1.0,2,26.427851,1044663,100,91.08,13.391837,6.702149,1.253602
1044937,39,33.13,0,98.43,3,17,7.35,1.0,2,26.427851,1044937,100,91.08,13.391837,6.702149,1.253602
110000,20,62.18,3,98.79,5,17,59.27,2.0,1,43.038940,110000,100,39.52,1.666779,19.141060,1.444738


In [64]:
cols_to_compare = ['Podcast_Name', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time', "Episode_Sentiment"]

df_dup2 = df_train.copy()
# df_dup = df_dup.dropna(subset=['Guest_Popularity_percentage'])
df_dup2 = df_dup2[df_dup2.duplicated(subset=cols_to_compare, keep=False)]
df_dup2 = df_dup2.sort_values(cols_to_compare + ["Listening_Time_minutes"])
df_dup2

Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,id,Episode_Num,Host_Guest_Diff,Host_Guest_Ratio,Listening_Episode_Diff,Listening_Episode_Ratio
263468,0,19.30,9,21.51,1,10,96.10,0.0,0,18.236090,263468,1,-74.59,0.223829,1.063910,1.058341
1043473,0,19.30,0,21.51,1,10,96.10,3.0,0,18.236093,1043473,1,-74.59,0.223829,1.063907,1.058341
163092,0,93.78,0,68.03,5,10,17.16,1.0,1,71.796010,163092,1,50.87,3.964452,21.983990,1.306201
348103,0,96.02,0,68.03,5,10,17.16,1.0,1,71.796010,348103,1,50.87,3.964452,24.223990,1.3374
148438,0,78.76,0,40.72,5,14,27.72,1.0,2,75.285590,148438,2,13.00,1.468975,3.474410,1.04615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
532675,47,,6,80.83,5,14,7.62,1.0,1,30.924990,532675,99,73.21,10.607612,,
89908,47,96.14,6,83.30,2,10,49.20,1.0,2,86.602180,89908,99,34.10,1.693089,9.537820,1.110134
413021,47,102.36,6,83.30,2,10,5.61,0.0,2,86.602180,413021,99,77.69,14.848485,15.757820,1.181956
383735,47,39.88,6,29.88,5,21,55.83,0.0,0,36.783300,383735,100,-25.95,0.535196,3.096700,1.084188


In [65]:
# # Get IDs from each dataframe
# ids_df_dup = set(df_dup['id'])
# ids_df_dup2 = set(df_dup2['id'])

# # Find common IDs
# common_ids = ids_df_dup.intersection(ids_df_dup2)

# # Find IDs unique to each dataframe
# unique_to_df_dup = ids_df_dup - common_ids
# unique_to_df_dup2 = ids_df_dup2 - common_ids

# # Print results
# print(f"IDs unique to df_dup: {sorted(list(unique_to_df_dup))}")
# print(f"Count: {len(unique_to_df_dup)}")
# print("\n")
# print(f"IDs unique to df_dup2: {sorted(list(unique_to_df_dup2))}")
# print(f"Count: {len(unique_to_df_dup2)}")

# # If you want to see the full rows with unique IDs from df_dup:
# unique_rows_df_dup = df_dup[df_dup['id'].isin(unique_to_df_dup)]

# # If you want to see the full rows with unique IDs from df_dup2:
# unique_rows_df_dup2 = df_dup2[df_dup2['id'].isin(unique_to_df_dup2)]

# # Display the dataframes with unique IDs if needed
# print("\nSample of unique rows in df_dup:")
# display(unique_rows_df_dup)

# print("\nSample of unique rows in df_dup2:")
# display(unique_rows_df_dup2)

In [66]:
ltm_f = df_dup.drop_duplicates(subset=cols_to_compare, keep='first')["Listening_Time_minutes"]
ltm_l = df_dup.drop_duplicates(subset=cols_to_compare, keep='last')["Listening_Time_minutes"]
calc_rmse(ltm_f, ltm_l)

0.5388207095798875

In [67]:
ltm_f = df_dup2.drop_duplicates(subset=cols_to_compare, keep='first')["Listening_Time_minutes"]
ltm_l = df_dup2.drop_duplicates(subset=cols_to_compare, keep='last')["Listening_Time_minutes"]
calc_rmse(ltm_f, ltm_l)

3.816022748415559

In [None]:
x = 110
df_dup.iloc[x*30 : (x+1)*30][["Podcast_Name", "Episode_Num", "Host_Popularity_percentage", "Guest_Popularity_percentage", "Listening_Time_minutes", "Episode_Length_minutes", "Publication_Day", "Publication_Time", "Episode_Sentiment"]]

In [None]:
grouped = df_train.groupby(cols_to_compare)
result = grouped.filter(lambda x: x['Listening_Time_minutes'].nunique() > 1)
result = result.sort_values(cols_to_compare)
result

In [None]:
x = 110
result.iloc[x*30 : (x+1)*30][["Podcast_Name", "Episode_Num", "Host_Popularity_percentage", "Guest_Popularity_percentage", "Listening_Time_minutes", "Episode_Length_minutes", "Publication_Day", "Publication_Time", "Episode_Sentiment"]]

In [None]:
# cols_to_compare = ['Podcast_Name', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time']

# df_test_with_id = df_test.copy()
# # df_test_with_id = df_test_with_id.dropna(subset=['Guest_Popularity_percentage'])

# leaked_rows = df_test_with_id.merge(
#     df_train[cols_to_compare + ['Listening_Time_minutes']].drop_duplicates(),
#     on=cols_to_compare,
#     how='inner'
# )

# mean_values = leaked_rows.groupby('id')['Listening_Time_minutes'].mean().reset_index()

# display(subm.loc[subm['id'].isin(mean_values['id'].values)])
# subm.loc[subm['id'].isin(mean_values['id'].values), "Listening_Time_minutes"] = mean_values["Listening_Time_minutes"].values
# display(subm.loc[subm['id'].isin(mean_values['id'].values)])

KeyError: 'id'

In [None]:
df_train['Listening_Time_minutes_rounded'] = df_train['Listening_Time_minutes'].round(1)

grouped = df_train.groupby(cols_to_compare)
unique_counts = grouped['Listening_Time_minutes_rounded'].nunique()

groups_with_diff = unique_counts[unique_counts > 1].index
result = df_train[df_train.set_index(cols_to_compare).index.isin(groups_with_diff)]

result = result.sort_values(cols_to_compare)
result

In [None]:
x = 0
result.iloc[x*30 : (x+1)*30][["Podcast_Name", "Episode_Num", "Host_Popularity_percentage", "Guest_Popularity_percentage", "Listening_Time_minutes", "Episode_Length_minutes", "Publication_Day", "Publication_Time", "Episode_Sentiment"]]

In [None]:
# Drop duplicate get first by Podcast_Name and Episode_Num
result_f = result.drop_duplicates(subset=['Podcast_Name', 'Episode_Num'], keep='first')["Listening_Time_minutes"]
result_l = result.drop_duplicates(subset=['Podcast_Name', 'Episode_Num'], keep='last')["Listening_Time_minutes"]
calc_rmse(result_f, result_l)

In [None]:
grouped.diff()