In [2]:
from pathlib import Path
from dataclasses import dataclass, asdict

@dataclass
class CFG:
    train_path: Path = Path("../data/train.csv")
    test_path: Path = Path("../data/test.csv")
    sub_path: Path = Path("../data/sample_submission.csv")

    num_fold: int = 5
    dev_mode: bool = False

    # Model parameters
    n_iter: int = 10000
    max_depth: int = -1
    num_leaves: int = 1024
    colsample_bytree: float = 0.7
    learning_rate: float = 0.02

    objective: str = 'l2'
    metric: str = 'rmse'
    verbosity: int = -1
    max_bin: int = 1024
    
    random_state: int = 42
    shuffle: bool = True
    encoded_columns_start: int = -91
    log_eval: int = 100
    early_stopping: int = 200
    
cfg = CFG() 
asdict(cfg)

{'train_path': PosixPath('../data/train.csv'),
 'test_path': PosixPath('../data/test.csv'),
 'sub_path': PosixPath('../data/sample_submission.csv'),
 'num_fold': 5,
 'dev_mode': False,
 'n_iter': 10000,
 'max_depth': -1,
 'num_leaves': 1024,
 'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'objective': 'l2',
 'metric': 'rmse',
 'verbosity': -1,
 'max_bin': 1024,
 'random_state': 42,
 'shuffle': True,
 'encoded_columns_start': -91,
 'log_eval': 100,
 'early_stopping': 200}

In [3]:
from IPython.display import display
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from sklearn.metrics import mean_squared_error

def calc_rmse(actual, predicted):
    return np.sqrt(mean_squared_error(actual, predicted))

re_dict = {}
re_dict['podc_dict'] = {
    'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 
    'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 
    'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 
    'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 
    'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 
    'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 
    'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 
    'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 
    'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 
    'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 
    'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 
    'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 
    'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47
}
re_dict['genr_dict'] = {'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
re_dict['week_dict'] = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
re_dict['time_dict'] = {'Morning': 10, 'Afternoon': 14, 'Evening': 17, 'Night': 21}
re_dict['sent_dict'] = {'Negative': 0, 'Neutral': 1, 'Positive': 2}


def preprocess_df(df):
    df['Episode_Num'] = df['Episode_Title'].str[8:].astype(int)  # Convert to int before log transform
    df = df.drop(columns=['Episode_Title'])

    # Convert categorical variables
    df['Genre'] = df['Genre'].replace(re_dict["genr_dict"])
    df['Podcast_Name'] = df['Podcast_Name'].replace(re_dict["podc_dict"])
    df['Publication_Day'] = df['Publication_Day'].replace(re_dict["week_dict"])
    df['Publication_Time'] = df['Publication_Time'].replace(re_dict["time_dict"])
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(re_dict["sent_dict"])

    df.loc[df['Episode_Length_minutes']>121.0, 'Episode_Length_minutes'] = 121.0

    df['Host_Guest_Diff'] = df['Host_Popularity_percentage'] - df['Guest_Popularity_percentage']
    df['Host_Guest_Ratio'] = (df['Host_Popularity_percentage'] / df['Guest_Popularity_percentage']).replace([float('inf'), -float('inf')], pd.NA)

    if "Listening_Time_minutes" in df.columns:
        df['Listening_Episode_Diff'] = df['Episode_Length_minutes'] - df['Listening_Time_minutes']
        df['Listening_Episode_Ratio'] = (df['Episode_Length_minutes'] / df['Listening_Time_minutes']).replace([float('inf'), -float('inf')], pd.NA)

    return df


df_train = pd.read_csv(cfg.train_path, index_col='id')
df_test = pd.read_csv(cfg.test_path, index_col='id')
df_sub = pd.read_csv(cfg.sub_path, index_col='id')

df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)

# target_col = "Listening_Time_minutes"
# y_train = df_train[target_col].copy()
# df_train = df_train.drop(columns=[target_col])

# df_desc = df_train.describe()

# def feature_eng(df, df_desc=df_desc):
#     for col in df_desc.columns:
#         if df_desc[col]['std'] > 0:
#             df[col + '_log'] = df[col].apply(lambda x: np.log1p(x) if x > 0 else 0)
#             df[col + '_sqrt'] = df[col].apply(lambda x: np.sqrt(x) if x > 0 else 0)
#             df[col + '_exp'] = df[col].apply(lambda x: np.exp(x) if x > 0 else 0)

#     return df

# df_train = feature_eng(df_train)
# df_test = feature_eng(df_test)

display(df_train)
display(df_train.describe())

Unnamed: 0_level_0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Host_Guest_Diff,Host_Guest_Ratio,Listening_Episode_Diff,Listening_Episode_Ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0,,0,74.81,3,21,,0.0,2,31.41998,98,,,,
1,1,119.80,1,66.95,5,14,75.95,2.0,0,88.01241,26,-9.00,0.881501,31.78759,1.361172
2,2,73.90,2,69.97,1,17,8.97,0.0,0,44.92531,16,61.00,7.800446,28.97469,1.644952
3,3,67.17,3,57.22,0,10,78.70,2.0,2,46.27824,45,-21.48,0.727065,20.89176,1.451438
4,4,110.51,4,80.07,0,14,58.68,3.0,1,75.61031,86,21.39,1.364519,34.89969,1.461573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,36,75.66,2,69.36,5,10,,0.0,0,56.87058,25,,,18.78942,1.330389
749996,19,75.75,8,35.21,5,21,,2.0,1,45.46242,21,,,30.28758,1.666211
749997,37,30.98,9,78.58,3,10,84.89,0.0,0,15.26000,51,-6.31,0.925669,15.72000,2.030144
749998,28,108.98,9,45.39,3,10,93.27,0.0,0,100.72939,47,-47.88,0.486652,8.25061,1.081909


Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Host_Guest_Diff,Listening_Episode_Diff
count,750000.0,662907.0,750000.0,750000.0,750000.0,750000.0,603970.0,749999.0,750000.0,750000.0,750000.0,603970.0,662907.0
mean,23.540307,64.50443,4.556036,59.859901,3.030805,15.6715,52.236449,1.348855,0.997969,45.437406,51.445811,7.456403,18.766443
std,13.917884,32.968121,2.965912,22.873098,2.024196,4.026379,28.451241,1.15113,0.81544,27.138306,28.085623,36.090841,13.494582
min,0.0,0.0,0.0,1.3,0.0,10.0,0.0,0.0,0.0,0.0,1.0,-80.17,-115.54
25%,12.0,35.73,2.0,39.41,1.0,14.0,28.38,0.0,0.0,23.17835,28.0,-18.39,8.27353
50%,23.0,63.84,5.0,60.05,3.0,17.0,53.58,1.0,1.0,43.37946,52.0,6.39,15.71305
75%,36.0,94.07,7.0,79.53,5.0,21.0,76.6,2.0,2.0,64.81158,75.0,32.73,26.71
max,47.0,121.0,9.0,119.46,6.0,21.0,119.91,103.91,2.0,119.97,100.0,113.55,103.22044


In [4]:
df_over_ads = df_train[df_train["Number_of_Ads"] > 3]
df_over_ads["Pred_Listening_Time_minutes"] = df_over_ads["Number_of_Ads"] * 0.993
print("RMSE score:", calc_rmse(df_over_ads["Listening_Time_minutes"], df_over_ads["Number_of_Ads"]))


RMSE score: 2.313134147390121


In [5]:
df_train['Host_Popularity_percentage'].nunique()

8038

In [None]:
cols_to_compare 
grouped = df_train.groupby(cols_to_compare)
result = grouped.filter(lambda x: x['Listening_Time_minutes'].nunique() > 1)
result = result.sort_values(cols_to_compare)
result