In [1]:
import wandb
from pathlib import Path
from dataclasses import dataclass, asdict

class WandbCallback:
    def __init__(self, log_every=50):
        self.log_every = log_every
        self.iteration = 0
        
    def __call__(self, env):
        # This gets called after each iteration
        if self.iteration % self.log_every == 0:
            # Log metrics to wandb
            metrics = {}
            for dataset_name, eval_name, value, _ in env.evaluation_result_list:
                metric_name = f"{dataset_name}/{eval_name}"
                metrics[metric_name] = value
            
            wandb.log(metrics, step=self.iteration)
        
        self.iteration += 1
        return False
    
@dataclass
class CFG:
    train_path: Path = Path("./data/train.csv")
    test_path: Path = Path("./data/test.csv")
    sub_path: Path = Path("./data/sample_submission.csv")

    num_fold: int = 5
    dev_mode: bool = False

    # Model parameters
    n_iter: int = 10000
    max_depth: int = -1
    num_leaves: int = 1024
    colsample_bytree: float = 0.7
    learning_rate: float = 0.04

    objective: str = 'l2'
    metric: str = 'rmse'
    verbosity: int = -1
    
    random_state: int = 42
    shuffle: bool = True
    encoded_columns_start: int = -91
    log_eval: int = 100
    early_stopping: int = 200
    
cfg = CFG() 
asdict(cfg)

{'train_path': PosixPath('data/train.csv'),
 'test_path': PosixPath('data/test.csv'),
 'sub_path': PosixPath('data/sample_submission.csv'),
 'num_fold': 5,
 'dev_mode': False,
 'n_iter': 10000,
 'max_depth': -1,
 'num_leaves': 1024,
 'colsample_bytree': 0.7,
 'learning_rate': 0.04,
 'objective': 'l2',
 'metric': 'rmse',
 'verbosity': -1,
 'random_state': 42,
 'shuffle': True,
 'encoded_columns_start': -91,
 'log_eval': 100,
 'early_stopping': 200}

In [None]:
from IPython.display import display
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

def calc_rmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse

re_dict = {}
re_dict['podc_dict'] = {
    'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 
    'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 
    'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 
    'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 
    'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 
    'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 
    'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 
    'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 
    'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 
    'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 
    'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 
    'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 
    'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47
}
re_dict['genr_dict'] = {'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}
re_dict['week_dict'] = {'Monday': 0, 'Tuesday': 1, 'Wednesday': 2, 'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6}
re_dict['time_dict'] = {'Morning': 10, 'Afternoon': 14, 'Evening': 17, 'Night': 21}
re_dict['sent_dict'] = {'Negative': 0, 'Neutral': 1, 'Positive': 2}


def preprocess(df):
    df['Episode_Num'] = df['Episode_Title'].str[8:].astype(int)  # Convert to int before log transform
    df = df.drop(columns=['Episode_Title'])

    # Convert categorical variables
    df['Genre'] = df['Genre'].replace(re_dict["genr_dict"])
    df['Podcast_Name'] = df['Podcast_Name'].replace(re_dict["podc_dict"])
    df['Publication_Day'] = df['Publication_Day'].replace(re_dict["week_dict"])
    df['Publication_Time'] = df['Publication_Time'].replace(re_dict["time_dict"])
    df['Episode_Sentiment'] = df['Episode_Sentiment'].replace(re_dict["sent_dict"])

    df.loc[df['Episode_Length_minutes']>121.0, 'Episode_Length_minutes'] = 121.0
    df.loc[df["Number_of_Ads"] > 103.91, "Number_of_Ads"] = 103.91

    # Define categorical columns
    df["Episode_Length_minutes_NaN"] = df["Episode_Length_minutes"].isna().astype(int)
    df["Guest_Popularity_percentage_NaN"] = df["Guest_Popularity_percentage"].isna().astype(int)

    # Replacing null values by median
    df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].median(), inplace=True)
    df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].median(), inplace=True)
    
    return df


df_train = pd.read_csv(cfg.train_path, index_col='id')
df_test = pd.read_csv(cfg.test_path, index_col='id')
df_sub = pd.read_csv(cfg.sub_path, index_col='id')

is_dev_mode = False
is_dev_mode = True
if is_dev_mode:
    df_train = df_train.sample(100, random_state=42)
    df_test = df_test[:10]
    df_sub = df_sub[:10]
    
df_train = preprocess(df_train)
df_test = preprocess(df_test)

df_train_desc = df_train.describe()

target_col = "Listening_Time_minutes"
y_train = df_train[target_col].copy()
df_train = df_train.drop(columns=[target_col])

df_desc = df_train.describe()

def feature_eng(df, df_desc=df_desc):
    # Better capture cyclical nature of day and time
    df['Day_sin'] = np.sin(2 * np.pi * df['Publication_Day'] / 7)
    df['Day_cos'] = np.cos(2 * np.pi * df['Publication_Day'] / 7)
    df['Time_sin'] = np.sin(2 * np.pi * df['Publication_Time'] / 4)
    df['Time_cos'] = np.cos(2 * np.pi * df['Publication_Time'] / 4)

    # Higher frequency sinusoidal features for day and time
    df['Day_sin2'] = np.sin(4 * np.pi * df['Publication_Day'] / 7)
    df['Day_cos2'] = np.cos(4 * np.pi * df['Publication_Day'] / 7)
    df['Time_sin2'] = np.sin(4 * np.pi * df['Publication_Time'] / 24)
    df['Time_cos2'] = np.cos(4 * np.pi * df['Publication_Time'] / 24)

    df['Length_per_Ads'] = (df['Episode_Length_minutes'] / (df['Number_of_Ads'] + 1)).fillna(0)
    
    groups = ["Podcast_Name", "Episode_Length_minutes_NaN", "Guest_Popularity_percentage_NaN", "Publication_Day", "Publication_Time", "Genre"]
    for group in groups:
        numeric_cols = ["Episode_Num", "Episode_Length_minutes", "Number_of_Ads", "Host_Popularity_percentage", "Guest_Popularity_percentage"]
        for col in numeric_cols:
            df[f"{group}_{col}_norm"] = df.groupby(group)[col].transform(lambda x: (x - x.min()) / (x.max() - x.min() + 1e-8))

    df['Podcast_Name'] = df['Podcast_Name'].astype('category')
    df['Genre'] = df['Genre'].astype('category')
    df['Publication_Day'] = df['Publication_Day'].astype('category')
    df['Publication_Time'] = df['Publication_Time'].astype('category')
    df['Episode_Sentiment'] = df['Episode_Sentiment'].astype('category')
    df['Episode_Num'] = df['Episode_Num'].astype('category')

    return df

df_train = feature_eng(df_train)
df_test = feature_eng(df_test)

print(df_train.columns)
display(df_train)
display(df_train_desc)

Index(['Podcast_Name', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Episode_Num', 'Episode_Length_minutes_NaN',
       'Guest_Popularity_percentage_NaN', 'Day_sin', 'Day_cos', 'Time_sin',
       'Time_cos', 'Day_sin2', 'Day_cos2', 'Time_sin2', 'Time_cos2',
       'Length_per_Ads', 'Podcast_Name_Episode_Num_norm',
       'Podcast_Name_Episode_Length_minutes_norm',
       'Podcast_Name_Number_of_Ads_norm',
       'Podcast_Name_Host_Popularity_percentage_norm',
       'Podcast_Name_Guest_Popularity_percentage_norm',
       'Episode_Length_minutes_NaN_Episode_Num_norm',
       'Episode_Length_minutes_NaN_Episode_Length_minutes_norm',
       'Episode_Length_minutes_NaN_Number_of_Ads_norm',
       'Episode_Length_minutes_NaN_Host_Popularity_percentage_norm',
       'Episode_Length_minutes_NaN_Guest_Popularity_percentage_norm',
       'Guest_Pop

Unnamed: 0_level_0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Num,...,Publication_Time_Episode_Num_norm,Publication_Time_Episode_Length_minutes_norm,Publication_Time_Number_of_Ads_norm,Publication_Time_Host_Popularity_percentage_norm,Publication_Time_Guest_Popularity_percentage_norm,Genre_Episode_Num_norm,Genre_Episode_Length_minutes_norm,Genre_Number_of_Ads_norm,Genre_Host_Popularity_percentage_norm,Genre_Guest_Popularity_percentage_norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,63.84,0,74.81,3,21,53.58,0.0,2,98,...,0.979798,0.524757,0.000000,0.733882,0.446835,0.979798,0.518065,0.000000,0.661115,0.499162
1,1,119.80,1,66.95,5,14,75.95,2.0,0,26,...,0.252525,0.990083,0.019370,0.553828,0.662017,0.252525,0.998476,0.037439,0.552964,0.720015
2,2,73.90,2,69.97,1,17,8.97,0.0,0,16,...,0.151515,0.608084,0.000000,0.505105,0.081442,0.151515,0.599235,0.000000,0.694360,0.077502
3,3,67.17,3,57.22,0,10,78.70,2.0,2,45,...,0.444444,0.536272,0.019247,0.476361,0.680650,0.444444,0.551276,0.666667,0.391849,0.656297
4,4,110.51,4,80.07,0,14,58.68,3.0,1,86,...,0.858586,0.913306,0.029056,0.665307,0.511464,0.858586,0.914276,0.028871,0.664966,0.586859
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749995,36,75.66,2,69.36,5,10,53.58,0.0,0,25,...,0.242424,0.609506,0.000000,0.579777,0.463368,0.242424,0.614542,0.000000,0.688149,0.463368
749996,19,75.75,8,35.21,5,21,53.58,2.0,1,21,...,0.202020,0.625561,0.666667,0.336212,0.446835,0.202020,0.615271,0.666667,0.302932,0.535807
749997,37,30.98,9,78.58,3,10,84.89,0.0,0,51,...,0.505051,0.224101,0.000000,0.658318,0.734193,0.505051,0.256033,0.000000,0.603047,0.738922
749998,28,108.98,9,45.39,3,10,93.27,0.0,0,47,...,0.464646,0.896921,0.000000,0.375586,0.806678,0.464646,0.900661,0.000000,0.261375,0.811874


Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,Episode_Num,Episode_Length_minutes_NaN,Guest_Popularity_percentage_NaN
count,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,750000.0,749999.0,750000.0,750000.0,750000.0,750000.0,750000.0
mean,23.540307,64.427274,4.556036,59.859901,3.030805,15.6715,52.498047,1.348855,0.997969,45.437406,51.445811,0.116124,0.194707
std,13.917884,30.995602,2.965912,22.873098,2.024196,4.026379,25.537152,1.15113,0.81544,27.138306,28.085623,0.320374,0.395975
min,0.0,0.0,0.0,1.3,0.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,12.0,39.42,2.0,39.41,1.0,14.0,34.55,0.0,0.0,23.17835,28.0,0.0,0.0
50%,23.0,63.84,5.0,60.05,3.0,17.0,53.58,1.0,1.0,43.37946,52.0,0.0,0.0
75%,36.0,90.31,7.0,79.53,5.0,21.0,71.04,2.0,2.0,64.81158,75.0,0.0,0.0
max,47.0,121.0,9.0,119.46,6.0,21.0,119.91,103.91,2.0,119.97,100.0,1.0,1.0


In [28]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import wandb
import os
from dotenv import load_dotenv

load_dotenv()
wandb.login(key=os.getenv("WANDB_API_KEY"))
wandb.init(project="playground-series-s5e4", config=asdict(cfg))

X = df_train.copy()
y = y_train.copy()
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, 
    test_size=0.2,  # 20% for validation
    random_state=42
)

X_test = df_test[X.columns].copy()

# # Target encoding if needed
# encoded_columns = df_train.columns[cfg.encoded_columns_start:]
# encoder = TargetEncoder(random_state=cfg.random_state)

# X_train[encoded_columns] = encoder.fit_transform(X_train[encoded_columns], y_train)
# X_valid[encoded_columns] = encoder.transform(X_valid[encoded_columns])
# X_test[encoded_columns] = encoder.transform(X_test[encoded_columns])

# Initialize the model
model = lgb.LGBMRegressor(
    n_iter=cfg.n_iter,
    max_depth=cfg.max_depth,
    num_leaves=cfg.num_leaves,
    colsample_bytree=cfg.colsample_bytree,
    learning_rate=cfg.learning_rate,
    objective=cfg.objective,
    metric=cfg.metric, 
    verbosity=cfg.verbosity,
    random_state=42,
)

# Train model with validation
model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    callbacks=[
        lgb.log_evaluation(cfg.log_eval), 
        lgb.early_stopping(cfg.early_stopping),
        WandbCallback(log_every=10)
    ],
)

# Calculate validation score
val_score = model.best_score_['valid_1'][cfg.metric]
print(f"Validation score: {val_score}")
wandb.finish()
gc.collect()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/masaishi/.netrc


Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 11.238	valid_1's rmse: 12.8506
[200]	training's rmse: 10.1958	valid_1's rmse: 12.7589
[300]	training's rmse: 9.52104	valid_1's rmse: 12.7196
[400]	training's rmse: 8.98666	valid_1's rmse: 12.6952
[500]	training's rmse: 8.48857	valid_1's rmse: 12.6818
[600]	training's rmse: 8.02946	valid_1's rmse: 12.6687
[700]	training's rmse: 7.61932	valid_1's rmse: 12.656
[800]	training's rmse: 7.25943	valid_1's rmse: 12.6453
[900]	training's rmse: 6.90556	valid_1's rmse: 12.6338
[1000]	training's rmse: 6.57584	valid_1's rmse: 12.6243
[1100]	training's rmse: 6.27619	valid_1's rmse: 12.6185
[1200]	training's rmse: 5.99779	valid_1's rmse: 12.6147
[1300]	training's rmse: 5.74852	valid_1's rmse: 12.6104
[1400]	training's rmse: 5.49727	valid_1's rmse: 12.6059
[1500]	training's rmse: 5.2666	valid_1's rmse: 12.6034
[1600]	training's rmse: 5.05172	valid_1's rmse: 12.6007
[1700]	training's rmse: 4.8416	valid_1's rmse: 12.5977

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Validation score: 12.576538332048317


0,1
training/rmse,██▇▇▇▆▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁
valid_1/rmse,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
training/rmse,2.12979
valid_1/rmse,12.57678


338

In [None]:
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# import lightgbm as lgb
# import gc
# import wandb
# import os
# import pandas as pd
# import numpy as np
# from dataclasses import asdict
# from dotenv import load_dotenv

# # Load environment variables and initialize wandb
# load_dotenv()
# wandb.login(key=os.getenv("WANDB_API_KEY"))
# wandb.init(project="playground-series-s5e4", config=asdict(cfg))

# # Create copies of the data
# X = df_train.copy()
# y = y_train.copy()

# # Identify categorical columns
# categorical_columns = X.select_dtypes(include=['category', 'object']).columns.tolist()

# encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# # Fit and transform the categorical columns
# encoded_features = encoder.fit_transform(X[categorical_columns])

# # Create DataFrame with encoded features
# encoded_df = pd.DataFrame(
#     encoded_features,
#     columns=encoder.get_feature_names_out(categorical_columns)
# )

# # Drop original categorical columns and add encoded features
# X_encoded = X.drop(columns=categorical_columns).reset_index(drop=True)
# X_encoded = pd.concat([X_encoded, encoded_df], axis=1)

# # Do the same transformation for test data
# encoded_features_test = encoder.transform(df_test[categorical_columns])
# encoded_df_test = pd.DataFrame(
#     encoded_features_test,
#     columns=encoder.get_feature_names_out(categorical_columns)
# )
# X_test = df_test.drop(columns=categorical_columns).reset_index(drop=True)
# X_test = pd.concat([X_test, encoded_df_test], axis=1)

# # Use the encoded data for train/valid split
# X_train, X_valid, y_train, y_valid = train_test_split(
#     X_encoded, y, test_size=0.2, random_state=42
# )
# # Initialize the model
# model = lgb.LGBMRegressor(
#     n_iter=cfg.n_iter,
#     max_depth=cfg.max_depth,
#     num_leaves=cfg.num_leaves,
#     colsample_bytree=cfg.colsample_bytree,
#     learning_rate=cfg.learning_rate,
#     objective=cfg.objective,
#     metric=cfg.metric,
#     verbosity=cfg.verbosity,
#     random_state=42,
# )

# # Train model with validation
# model.fit(
#     X_train, y_train,
#     eval_set=[(X_train, y_train), (X_valid, y_valid)],
#     callbacks=[
#         lgb.log_evaluation(cfg.log_eval),
#         lgb.early_stopping(cfg.early_stopping),
#         WandbCallback(log_every=10)
#     ],
# )

# # Calculate validation score
# val_score = model.best_score_['valid_1'][cfg.metric]  # Fixed typo here
# print(f"Validation score: {val_score}")
# wandb.finish()
# gc.collect()