In [4]:
# import libraries
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


# Render up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Column the model is trained to predict.
TARGET = 'Listening_Time_minutes'

# Columns treated as categorical. 'Episode_Num' is not in the raw file; it is
# derived from 'Episode_Title' by feature_eng() further down.
CATS = [
    'Podcast_Name',
    'Episode_Num',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# Numeric columns (median-imputed later in the notebook).
NUMS = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
]

# Label for this experiment's out-of-fold predictions.
oof_pred_name = '2+3+4_interact_xgb'

In [5]:
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter would also hide any deprecation /
# FutureWarning messages emitted by the cells below - consider narrowing it.
warnings.simplefilter('ignore')

In [8]:
# Competition files are not used in this experiment (kept for reference):
# train = pd.read_csv('../data/raw/train.csv', index_col='id')
# test = pd.read_csv('../data/raw/test.csv', index_col='id')

# Work from the original podcast dataset instead.
original = pd.read_csv('../data/raw/podcast_dataset.csv')

# Keep only rows that have a target value, then remove exact duplicate rows.
original_clean = (
    original
    .dropna(subset=[TARGET])
    .drop_duplicates()
)

# Separate the target from the predictors.
y = original_clean[TARGET]
X = original_clean.drop(columns=[TARGET])

# 80/20 hold-out split with a fixed seed.
# Note: `train` / `test` therefore do NOT contain the TARGET column (it lives
# in `y_train` / `y_test`) and they keep the row labels of `original_clean`
# rather than a fresh 0..n-1 index.
train, test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Train shape: {train.shape}")
print(f"Test  shape: {test.shape}")
print(f"Orig  shape: {original.shape}")
print(f"Orig_clean  shape: {original_clean.shape}")
train.head(3)

Train shape: (35894, 10)
Test  shape: (8974, 10)
Orig  shape: (52500, 11)
Orig_clean  shape: (44868, 11)


Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
5825,Comedy Corner,Episode 76,69.26,Comedy,38.02,Monday,Afternoon,,3,Positive
32187,Laugh Line,Episode 96,5.21,Comedy,60.21,Thursday,Evening,32.01,1,Neutral
1738,Athlete's Arena,Episode 28,108.0,Sports,97.29,Sunday,Afternoon,0.08,3,Positive


In [9]:
# Summary statistics of the numeric training columns. A `count` below the
# number of rows means that column has missing values.
train.describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads
count,32298.0,35894.0,32320.0,35894.0
mean,62.875148,60.161693,50.049801,1.503037
std,33.129163,23.117946,28.845242,1.1162
min,5.0,20.0,0.01,0.0
25%,34.4,40.02,25.11,1.0
50%,62.87,60.32,50.06,2.0
75%,91.61,80.2775,75.08,3.0
max,120.0,99.99,100.0,3.0


Add Features

In [10]:
def feature_eng(df: pd.DataFrame) -> pd.DataFrame:
    """Derive episode-number and weekend features from the raw columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing 'Episode_Title' and 'Publication_Day'.

    Returns
    -------
    pd.DataFrame
        A new frame (the input is not modified) with:
        * 'Episode_Num' - everything after the first 8 characters of
          'Episode_Title', kept as a string;
        * 'is_weekend'  - 1 when 'Publication_Day' is Saturday or Sunday,
          otherwise 0;
        and with 'Episode_Title' removed.
    """
    weekend_days = ['Saturday', 'Sunday']

    enriched = df.assign(
        # Skip the first 8 characters of the title and keep the remainder.
        Episode_Num=df['Episode_Title'].str.slice(8),
        is_weekend=df['Publication_Day'].isin(weekend_days).astype(int),
    )
    return enriched.drop(columns=['Episode_Title'])

# Apply the same feature engineering to both splits.
# Not re-runnable: feature_eng() reads 'Episode_Title' and drops it, so a
# second execution of this cell raises KeyError until the data is reloaded.
train = feature_eng(train)
test = feature_eng(test)

In [11]:
# Rounded copies of the episode length at 0, 1 and 2 decimals
# ('ELm_r0', 'ELm_r1', 'ELm_r2'); ELM keeps the list of new column names.
ELM = [f'ELm_r{decimals}' for decimals in range(3)]

for decimals, col_name in enumerate(ELM):
    for frame in (train, test):
        frame[col_name] = frame['Episode_Length_minutes'].round(decimals)

In [12]:
# Sanity check: list the columns available after feature engineering.
train.columns

Index(['Podcast_Name', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Episode_Num', 'is_weekend', 'ELm_r0', 'ELm_r1', 'ELm_r2'],
      dtype='object')

In [13]:
# Names of the interaction columns created by the loop below this list;
# it is filled as each combination is processed.
encoded_columns = []

# Hand-picked column combinations. For every inner list the loop below builds
# one categorical feature by joining the string form of the listed columns
# with '_' (the new column is named '_'.join(inner_list)).
# These strings are built before the median imputation cell runs, so a missing
# numeric value appears as the literal text 'nan' inside the combined key.
selected_comb = [
    # 2-interaction (pairs of columns)
    ['Episode_Length_minutes', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Number_of_Ads'],    
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Host_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Podcast_Name'],
    ['Episode_Num', 'Podcast_Name'],  
    ['Guest_Popularity_percentage', 'Podcast_Name'],
    ['ELm_r1', 'Episode_Num'],
    ['ELm_r1', 'Host_Popularity_percentage'], 
    ['ELm_r1', 'Guest_Popularity_percentage'],
    ['ELm_r2', 'Episode_Num'],
    ['ELm_r2', 'Episode_Sentiment'],
    ['ELm_r2', 'Publication_Day'],

    
    # 3-interaction (triples of columns)
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Genre'],
    ['Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],

    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],   
    ['ELm_r1', 'Number_of_Ads', 'Episode_Sentiment'],
    ['ELm_r2', 'Number_of_Ads', 'Podcast_Name'],
    
    # 4-interaction (quadruples of columns)
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Host_Popularity_percentage', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Episode_Num', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Day', 'Genre'],    
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Publication_Day', 'Genre'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Length_minutes', 'Episode_Num', 'Publication_Time', 'Podcast_Name'],
    
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Episode_Sentiment'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Day'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Publication_Time', 'Genre'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
    ['Episode_Num', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Genre'],
    ['Episode_Num', 'Host_Popularity_percentage', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Day', 'Podcast_Name'],
    ['Host_Popularity_percentage', 'Number_of_Ads', 'Publication_Time', 'Podcast_Name'],
    
]

def _interaction_key(df, cols):
    """Return one string per row: the values of `cols` joined by '_'.

    Each column is converted with ``astype(str)`` first, so a missing value
    contributes the literal text 'nan' to the key.
    """
    key = df[cols[0]].astype(str)
    for extra in cols[1:]:
        key = key + '_' + df[extra].astype(str)
    return key


# Build one categorical interaction column per entry of `selected_comb`, on
# both splits, and record its name in `encoded_columns`.
for comb in selected_comb:
    # A combination needs at least two source columns. Anything shorter would
    # either have no column to join or overwrite the source column itself, so
    # fail fast instead of registering a name with no matching feature.
    if len(comb) < 2:
        raise ValueError(f"interaction needs at least 2 columns, got {comb!r}")

    name = '_'.join(comb)

    # Works for any number of columns (previously only 2, 3 or 4 were handled).
    train[name] = _interaction_key(train, comb).astype('category')
    test[name] = _interaction_key(test, comb).astype('category')

    # Keep the bookkeeping list free of duplicates if a combination repeats.
    if name not in encoded_columns:
        encoded_columns.append(name)

In [14]:
# Sanity check: row / column count after adding the interaction columns.
train.shape

(35894, 112)

In [15]:
# reference: https://www.kaggle.com/code/masayakawamata/imputation-strategies/
# Fill missing numeric values in both splits with the per-column medians of
# the training split, so no statistic is taken from the hold-out rows.
train_medians = train[NUMS].median()
train[NUMS] = train[NUMS].fillna(train_medians)
test[NUMS] = test[NUMS].fillna(train_medians)

In [16]:
# Model inputs: numeric columns, base categoricals, then every interaction
# column (in that order).
FEATURES = [*NUMS, *CATS, *encoded_columns]

print(f"Train Shape: {train.shape}")
print(f"Test  Shape: {test.shape}")
train.head(3)

Train Shape: (35894, 112)
Test  Shape: (8974, 112)


Unnamed: 0,Podcast_Name,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Episode_Num,is_weekend,ELm_r0,ELm_r1,ELm_r2,Episode_Length_minutes_Host_Popularity_percentage,Episode_Length_minutes_Guest_Popularity_percentage,Episode_Length_minutes_Number_of_Ads,Episode_Num_Host_Popularity_percentage,Episode_Num_Guest_Popularity_percentage,Episode_Num_Number_of_Ads,Host_Popularity_percentage_Guest_Popularity_percentage,Host_Popularity_percentage_Number_of_Ads,Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Podcast_Name,Episode_Num_Podcast_Name,Guest_Popularity_percentage_Podcast_Name,ELm_r1_Episode_Num,ELm_r1_Host_Popularity_percentage,ELm_r1_Guest_Popularity_percentage,ELm_r2_Episode_Num,ELm_r2_Episode_Sentiment,ELm_r2_Publication_Day,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage,Episode_Length_minutes_Episode_Num_Number_of_Ads,Episode_Length_minutes_Episode_Num_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Publication_Time,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Number_of_Ads_Publication_Day,Episode_Length_minutes_Episode_Sentiment_Publication_Time,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage,Episode_Num_Host_Popularity_percentage_Number_of_Ads,...,Host_Popularity_percentage_Gues
t_Popularity_percentage_Publication_Day,Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Time,Host_Popularity_percentage_Number_of_Ads_Publication_Day,Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Guest_Popularity_percentage_Number_of_Ads_Genre,ELm_r1_Number_of_Ads_Episode_Sentiment,ELm_r2_Number_of_Ads_Podcast_Name,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Publication_Day,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Publication_Time,Episode_Length_minutes_Episode_Num_Host_Popularity_percentage_Genre,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Episode_Num_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Episode_Num_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Episode_Num_Number_of_Ads_Publication_Day,Episode_Length_minutes_Episode_Num_Number_of_Ads_Publication_Time,Episode_Length_minutes_Episode_Num_Publication_Day_Publication_Time,Episode_Length_minutes_Episode_Num_Publication_Day_Genre,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Number_of_Ads,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Day,Episode_Length_minutes_Host_Popularity_percentage_Guest_Popularity_percentage_Publication_Time,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Host_Popularity_percentage_Number_of_Ads_Publication_Day,
Episode_Length_minutes_Host_Popularity_percentage_Publication_Day_Publication_Time,Episode_Length_minutes_Host_Popularity_percentage_Publication_Day_Genre,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Publication_Day,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Publication_Time,Episode_Length_minutes_Guest_Popularity_percentage_Number_of_Ads_Genre,Episode_Length_minutes_Episode_Num_Publication_Time_Podcast_Name,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage_Number_of_Ads,Episode_Num_Host_Popularity_percentage_Guest_Popularity_percentage_Episode_Sentiment,Episode_Num_Host_Popularity_percentage_Number_of_Ads_Publication_Day,Episode_Num_Host_Popularity_percentage_Number_of_Ads_Publication_Time,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Publication_Day,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Publication_Time,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Genre,Episode_Num_Host_Popularity_percentage_Publication_Day_Publication_Time,Episode_Num_Host_Popularity_percentage_Publication_Time_Genre,Episode_Num_Guest_Popularity_percentage_Number_of_Ads_Episode_Sentiment,Episode_Num_Guest_Popularity_percentage_Number_of_Ads_Genre,Episode_Num_Host_Popularity_percentage_Episode_Sentiment_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Episode_Sentiment_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Publication_Day_Podcast_Name,Host_Popularity_percentage_Number_of_Ads_Publication_Time_Podcast_Name
5825,Comedy Corner,69.26,Comedy,38.02,Monday,Afternoon,50.06,3,Positive,76,0,69.0,69.3,69.26,69.26_38.02,69.26_nan,69.26_3,76_38.02,76_nan,76_3,38.02_nan,38.02_3,38.02_Positive,69.26_Comedy Corner,76_Comedy Corner,nan_Comedy Corner,69.3_76,69.3_38.02,69.3_nan,69.26_76,69.26_Positive,69.26_Monday,69.26_76_38.02,69.26_76_nan,69.26_76_3,69.26_76_Positive,69.26_76_Monday,69.26_38.02_nan,69.26_38.02_3,69.26_38.02_Positive,69.26_38.02_Monday,69.26_38.02_Afternoon,69.26_nan_3,69.26_nan_Monday,69.26_nan_Afternoon,69.26_3_Positive,69.26_3_Monday,69.26_Positive_Afternoon,76_38.02_nan,76_38.02_3,...,38.02_nan_Monday,38.02_nan_Afternoon,38.02_3_Monday,nan_3_Positive,nan_3_Comedy,69.3_3_Positive,69.26_3_Comedy Corner,69.26_76_38.02_nan,69.26_76_38.02_3,69.26_76_38.02_Positive,69.26_76_38.02_Monday,69.26_76_38.02_Afternoon,69.26_76_38.02_Comedy,69.26_76_nan_3,69.26_76_nan_Positive,69.26_76_nan_Monday,69.26_76_nan_Afternoon,69.26_76_3_Positive,69.26_76_3_Monday,69.26_76_3_Afternoon,69.26_76_Monday_Afternoon,69.26_76_Monday_Comedy,69.26_38.02_nan_3,69.26_38.02_nan_Positive,69.26_38.02_nan_Monday,69.26_38.02_nan_Afternoon,69.26_38.02_3_Positive,69.26_38.02_3_Monday,69.26_38.02_Monday_Afternoon,69.26_38.02_Monday_Comedy,69.26_nan_3_Positive,69.26_nan_3_Monday,69.26_nan_3_Afternoon,69.26_nan_3_Comedy,69.26_76_Afternoon_Comedy Corner,76_38.02_nan_3,76_38.02_nan_Positive,76_38.02_3_Monday,76_38.02_3_Afternoon,76_38.02_Positive_Monday,76_38.02_Positive_Afternoon,76_38.02_Positive_Comedy,76_38.02_Monday_Afternoon,76_38.02_Afternoon_Comedy,76_nan_3_Positive,76_nan_3_Comedy,76_38.02_Positive_Comedy Corner,38.02_3_Positive_Comedy Corner,38.02_3_Monday_Comedy Corner,38.02_3_Afternoon_Comedy Corner
32187,Laugh Line,5.21,Comedy,60.21,Thursday,Evening,32.01,1,Neutral,96,0,5.0,5.2,5.21,5.21_60.21,5.21_32.01,5.21_1,96_60.21,96_32.01,96_1,60.21_32.01,60.21_1,60.21_Neutral,5.21_Laugh Line,96_Laugh Line,32.01_Laugh Line,5.2_96,5.2_60.21,5.2_32.01,5.21_96,5.21_Neutral,5.21_Thursday,5.21_96_60.21,5.21_96_32.01,5.21_96_1,5.21_96_Neutral,5.21_96_Thursday,5.21_60.21_32.01,5.21_60.21_1,5.21_60.21_Neutral,5.21_60.21_Thursday,5.21_60.21_Evening,5.21_32.01_1,5.21_32.01_Thursday,5.21_32.01_Evening,5.21_1_Neutral,5.21_1_Thursday,5.21_Neutral_Evening,96_60.21_32.01,96_60.21_1,...,60.21_32.01_Thursday,60.21_32.01_Evening,60.21_1_Thursday,32.01_1_Neutral,32.01_1_Comedy,5.2_1_Neutral,5.21_1_Laugh Line,5.21_96_60.21_32.01,5.21_96_60.21_1,5.21_96_60.21_Neutral,5.21_96_60.21_Thursday,5.21_96_60.21_Evening,5.21_96_60.21_Comedy,5.21_96_32.01_1,5.21_96_32.01_Neutral,5.21_96_32.01_Thursday,5.21_96_32.01_Evening,5.21_96_1_Neutral,5.21_96_1_Thursday,5.21_96_1_Evening,5.21_96_Thursday_Evening,5.21_96_Thursday_Comedy,5.21_60.21_32.01_1,5.21_60.21_32.01_Neutral,5.21_60.21_32.01_Thursday,5.21_60.21_32.01_Evening,5.21_60.21_1_Neutral,5.21_60.21_1_Thursday,5.21_60.21_Thursday_Evening,5.21_60.21_Thursday_Comedy,5.21_32.01_1_Neutral,5.21_32.01_1_Thursday,5.21_32.01_1_Evening,5.21_32.01_1_Comedy,5.21_96_Evening_Laugh Line,96_60.21_32.01_1,96_60.21_32.01_Neutral,96_60.21_1_Thursday,96_60.21_1_Evening,96_60.21_Neutral_Thursday,96_60.21_Neutral_Evening,96_60.21_Neutral_Comedy,96_60.21_Thursday_Evening,96_60.21_Evening_Comedy,96_32.01_1_Neutral,96_32.01_1_Comedy,96_60.21_Neutral_Laugh Line,60.21_1_Neutral_Laugh Line,60.21_1_Thursday_Laugh Line,60.21_1_Evening_Laugh Line
1738,Athlete's Arena,108.0,Sports,97.29,Sunday,Afternoon,0.08,3,Positive,28,1,108.0,108.0,108.0,108.0_97.29,108.0_0.08,108.0_3,28_97.29,28_0.08,28_3,97.29_0.08,97.29_3,97.29_Positive,108.0_Athlete's Arena,28_Athlete's Arena,0.08_Athlete's Arena,108.0_28,108.0_97.29,108.0_0.08,108.0_28,108.0_Positive,108.0_Sunday,108.0_28_97.29,108.0_28_0.08,108.0_28_3,108.0_28_Positive,108.0_28_Sunday,108.0_97.29_0.08,108.0_97.29_3,108.0_97.29_Positive,108.0_97.29_Sunday,108.0_97.29_Afternoon,108.0_0.08_3,108.0_0.08_Sunday,108.0_0.08_Afternoon,108.0_3_Positive,108.0_3_Sunday,108.0_Positive_Afternoon,28_97.29_0.08,28_97.29_3,...,97.29_0.08_Sunday,97.29_0.08_Afternoon,97.29_3_Sunday,0.08_3_Positive,0.08_3_Sports,108.0_3_Positive,108.0_3_Athlete's Arena,108.0_28_97.29_0.08,108.0_28_97.29_3,108.0_28_97.29_Positive,108.0_28_97.29_Sunday,108.0_28_97.29_Afternoon,108.0_28_97.29_Sports,108.0_28_0.08_3,108.0_28_0.08_Positive,108.0_28_0.08_Sunday,108.0_28_0.08_Afternoon,108.0_28_3_Positive,108.0_28_3_Sunday,108.0_28_3_Afternoon,108.0_28_Sunday_Afternoon,108.0_28_Sunday_Sports,108.0_97.29_0.08_3,108.0_97.29_0.08_Positive,108.0_97.29_0.08_Sunday,108.0_97.29_0.08_Afternoon,108.0_97.29_3_Positive,108.0_97.29_3_Sunday,108.0_97.29_Sunday_Afternoon,108.0_97.29_Sunday_Sports,108.0_0.08_3_Positive,108.0_0.08_3_Sunday,108.0_0.08_3_Afternoon,108.0_0.08_3_Sports,108.0_28_Afternoon_Athlete's Arena,28_97.29_0.08_3,28_97.29_0.08_Positive,28_97.29_3_Sunday,28_97.29_3_Afternoon,28_97.29_Positive_Sunday,28_97.29_Positive_Afternoon,28_97.29_Positive_Sports,28_97.29_Sunday_Afternoon,28_97.29_Afternoon_Sports,28_0.08_3_Positive,28_0.08_3_Sports,28_97.29_Positive_Athlete's Arena,97.29_3_Positive_Athlete's Arena,97.29_3_Sunday_Athlete's Arena,97.29_3_Afternoon_Athlete's Arena


In [17]:
# Persist the engineered frames for later experiments.
# NOTE(review): `train` holds no target column at this point and index=False
# discards the row labels, so these files cannot be re-joined with y_train /
# y_test afterwards - confirm this is intended.
for frame, out_path in (
    (train, '../data/interim/train_exp1.1.csv'),
    (test, '../data/interim/test_exp1.1.csv'),
):
    frame.to_csv(out_path, index=False)

Train XGBoost

In [19]:
from sklearn.model_selection import KFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import gc

In [20]:
def target_encode(df_train, df_val, col, target, stats='mean', prefix='TE'):
    """Add target-encoding column(s) for `col` to a copy of `df_val`.

    The statistic(s) of `target` are computed per category of `col` on
    `df_train` and mapped onto `df_val[col]`.

    Parameters
    ----------
    df_train : pd.DataFrame
        Frame holding both `col` and `target`; the only source of statistics.
    df_val : pd.DataFrame
        Frame to encode; must contain `col`. It is not modified.
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the target column in `df_train`.
    stats : str, callable, or list/tuple of these
        Aggregation(s) passed to ``GroupBy.agg``. A list/tuple yields one
        output column per entry.
    prefix : str
        Prefix of the generated column name(s).

    Returns
    -------
    pd.DataFrame
        Copy of `df_val` with extra float column(s) named
        ``f"{prefix}_{col}_{stat}"``. Categories without a statistic in
        `df_train` receive the mean of the per-category statistics.
    """
    df_val = df_val.copy()
    agg = df_train.groupby(col)[target].agg(stats)

    def _encode(per_category):
        # Map category -> statistic, then fill categories without one.
        # The filled Series is returned and assigned by the caller instead of
        # calling `df_val[name].fillna(..., inplace=True)`: that chained
        # in-place form does not write back to the frame when pandas
        # Copy-on-Write is active, which would leave NaNs in the encoding.
        encoded = df_val[col].map(per_category).astype(float)
        return encoded.fillna(per_category.mean())

    if isinstance(stats, (list, tuple)):
        for s in stats:
            df_val[f"{prefix}_{col}_{s}"] = _encode(agg[s])
    else:
        suffix = stats if isinstance(stats, str) else stats.__name__
        df_val[f"{prefix}_{col}_{suffix}"] = _encode(agg)
    return df_val

In [21]:
# reference: https://www.kaggle.com/code/act18l/say-goodbye-to-ordinalencoder
class OrderedTargetEncoder(BaseEstimator, TransformerMixin):
    """Replace each category by the rank of its (optionally smoothed) target mean.

    For every column in `cat_cols` the categories are sorted by their target
    mean and numbered 0, 1, 2, ... in ascending order. `fit` builds one such
    mapping per KFold training split plus one on the full data; `transform`
    applies either a fold-specific mapping or the full-data one.
    Categories that are absent from the chosen mapping are encoded as -1.

    Parameters
    ----------
    cat_cols : list of str or None
        Columns to encode. When None, `fit` uses every object-dtype column.
    n_splits : int
        Number of KFold splits used for the per-fold mappings.
    smoothing : float
        Weight of the overall target mean when shrinking category means;
        0 disables smoothing.
    """

    def __init__(self, cat_cols=None, n_splits=5, smoothing=0):
        self.cat_cols = cat_cols
        self.n_splits = n_splits
        self.smoothing = smoothing
        # column -> list with one {category: rank} dict per fold
        self.maps_ = {}
        # column -> {category: rank} dict built on all rows passed to fit()
        self.global_map = {}

    def _make_fold_map(self, X_col, y):
        """Return {category: rank}, ranking categories by ascending target mean."""
        grouped = y.groupby(X_col, dropna=False)  # missing values form their own group
        means = grouped.mean()
        if self.smoothing > 0:
            # Shrink each category mean toward the overall mean of `y`,
            # weighting the category by its number of observations.
            counts = grouped.count()
            prior = self.smoothing * y.mean()
            means = (counts * means + prior) / (counts + self.smoothing)
        ordered_categories = means.sort_values().index
        return {category: rank for rank, category in enumerate(ordered_categories)}

    def fit(self, X, y):
        """Learn the per-fold and full-data rank mappings; returns self."""
        # Positional labels so that KFold indices can be used with .loc.
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        if self.cat_cols is None:
            self.cat_cols = X.select_dtypes(include='object').columns.tolist()

        splitter = KFold(self.n_splits, shuffle=True, random_state=42)
        train_parts = [part for part, _ in splitter.split(X)]

        # One mapping per (column, fold), built from that fold's training rows only.
        self.maps_ = {
            col: [self._make_fold_map(X.loc[part, col], y.loc[part]) for part in train_parts]
            for col in self.cat_cols
        }

        # Mapping built on every row.
        for col in self.cat_cols:
            self.global_map[col] = self._make_fold_map(X[col], y)

        return self

    def transform(self, X, y=None, fold=None):
        """Return a copy of `X` with `cat_cols` replaced by integer ranks.

        fold=None uses the full-data mapping; an integer selects the mapping
        learned on that fold's training rows. Unmapped categories become -1.
        """
        X = X.copy()
        for col in self.cat_cols:
            if fold is None:
                mapping = self.global_map[col]
            else:
                mapping = self.maps_[col][fold]
            X[col] = X[col].map(mapping).fillna(-1).astype(int)
        return X

In [22]:
# Aggregations of the target used for target encoding in the training loop;
# each entry yields one 'TE_<column>_<stat>' feature per interaction column.
encode_stats = ['mean']

In [None]:
# 10-fold cross-validated XGBoost with nested (leak-free) target encoding.
#
# For every outer fold:
#   1. the training part gets out-of-fold target encodings from an inner KFold,
#   2. the validation part and the hold-out `test` frame are encoded with
#      statistics from the whole outer-training part,
#   3. the base categorical columns are rank-encoded,
#   4. a model is fitted with early stopping on the validation part.
# `oof` collects the validation predictions, `pred` the fold-averaged
# predictions for `test`.
FOLDS          = 10
outer_kf       = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
oof            = np.zeros(len(train))
pred           = np.zeros(len(test))

for fold, (tr_idx, vl_idx) in enumerate(outer_kf.split(train), 1):
    print(f"--- Fold {fold} / {FOLDS} ---")

    # KFold yields *positions*, while `train` still carries the row labels of
    # the source frame, so rows must be selected with .iloc (not .loc).
    # The target is not a column of `train`; it lives in `y_train`, which is
    # row-aligned with `train` by train_test_split.
    X_tr_raw = train.iloc[tr_idx][FEATURES].reset_index(drop=True)
    y_tr     = y_train.iloc[tr_idx].reset_index(drop=True).rename(TARGET)

    X_vl_raw = train.iloc[vl_idx][FEATURES].reset_index(drop=True)
    y_vl     = y_train.iloc[vl_idx].reset_index(drop=True).rename(TARGET)

    X_ts_raw = test[FEATURES].copy()

    X_tr, X_vl, X_ts = X_tr_raw.copy(), X_vl_raw.copy(), X_ts_raw.copy()

    # Inner CV: each training row is encoded only from the other inner folds.
    inner_kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    for in_tr_idx, in_vl_idx in inner_kf.split(X_tr_raw):
        in_tr = pd.concat([X_tr_raw.loc[in_tr_idx], y_tr.loc[in_tr_idx]], axis=1)
        in_vl = X_tr_raw.loc[in_vl_idx]

        for col in encoded_columns:
            for stat in encode_stats:
                te_col = f"TE_{col}_{stat}"
                # Only `col` is needed for the lookup; passing the single
                # column avoids copying the whole frame on every call.
                te_tmp = target_encode(
                    in_tr, in_vl[[col]],
                    col, TARGET,
                    stats=stat, prefix="TE"
                )
                X_tr.loc[in_vl_idx, te_col] = te_tmp[te_col].values

    # Validation and test rows are encoded from the full outer-training part.
    tr_with_y = pd.concat([X_tr_raw, y_tr], axis=1)
    for col in encoded_columns:
        for stat in encode_stats:
            X_vl = target_encode(tr_with_y, X_vl, col, TARGET,
                                 stats=stat, prefix="TE")
            X_ts = target_encode(tr_with_y, X_ts, col, TARGET,
                                 stats=stat, prefix="TE")

    # The raw interaction strings are no longer needed once encoded.
    X_tr.drop(encoded_columns, axis=1, inplace=True)
    X_vl.drop(encoded_columns, axis=1, inplace=True)
    X_ts.drop(encoded_columns, axis=1, inplace=True)

    # Rank-encode the base categoricals using the map fitted on all
    # outer-training rows (fold=None selects the encoder's full-data map).
    enc = OrderedTargetEncoder(
        cat_cols=CATS,
        n_splits=FOLDS,
        smoothing=20
    ).fit(X_tr, y_tr)

    X_tr[CATS] = enc.transform(X_tr[CATS], fold=None)[CATS]
    X_vl[CATS] = enc.transform(X_vl[CATS], fold=None)[CATS]
    X_ts[CATS] = enc.transform(X_ts[CATS], fold=None)[CATS]

    model = XGBRegressor(
        tree_method='hist',
        max_depth=14,
        colsample_bytree=0.5,
        subsample=0.9,
        n_estimators=50_000,
        learning_rate=0.02,
        enable_categorical=True,
        min_child_weight=10,
        early_stopping_rounds=150,
    )

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_vl, y_vl)],
        verbose=500
    )

    oof[vl_idx]  = model.predict(X_vl)
    pred        += model.predict(X_ts)

    # Free the per-fold frames; the model of the last fold is kept.
    del X_tr_raw, X_vl_raw, X_ts_raw, X_tr, X_vl, X_ts, y_tr, y_vl
    if fold != FOLDS:
        del model
    gc.collect()

pred /= FOLDS

# RMSE over all out-of-fold predictions. Taking the square root explicitly
# avoids the `squared=` argument, which newer scikit-learn releases no longer
# accept.
rmse = np.sqrt(mean_squared_error(y_train, oof))
print(f"Final OOF RMSE (XGB): {rmse:.5f}")