# Data import

In [2]:

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
from itertools import combinations
import matplotlib.pyplot as plt
# Competition data, indexed by the `id` column.
train = pd.read_csv('../Data/train.csv', index_col='id')
test = pd.read_csv('../Data/test.csv', index_col='id')
df_subm = pd.read_csv('../Data/sample_submission.csv', index_col='id')

# Separate podcast dataset; loaded with pandas' default integer index.
original = pd.read_csv('../Data/podcast_dataset.csv')


# Preprocessing

In [3]:
# Show dtypes and non-null counts of the freshly loaded training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


### Filter

In [4]:
# Exclusive upper bounds per column. A row is kept when its value is below
# the bound or missing, so missing values never cause a row to be dropped.
_EXCLUSIVE_UPPER_BOUNDS = {
    "Number_of_Ads": 10,
    "Episode_Length_minutes": 300,
    "Host_Popularity_percentage": 100,
    "Guest_Popularity_percentage": 100,
}

for _column, _limit in _EXCLUSIVE_UPPER_BOUNDS.items():
    _values = train[_column]
    train = train[_values.isna() | _values.lt(_limit)]

In [5]:
def _unique_counts(frame, label):
    """Return a two-column frame: column name and its number of distinct values."""
    counts = frame.nunique().reset_index()
    counts.columns = ['Column', label]
    return counts


train_counts = _unique_counts(train, 'Train Unique')
test_counts = _unique_counts(test, 'Test Unique')
original_counts = _unique_counts(original, 'Original Unique')

# Outer joins keep columns that are missing from one of the datasets.
merged_counts = (
    train_counts
    .merge(test_counts, on='Column', how='outer')
    .merge(original_counts, on='Column', how='outer')
)
merged_counts

Unnamed: 0,Column,Train Unique,Test Unique,Original Unique
0,Episode_Length_minutes,12267,11631.0,11297
1,Episode_Sentiment,3,3.0,3
2,Episode_Title,100,100.0,100
3,Genre,10,10.0,10
4,Guest_Popularity_percentage,10003,9961.0,9899
5,Host_Popularity_percentage,8019,8010.0,7976
6,Listening_Time_minutes,42807,,42909
7,Number_of_Ads,4,6.0,4
8,Podcast_Name,48,48.0,48
9,Publication_Day,7,7.0,7


### Episode Numbers encoded

In [6]:
def extract_episode_number(episode_title):
    """Extract the episode number from a title such as ``"Episode 98"``.

    Parameters
    ----------
    episode_title : str
        Title expected to contain ``"Episode "`` followed by an integer.

    Returns
    -------
    int or float
        The parsed episode number, or ``np.nan`` when the title is not a
        string (e.g. ``None`` or a missing value), does not contain
        ``"Episode "``, or the text after it is not an integer.
    """
    # Missing titles arrive as None / float NaN; they have no .split(), so
    # treat them as unparsable instead of letting AttributeError escape.
    if not isinstance(episode_title, str):
        return np.nan
    try:
        return int(episode_title.split("Episode ")[1])
    except (IndexError, ValueError):
        return np.nan


# Parse the numeric episode index out of the title for the training rows.
train['Episode_Number'] = train['Episode_Title'].apply(extract_episode_number)

# Only touch `test` when it has been loaded in this session.
_test_is_loaded = 'test' in locals() or 'test' in globals()
if _test_is_loaded:
    _first_title = test['Episode_Title'].iloc[0]
    _titles_already_numeric = isinstance(_first_title, (int, np.integer))
    # If the first title is already an integer, copy the column as-is;
    # otherwise parse it the same way as for train.
    test['Episode_Number'] = (
        test['Episode_Title']
        if _titles_already_numeric
        else test['Episode_Title'].apply(extract_episode_number)
    )

### Interactions

In [None]:
encode_columns = [
    'Episode_Length_minutes',
    'Episode_Number',
    'Host_Popularity_percentage',
    'Number_of_Ads',
    'Episode_Sentiment',
    'Publication_Day',
    'Publication_Time',
]
pair_size = [2, 3, 4]


def _joined_category(frame, columns):
    """Join the string form of `columns` row-wise with '_' and return it as a category."""
    joined = frame[list(columns)].astype(str).agg('_'.join, axis=1)
    return joined.astype('category')


# One progress bar per combination size; each combination becomes a new
# categorical column named after its members, in both train and test.
for group_size in pair_size:
    for member_columns in tqdm(list(combinations(encode_columns, group_size))):
        interaction_name = '_'.join(member_columns)
        train[interaction_name] = _joined_category(train, member_columns)
        test[interaction_name] = _joined_category(test, member_columns)

100%|██████████| 21/21 [00:47<00:00,  2.27s/it]
100%|██████████| 35/35 [01:44<00:00,  2.99s/it]
 26%|██▌       | 9/35 [00:35<01:34,  3.63s/it]

In [9]:
# Inspect train again after the feature cells above.
# NOTE(review): the saved output lists only the 11 original columns and
# 750000 rows, so it appears to predate the filter and interaction cells —
# re-run to refresh.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


In [None]:
# Persist the engineered training frame (index included) as a
# semicolon-separated UTF-8 CSV. Only `train` is written here; `test` is not.
train.to_csv("../Data/train_eng.csv", sep=';', encoding='utf-8', header=True)

In [16]:
# Preview the first rows of train.
train.head()

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


# Feature Engineering Playground

In [25]:
# Reload the engineered training data written by the to_csv cell.
# No index_col is given, so `id` comes back as a regular column rather than
# the index (unlike `train`/`test`), and the interaction columns are read as
# plain object columns because CSV does not store the category dtype.
train_eng = pd.read_csv('../Data/train_eng.csv', sep=';', encoding='utf-8')

In [27]:
# Check the shape, dtypes and memory footprint of the reloaded frame.
train_eng.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749944 entries, 0 to 749943
Columns: 104 entries, id to Number_of_Ads_Episode_Sentiment_Publication_Day_Publication_Time
dtypes: float64(5), int64(2), object(97)
memory usage: 595.0+ MB


### Missing-value indicator flags

In [None]:
# Guest popularity always gets a 0/1 missing-value flag.
guest_pop_na_mask = train_eng['Guest_Popularity_percentage'].isna()
train_eng['Guest_Popularity_NA'] = guest_pop_na_mask.astype(int)

# Mirror every flag on `test` only when it has been loaded in this session.
_test_is_loaded = 'test' in locals() or 'test' in globals()
if _test_is_loaded:
    test['Guest_Popularity_NA'] = test['Guest_Popularity_percentage'].isna().astype(int)

# The remaining columns get a flag only if train actually has missing values.
for col in ['Episode_Length_minutes', 'Number_of_Ads']:
    _train_missing = train_eng[col].isna()
    if not _train_missing.any():
        continue
    _flag_name = f'{col}_NA'
    train_eng[_flag_name] = _train_missing.astype(int)
    if _test_is_loaded:
        test[_flag_name] = test[col].isna().astype(int)

Guest Popularity NA counts: 146028
Guest Popularity non-NA counts: 603916


### Target-encode every column

In [None]:
# Target-encode every feature using per-value target means computed on the
# `original` dataset. Values with no match in `original` (including missing
# values) fall back to the mean target of train_eng.
# TARGET and FEATURES are expected to be defined in an earlier cell.
m = train_eng[TARGET].mean()

# Names of the encoded columns created below (feature name + "2").
ORIG_TARGET = []

for c in FEATURES:
    n = f"{c}2"
    ORIG_TARGET.append(n)

    # Mean target per distinct value of this feature in the original data.
    target_mapping = original.groupby(c)[TARGET].mean()

    if train_eng[c].dtype.name == 'category':
        # Categorical columns are converted to plain strings before the lookup.
        train_keys = train_eng[c].astype(str)
        test_keys = test[c].astype(str)
    else:
        train_keys = train_eng[c]
        test_keys = test[c]

    # Look up the encoding and fill unmatched values with the fallback mean.
    train_eng[n] = train_keys.map(target_mapping).fillna(m)
    test[n] = test_keys.map(target_mapping).fillna(m)

# The encoded columns live on train_eng, so preview that frame.
train_eng.head(3)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,...,Episode_Length_minutes2,Host_Popularity_percentage2,Guest_Popularity_percentage2,Number_of_Ads2,Podcast_Name2,Episode_Title2,Genre2,Publication_Day2,Publication_Time2,Episode_Sentiment2
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,...,45.436281,43.420073,45.436281,48.525459,46.143074,43.525145,46.551083,45.545049,45.90244,46.940936
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,...,87.105517,43.95797,36.098898,44.321965,43.651926,45.260247,44.524182,45.194999,45.937834,44.654776
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,...,59.04685,39.812068,36.64425,48.525459,45.938614,44.309041,45.222056,46.273659,44.989281,44.654776


### Interactions

In [None]:
# Label-encode each categorical column with codes shared by train_eng and
# test: factorize the stacked column, then split the codes back apart.
# The split point must be the length of train_eng (the frame that was
# concatenated), not of `train`, or the codes are misaligned whenever the
# two frames differ in length.
# CATS is expected to be defined in an earlier cell.
n_train_rows = len(train_eng)
for col in CATS:
    combined = pd.concat([train_eng[col], test[col]], axis=0)
    codes, uniques = pd.factorize(combined)
    train_eng[col] = codes[:n_train_rows]
    test[col] = codes[n_train_rows:]
    
# Interaction features kept after local forward selection. Each name has the
# form "<feature A>_<feature B>", where both parts are entries of FEATURES.
SELECTED_INTERACT = [
    'Episode_Length_minutes_Host_Popularity_percentage',
    'Episode_Length_minutes_Guest_Popularity_percentage',
    'Episode_Length_minutes_Number_of_Ads',
    'Episode_Length_minutes_Publication_Time',
    'Episode_Length_minutes_Episode_Sentiment',
    'Host_Popularity_percentage_Guest_Popularity_percentage',
    'Host_Popularity_percentage_Number_of_Ads',
    'Host_Popularity_percentage_Podcast_Name',
    'Host_Popularity_percentage_Publication_Time',
    'Host_Popularity_percentage_Episode_Sentiment',
    'Guest_Popularity_percentage_Number_of_Ads',
    'Guest_Popularity_percentage_Publication_Day',
    'Guest_Popularity_percentage_Publication_Time',
    'Guest_Popularity_percentage_Episode_Sentiment',
    'Episode_Title_Episode_Sentiment',
]


def _split_interaction_name(candidate):
    """Split an interaction name into its two base features.

    Feature names contain underscores themselves, so each entry of FEATURES
    is tried as a prefix; the first one whose remainder is also in FEATURES
    wins. Raises ValueError when no such split exists.
    """
    for first in FEATURES:
        head = first + '_'
        if not candidate.startswith(head):
            continue
        second = candidate[len(head):]
        if second in FEATURES:
            return first, second
    raise ValueError(f"Unable to parse the candidate feature '{candidate}' into two base features.")


# Build each interaction as the element-wise product of its two base columns,
# first on train_eng and then on test.
for candidate in SELECTED_INTERACT:
    c1, c2 = _split_interaction_name(candidate)
    for frame in (train_eng, test):
        frame[candidate] = frame[c1] * frame[c2]

print("Selected interaction features have been created in both train and test.")

Selected interaction features have been created in both train and test.
