Import Data and Packages

In [3]:
# Make the parent of the current working directory importable
# (presumably where the project's `Helper` package lives -- verify).
import os
import sys

dir2 = os.path.abspath('')      # absolute path of the current working directory
dir1 = os.path.dirname(dir2)    # its parent directory
if dir1 not in sys.path:
    sys.path.append(dir1)



# After kernel restart, run the imports
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

from Helper.data import load_data

In [4]:
# Reference dataset; later cells use it to look up per-value target means.
ORIGINAL_CSV_PATH = '../Data/podcast_dataset.csv'
original = pd.read_csv(ORIGINAL_CSV_PATH)

Filter Outliers

In [5]:
# Exclusive upper bounds per column. A training row is kept only if every
# listed column is either below its bound or missing.
OUTLIER_LIMITS = {
    "Number_of_Ads": 10,
    "Episode_Length_minutes": 300,
    "Host_Popularity_percentage": 100,
    "Guest_Popularity_percentage": 100,
}

train, test = load_data()

# Combine all per-column conditions into one mask and filter once.
keep_row = pd.Series(True, index=train.index)
for column, upper in OUTLIER_LIMITS.items():
    keep_row = keep_row & ((train[column] < upper) | train[column].isna())
train = train[keep_row]

Feature Engineering

In [6]:
# Column to predict.
TARGET = 'Listening_Time_minutes'

# Columns treated as categorical.
CATS = [
    'Podcast_Name',
    'Episode_Title',
    'Genre',
    'Publication_Day',
    'Publication_Time',
    'Episode_Sentiment',
]

# Columns treated as numeric.
NUMS = [
    'Episode_Length_minutes',
    'Host_Popularity_percentage',
    'Guest_Popularity_percentage',
    'Number_of_Ads',
]

# All base features: numeric columns first, then categorical ones.
FEATURES = [*NUMS, *CATS]

In [7]:
def _unique_counts(frame, label):
    """Return a two-column frame: each column name and its distinct-value count."""
    counts = frame.nunique().reset_index()
    counts.columns = ['Column', label]
    return counts


# Compare per-column cardinality across the three frames. The outer merges
# keep columns that are present in only some of them.
train_counts = _unique_counts(train, 'Train Unique')
test_counts = _unique_counts(test, 'Test Unique')
original_counts = _unique_counts(original, 'Original Unique')

merged_counts = (
    train_counts
    .merge(test_counts, on='Column', how='outer')
    .merge(original_counts, on='Column', how='outer')
)
merged_counts

Unnamed: 0,Column,Train Unique,Test Unique,Original Unique
0,Episode_Length_minutes,12267,11631.0,11297.0
1,Episode_Sentiment,3,3.0,3.0
2,Episode_Title,100,100.0,100.0
3,Genre,10,10.0,10.0
4,Guest_Popularity_percentage,10003,9961.0,9899.0
5,Host_Popularity_percentage,8019,8010.0,7976.0
6,Listening_Time_minutes,42807,,42909.0
7,Number_of_Ads,4,6.0,4.0
8,Podcast_Name,48,48.0,48.0
9,Publication_Day,7,7.0,7.0


Interaction Features

In [8]:
# Fallback for values that have no entry in the lookup: the mean target of train.
m = train[TARGET].mean()

# Names of the encoded columns: the source column name with a "2" suffix.
ORIG_TARGET = [f"{feature}2" for feature in FEATURES]

for feature, encoded in zip(FEATURES, ORIG_TARGET):
    # Mean target per distinct value, computed on the original dataset.
    target_mapping = original.groupby(feature)[TARGET].mean()

    # Columns with pandas 'category' dtype are cast to str before the lookup.
    # The decision is taken from train's dtype and applied to both frames.
    cast_to_str = train[feature].dtype.name == 'category'

    for frame in (train, test):
        keys = frame[feature].astype(str) if cast_to_str else frame[feature]
        frame[encoded] = keys.map(target_mapping).fillna(m)

train.head(3)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,...,Episode_Length_minutes2,Host_Popularity_percentage2,Guest_Popularity_percentage2,Number_of_Ads2,Podcast_Name2,Episode_Title2,Genre2,Publication_Day2,Publication_Time2,Episode_Sentiment2
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,...,45.436281,43.420073,45.436281,48.525459,46.143074,43.525145,46.551083,45.545049,45.90244,46.940936
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,...,87.105517,43.95797,36.098898,44.321965,43.651926,45.260247,44.524182,45.194999,45.937834,44.654776
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,...,59.04685,39.812068,36.64425,48.525459,45.938614,44.309041,45.222056,46.273659,44.989281,44.654776


In [9]:
# Label-encode the categorical columns. Train and test are factorized
# together so a given category gets the same integer code in both frames.
n_train = len(train)
for col in CATS:
    codes, _ = pd.factorize(pd.concat([train[col], test[col]], axis=0))
    train[col] = codes[:n_train]
    test[col] = codes[n_train:]
    
# Interaction features kept after a local forward selection, grouped by the
# left-hand feature. Each name is "<left>_<right>".
_INTERACTION_PARTNERS = {
    'Episode_Length_minutes': [
        'Host_Popularity_percentage',
        'Guest_Popularity_percentage',
        'Number_of_Ads',
        'Publication_Time',
        'Episode_Sentiment',
    ],
    'Host_Popularity_percentage': [
        'Guest_Popularity_percentage',
        'Number_of_Ads',
        'Podcast_Name',
        'Publication_Time',
        'Episode_Sentiment',
    ],
    'Guest_Popularity_percentage': [
        'Number_of_Ads',
        'Publication_Day',
        'Publication_Time',
        'Episode_Sentiment',
    ],
    'Episode_Title': [
        'Episode_Sentiment',
    ],
}

SELECTED_INTERACT = [
    f"{left}_{right}"
    for left, partners in _INTERACTION_PARTNERS.items()
    for right in partners
]

def _split_interaction(candidate):
    """Split an interaction name '<c1>_<c2>' into its two base feature names.

    Each entry of FEATURES is tried as the left part, in order; the first one
    whose remainder is also an entry of FEATURES wins.

    Raises:
        ValueError: if the name cannot be split into two entries of FEATURES.
    """
    for left in FEATURES:
        stem = left + '_'
        if candidate.startswith(stem):
            right = candidate[len(stem):]
            if right in FEATURES:
                return left, right
    raise ValueError(f"Unable to parse the candidate feature '{candidate}' into two base features.")


# Each interaction is the element-wise product of its two base columns,
# added to both train and test.
for candidate in SELECTED_INTERACT:
    c1, c2 = _split_interaction(candidate)
    train[candidate] = train[c1] * train[c2]
    test[candidate] = test[c1] * test[c2]

print("Selected interaction features have been created in both train and test.")

Selected interaction features have been created in both train and test.


Structure

In [10]:

# Summary statistics of the training frame after feature engineering.
train.describe()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,...,Host_Popularity_percentage_Guest_Popularity_percentage,Host_Popularity_percentage_Number_of_Ads,Host_Popularity_percentage_Podcast_Name,Host_Popularity_percentage_Publication_Time,Host_Popularity_percentage_Episode_Sentiment,Guest_Popularity_percentage_Number_of_Ads,Guest_Popularity_percentage_Publication_Day,Guest_Popularity_percentage_Publication_Time,Guest_Popularity_percentage_Episode_Sentiment,Episode_Title_Episode_Sentiment
count,749944.0,749944.0,749944.0,662857.0,749944.0,749944.0,749944.0,749944.0,603916.0,749943.0,...,603916.0,749943.0,749944.0,749944.0,749944.0,603915.0,603916.0,603916.0,603916.0,749944.0
mean,375002.47905,23.53992,49.436726,64.503447,4.556028,59.858266,3.050962,1.472987,52.234551,1.347929,...,3132.481171,80.232223,1410.0877,88.215689,59.998595,69.172635,160.039133,76.669953,52.223871,49.732537
std,216506.838496,13.917803,28.687858,32.967747,2.965915,22.871467,1.99035,1.11742,28.449797,1.110975,...,2201.61447,76.534991,1041.301654,79.123382,57.094785,75.903942,147.563009,78.460666,56.240624,54.998111
min,0.0,0.0,0.0,0.0,0.0,1.3,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,187502.75,12.0,24.0,35.73,2.0,39.41,1.0,0.0,28.38,0.0,...,1372.6437,0.0,574.72,0.0,0.0,0.0,36.48,0.0,0.0,0.0
50%,375004.5,23.0,50.0,63.84,5.0,60.05,3.0,1.0,53.58,1.0,...,2672.8898,67.71,1180.36,75.32,54.29,48.38,118.62,57.75,37.71,33.0
75%,562503.25,36.0,74.0,94.07,7.0,79.53,5.0,2.0,76.6,2.0,...,4563.426125,125.88,2064.6,142.06,93.23,104.28,255.81,126.27,86.42,83.0
max,749999.0,47.0,99.0,120.93,9.0,99.99,6.0,3.0,99.99,3.0,...,9992.0016,299.97,4699.06,299.97,199.98,299.97,599.94,299.97,199.98,198.0


In [11]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 749944 entries, 0 to 749999
Data columns (total 37 columns):
 #   Column                                                  Non-Null Count   Dtype  
---  ------                                                  --------------   -----  
 0   id                                                      749944 non-null  int64  
 1   Podcast_Name                                            749944 non-null  int64  
 2   Episode_Title                                           749944 non-null  int64  
 3   Episode_Length_minutes                                  662857 non-null  float64
 4   Genre                                                   749944 non-null  int64  
 5   Host_Popularity_percentage                              749944 non-null  float64
 6   Publication_Day                                         749944 non-null  int64  
 7   Publication_Time                                        749944 non-null  int64  
 8   Guest_Popularity_percentage  

Training Model

Best Model yet (12.56)

In [48]:
# Seed NumPy's global RNG for reproducibility.
np.random.seed(42)

# Predictors: every column except the target and the row id.
features = train.columns.drop(['Listening_Time_minutes', 'id']).tolist()
X, y = train[features], train['Listening_Time_minutes']

# XGBoost regressor settings. verbosity=2 only controls how much XGBoost logs.
xgb_params = {
    'device': 'cuda',
    'enable_categorical': True,
    'n_estimators': 5000,
    'learning_rate': 0.02,
    'max_depth': 11,
    'subsample': 0.8,
    'colsample_bynode': 0.6,
    'random_state': 42,
    'verbosity': 2,
}
model = XGBRegressor(**xgb_params)

# 5-fold cross-validation; shuffle=True randomises the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer yields the negated MSE per fold; negate it and take the square
# root to obtain the per-fold RMSE.
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-cv_scores)

print("RMSE pro Fold:", rmse_scores)
print("Durchschnittlicher RMSE:", rmse_scores.mean())




RMSE pro Fold: [12.63630189 12.64733918 12.61837204 12.58863104 12.58639433]
Durchschnittlicher RMSE: 12.615407697868207


In [53]:
# Seed NumPy's global RNG for reproducibility.
np.random.seed(42)

# Predictors: only the five most important features.
# NOTE: `top5_features` is assigned in the "Feature Selection" cell further
# down, so that cell has to be executed before this one.
features = top5_features
X, y = train[features], train['Listening_Time_minutes']

# XGBoost regressor settings. verbosity=2 only controls how much XGBoost logs.
xgb_params = {
    'device': 'cuda',
    'enable_categorical': True,
    'n_estimators': 5000,
    'learning_rate': 0.02,
    'max_depth': 11,
    'subsample': 0.8,
    'colsample_bynode': 0.6,
    'random_state': 42,
    'verbosity': 2,
}
model = XGBRegressor(**xgb_params)

# 5-fold cross-validation; shuffle=True randomises the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer yields the negated MSE per fold; negate it and take the square
# root to obtain the per-fold RMSE.
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-cv_scores)

print("RMSE pro Fold:", rmse_scores)
print("Durchschnittlicher RMSE:", rmse_scores.mean())




RMSE pro Fold: [13.35485422 13.31435216 13.29832033 13.28198736 13.26618174]
Durchschnittlicher RMSE: 13.303139163923671


In [None]:
# Seed NumPy's global RNG for reproducibility.
np.random.seed(42)

# Predictors: every column except the target and the row id.
features = train.columns.drop(['Listening_Time_minutes', 'id']).tolist()
X, y = train[features], train['Listening_Time_minutes']

# XGBoost regressor settings. verbosity=2 only controls how much XGBoost logs.
xgb_params = {
    'device': 'cuda',
    'enable_categorical': True,
    'n_estimators': 5000,
    'learning_rate': 0.02,
    'max_depth': 11,
    'subsample': 0.8,
    'colsample_bynode': 0.6,
    'random_state': 42,
    'verbosity': 2,
}
model = XGBRegressor(**xgb_params)

# 5-fold cross-validation; shuffle=True randomises the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# The scorer yields the negated MSE per fold; negate it and take the square
# root to obtain the per-fold RMSE.
cv_scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf)
rmse_scores = np.sqrt(-cv_scores)

print("RMSE pro Fold:", rmse_scores)
print("Durchschnittlicher RMSE:", rmse_scores.mean())




RMSE pro Fold: [12.63630189 12.64733918 12.61837204 12.58863104 12.58639433]
Durchschnittlicher RMSE: 12.615407697868207


In [49]:
# Refit the model on the complete training set.
model.fit(X, y)

# Score the test rows using exactly the columns the model was trained on.
test_features = test.loc[:, features]
y_pred = model.predict(test_features)



In [50]:

# Assemble the submission (row id plus predicted listening time) and write it
# to the Submission folder without the index column.
result_df = test[['id']].assign(Listening_Time_minutes=y_pred)
result_df.to_csv('../Submission/xgb_5000.csv', index=False)

In [51]:
# Display the submission frame.
result_df

Unnamed: 0,id,Listening_Time_minutes
0,750000,51.844666
1,750001,20.747276
2,750002,50.314034
3,750003,73.270142
4,750004,46.867775
...,...,...
249995,999995,10.712748
249996,999996,59.242859
249997,999997,8.248419
249998,999998,74.122543


Feature Selection

In [52]:
# Rank the features of the fitted model by importance and keep the top five.
feature_names = X.columns

importances = model.feature_importances_

# One row per feature, in the model's original column order.
df_feat_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Feature names ordered from most to least important. This name was previously
# used without ever being defined, which raised NameError on a fresh kernel.
# A stable sort keeps the original column order among equal importances.
feature_list = (
    df_feat_importances
    .sort_values('Importance', ascending=False, kind='stable')['Feature']
    .tolist()
)

top5_features = feature_list[:5]

print(top5_features)

['Episode_Length_minutes', 'Episode_Length_minutes2', 'Episode_Length_minutes_Host_Popularity_percentage', 'Episode_Length_minutes_Episode_Sentiment', 'Number_of_Ads2']


In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import KFold, cross_val_score

# Reproducibility
np.random.seed(42)

# Column to predict. It has to be excluded from the predictors: the previous
# value 'target' left 'Listening_Time_minutes' inside X, so the model was
# handed the answer. It also overwrote the notebook-wide TARGET constant.
TARGET = 'Listening_Time_minutes'

# Predictors: every column except the row id, the target itself, and a stray
# 'target' column should one exist.
excluded = {'id', TARGET, 'target'}
features = [col for col in train.columns if col not in excluded]
X = train[features]
y = train[TARGET]

# Model used to score every candidate feature subset.
model = XGBRegressor(
    device='cuda',
    enable_categorical=True,
    n_estimators=4000,
    learning_rate=0.02,
    max_depth=11,
    subsample=0.8,
    colsample_bynode=0.6,
    random_state=42,
    verbosity=0,
)

# K-fold cross-validation for stable estimates.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Forward selection: start from an empty feature set and, at every step, add
# the feature that yields the best CV score.
sfs = SFS(model,
          k_features='best',  # alternatively a fixed number, e.g. 10
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',  # negated MSE as the selection metric
          cv=cv,
          n_jobs=-1)  # run candidates in parallel

# Fit the selector on the data.
sfs = sfs.fit(X, y)

# Report the chosen features.
selected_features = list(sfs.k_feature_names_)
print("Ausgewählte Features:", selected_features)

# Optional: re-score the model on the selected features only.
X_selected = sfs.transform(X)
scores = cross_val_score(model, X_selected, y, scoring='neg_mean_squared_error', cv=cv)
rmse_scores = np.sqrt(-scores)
print("RMSE Scores:", rmse_scores)
print("Durchschnittlicher RMSE:", rmse_scores.mean())

Hyperparameter Tuning

In [41]:

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score

# For reproducibility
np.random.seed(42)

# Column to predict. It has to be excluded from the predictors: the previous
# value 'target' left 'Listening_Time_minutes' inside X, so the model was
# handed the answer. It also overwrote the notebook-wide TARGET constant.
TARGET = 'Listening_Time_minutes'

# Predictors: every column except the row id, the target itself, and a stray
# 'target' column should one exist.
excluded = {'id', TARGET, 'target'}
features = [col for col in train.columns if col not in excluded]
X = train[features]
y = train[TARGET]

# Base model. n_estimators is deliberately not set here because the grid
# search below chooses it.
base_model = XGBRegressor(
    device='cuda',
    enable_categorical=True,
    learning_rate=0.02,
    max_depth=11,
    subsample=0.8,
    colsample_bynode=0.6,
    random_state=42,
    verbosity=0,
)

# K-fold cross-validation for stable estimates.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Optional: forward feature selection with SequentialFeatureSelector (SFS).
sfs = SFS(base_model,
          k_features='best',  # or a fixed number, e.g. k_features=10
          forward=True,
          floating=False,
          scoring='neg_mean_squared_error',
          cv=cv,
          n_jobs=-1)
sfs = sfs.fit(X, y)

# Restrict the data to the selected features.
selected_features = list(sfs.k_feature_names_)
print("Ausgewählte Features:", selected_features)
X_selected = sfs.transform(X)

# Candidate values for n_estimators.
param_grid = {
    'n_estimators': [1000, 2000, 3000, 4000, 5000]
}

# Grid search over n_estimators, scored by negated MSE on the same folds.
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1
)

# Run the grid search on the selected features.
grid_search.fit(X_selected, y)

# Report the best n_estimators and its score converted to RMSE.
best_n_estimators = grid_search.best_params_['n_estimators']
best_score = np.sqrt(-grid_search.best_score_)

print("Bester n_estimators:", best_n_estimators)
print("Beste RMSE (über CV):", best_score)

ModuleNotFoundError: No module named 'mlxtend'