In [1]:
import pandas as pd

# Load the raw training data in one chain: the 'id' column is dropped and
# rows with a missing 'Number_of_Ads' value are discarded.
train_df = (
    pd.read_csv('train.csv')
    .drop(columns=['id'])
    .dropna(subset=['Number_of_Ads'])
)

# Quick sanity check of the resulting frame.
print(train_df.shape)
print(train_df.columns)
train_df.head()

(749999, 11)
Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes'],
      dtype='object')


Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [2]:
# Randomly hold out 20% of the rows as a validation split.
# The split is purely random (no `stratify` argument is passed) and is made
# reproducible with a fixed random_state.
from sklearn.model_selection import train_test_split

sub_train, sub_test = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Separate the features (X_*) from the target column (y_*) for both splits.
X_sub_train, y_sub_train = (
    sub_train.drop(columns=['Listening_Time_minutes']),
    sub_train['Listening_Time_minutes'],
)
X_sub_test, y_sub_test = (
    sub_test.drop(columns=['Listening_Time_minutes']),
    sub_test['Listening_Time_minutes'],
)

# Shapes of the feature matrix and target vector, train first, then test.
print(X_sub_train.shape, y_sub_train.shape)
print(X_sub_test.shape, y_sub_test.shape)

(599999, 10) (599999,)
(150000, 10) (150000,)


In [4]:
# Partition the feature columns by dtype: int64/float64 columns are treated
# as numeric, object columns as categorical.
numeric_frame = X_sub_train.select_dtypes(include=['int64', 'float64'])
categorical_frame = X_sub_train.select_dtypes(include=['object'])

num_attributes = numeric_frame.columns.tolist()
cat_attributes = categorical_frame.columns.tolist()

num_attributes

['Episode_Length_minutes',
 'Host_Popularity_percentage',
 'Guest_Popularity_percentage',
 'Number_of_Ads']

In [5]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMedianImputer(BaseEstimator, TransformerMixin):
    """
    Imputes missing values in col_to_impute by taking
    the median of that column within groups defined by group_cols.

    Parameters
    ----------
    group_cols : list of column labels (a single label is also accepted)
        Columns whose value combination defines a group.
    col_to_impute : column label
        Column whose missing values are filled.

    Attributes (set by ``fit``)
    ---------------------------
    group_medians_ : dict
        Mapping from group key to the median of ``col_to_impute`` in that
        group, exactly as produced by ``groupby(...).median().to_dict()``.
    group_median_table_ : pandas.DataFrame
        The same medians in tabular form (one row per group), used for the
        vectorised lookup in ``transform``.
    global_median_ : float
        Median of ``col_to_impute`` over all rows seen in ``fit``; used when
        a row's group was not seen or has no usable median.
    """

    # Name of the helper column that carries the per-group median.
    _MEDIAN_COL = '_group_median'

    def __init__(self, group_cols, col_to_impute):
        # Only hyper-parameters are stored here; fitted state (trailing
        # underscore attributes) is created in fit().
        self.group_cols = group_cols
        self.col_to_impute = col_to_impute

    def _group_keys(self):
        """Return group_cols as a list, also accepting a single label."""
        if isinstance(self.group_cols, (list, tuple)):
            return list(self.group_cols)
        return [self.group_cols]

    def fit(self, X, y=None):
        """Learn the per-group medians and the global fallback median."""
        # Convert to DataFrame if needed (no mutation happens in fit).
        df = pd.DataFrame(X)
        keys = self._group_keys()

        # Compute groupwise median
        medians = df.groupby(keys)[self.col_to_impute].median()
        self.group_medians_ = medians.to_dict()
        # Tabular copy of the medians: works identically for one or many
        # group columns, unlike tuple-keyed dictionary lookups.
        self.group_median_table_ = medians.rename(self._MEDIAN_COL).reset_index()

        # Compute overall median as fallback
        self.global_median_ = df[self.col_to_impute].median()
        return self

    def transform(self, X, y=None):
        """Return a copy of X with col_to_impute filled in.

        Missing entries receive their group's median; entries whose group is
        unknown (or whose group median is itself missing) receive the global
        median. Non-missing entries are left untouched.
        """
        df = pd.DataFrame(X).copy()
        keys = self._group_keys()
        col = self.col_to_impute

        missing = df[col].isna()
        if missing.any():
            # A left merge keeps the row order of the left frame, so the
            # looked-up medians line up positionally with the missing rows.
            looked_up = df.loc[missing, keys].merge(
                self.group_median_table_, on=keys, how='left'
            )[self._MEDIAN_COL]
            df.loc[missing, col] = looked_up.to_numpy()

        # Fill any remaining missing with global median (assignment instead
        # of a chained in-place fillna, which may act on a temporary copy).
        df[col] = df[col].fillna(self.global_median_)

        return df



In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


col_names = num_attributes

# Positions of the three columns (within the numeric attribute list) that
# NewAttributesAdder reads when it builds its derived features.
episode_length_index, guest_popularity_index, host_popularity_index = (
    col_names.index(name)
    for name in (
        'Episode_Length_minutes',
        'Guest_Popularity_percentage',
        'Host_Popularity_percentage',
    )
)

class NewAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Appends two interaction features to a 2-D numeric array.

    The appended columns are, in order:
      1. episode length * host popularity * 0.01
      2. episode length * guest popularity * 0.01

    Column positions are taken from the module-level names
    ``episode_length_index``, ``host_popularity_index`` and
    ``guest_popularity_index``.

    Parameters
    ----------
    add_new_attributes : int or bool, default=1
        When truthy the two columns are appended; when falsy ``transform``
        returns its input unchanged.
    """
    def __init__(self, add_new_attributes=1):
        self.add_new_attributes = add_new_attributes

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Return X with the two interaction columns appended on the right.

        X is indexed as ``X[:, i]``, so it must support 2-D positional
        indexing (e.g. a NumPy array).
        """
        if not self.add_new_attributes:
            return X

        episode_length = X[:, episode_length_index]

        # Same evaluation order as (length * popularity) * 0.01.
        host_interaction = episode_length * X[:, host_popularity_index] * 0.01
        guest_interaction = episode_length * X[:, guest_popularity_index] * 0.01

        return np.c_[X, host_interaction, guest_interaction]
    





# Model 7: Impute missing values with the global median of the data set.

Also add new attributes that combine episode length with host and guest popularity.

In [7]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Create a pipeline with the custom and simple imputers

from sklearn.impute import KNNImputer

# --- Imputer objects -------------------------------------------------------
# Only `simple_imputer` is wired into the ColumnTransformers defined below;
# the KNN and group-median imputers are constructed here as alternatives.
knn_imputer = KNNImputer(n_neighbors=5)

group_imputer = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'],
    col_to_impute='Episode_Length_minutes'
)

group_imputer2 = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'],
    col_to_impute='Guest_Popularity_percentage'
)

simple_imputer = SimpleImputer(strategy='median')

# Chains both group-median imputers (episode length first, then guest
# popularity); each step returns a DataFrame that feeds the next.
imputer_pipeline = Pipeline(
    steps=[
        ('group_imputer', group_imputer),
        ('group_imputer2', group_imputer2)
    ]
)

# NOTE(review): this rebinds `col_names` to ALL feature columns. The column
# indices used by NewAttributesAdder were computed earlier from the numeric
# column list and are not affected by this assignment.
col_names = X_sub_train.columns.tolist()

attr_adder = NewAttributesAdder(add_new_attributes=1)

# --- Column groups ---------------------------------------------------------
num_attributes = X_sub_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_attributes = X_sub_train.select_dtypes(include=['object']).columns.tolist()

print(num_attributes)

print(cat_attributes)

# --- Numeric branches: scale -> median-impute -> add interaction features ---
num_pipeline = Pipeline([
    ('minmax', MinMaxScaler()),
    ('imputer', simple_imputer),
    ('attr_adder', attr_adder)
])

# Same layout with a RobustScaler. The step keeps the name 'minmax' so that
# existing references to the step by name keep working.
num_pipeline2 = Pipeline([
    ('minmax', RobustScaler()),
    ('imputer', simple_imputer),
    ('attr_adder', attr_adder)
])

# --- Categorical branch ----------------------------------------------------
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising an error at transform time.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# --- Full preprocessors ----------------------------------------------------
combine_pipeline = ColumnTransformer([
    ('scaler', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes)
])

combine_pipeline2 = ColumnTransformer([
    ('scaler', num_pipeline2, num_attributes),
    ('cat', cat_pipeline, cat_attributes)
])

# Group-median imputation followed by the MinMax-based preprocessor.
# Nothing is fitted in this cell; the objects are only constructed.
full_pipeline = Pipeline(steps=[
    ('imputer', imputer_pipeline),
    ('combine', combine_pipeline)
])

['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']


In [8]:
# Fit the RobustScaler-based preprocessor on the training split only, then
# apply the already-fitted preprocessor to the hold-out split.
# NOTE(review): despite the "grouped_imputed" names, combine_pipeline2 holds
# no GroupMedianImputer step; missing numeric values go through SimpleImputer.
train_df_grouped_imputed = combine_pipeline2.fit_transform(X_sub_train)
train_df_grouped_imputed_test = combine_pipeline2.transform(X_sub_test)

In [9]:
# Show the first transformed training row as a sanity check of the
# preprocessing output.
print(train_df_grouped_imputed[0])

  (0, 0)	0.07788642991936867
  (0, 1)	0.24707055597107963
  (0, 2)	0.6570718575274385
  (0, 3)	-0.5
  (0, 4)	0.00019243443542780948
  (0, 5)	0.0005117698118330023
  (0, 42)	1.0
  (0, 149)	1.0
  (0, 163)	1.0
  (0, 165)	1.0
  (0, 174)	1.0
  (0, 175)	1.0


In [10]:
def display_scores(scores):
    """Print per-fold RMSE values together with their mean and spread.

    `scores` is expected to hold negated MSE values (as an array supporting
    unary minus); each one is negated and square-rooted to obtain an RMSE.
    Nothing is returned.
    """
    rmse_values = np.sqrt(-scores)  # negated MSE -> RMSE
    report = (
        ("Scores:", rmse_values),
        ("Mean:", rmse_values.mean()),
        ("Standard deviation:", rmse_values.std()),
    )
    for label, value in report:
        print(label, value)

In [11]:
from xgboost import XGBRegressor

# xgb_reg = XGBRegressor(n_estimators=500, learning_rate=0.04, max_depth=15, random_state=42)

In [12]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from xgboost import XGBRegressor


# Discrete candidate values for each tuned XGBoost hyper-parameter.
param_dist = dict(
    n_estimators=[450, 500, 550],
    learning_rate=[0.03, 0.04, 0.05],
    max_depth=[14, 15, 16],
    min_child_weight=[1, 2, 3],
)

# Randomized search: 20 sampled combinations, each scored with 3-fold CV on
# negated MSE (tweak n_iter as needed).
search_settings = dict(
    n_iter=20,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=2,
    random_state=42,
)
random_search = RandomizedSearchCV(
    estimator=XGBRegressor(),
    param_distributions=param_dist,
    **search_settings,
)

random_search.fit(train_df_grouped_imputed, y_sub_train)

print("Best parameters:", random_search.best_params_)
# best_score_ is a negated MSE, so negate it and take the root to get an RMSE
print("Best RMSE:", np.sqrt(-random_search.best_score_))



Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END learning_rate=0.04, max_depth=14, min_child_weight=2, n_estimators=450; total time=  31.2s
[CV] END learning_rate=0.04, max_depth=14, min_child_weight=2, n_estimators=450; total time=  29.1s
[CV] END learning_rate=0.04, max_depth=14, min_child_weight=2, n_estimators=450; total time=  29.1s
[CV] END learning_rate=0.03, max_depth=14, min_child_weight=1, n_estimators=450; total time=  34.6s
[CV] END learning_rate=0.03, max_depth=14, min_child_weight=1, n_estimators=450; total time=  38.1s
[CV] END learning_rate=0.03, max_depth=14, min_child_weight=1, n_estimators=450; total time=  34.0s
[CV] END learning_rate=0.03, max_depth=16, min_child_weight=2, n_estimators=500; total time=  52.1s
[CV] END learning_rate=0.03, max_depth=16, min_child_weight=2, n_estimators=500; total time=  50.7s
[CV] END learning_rate=0.03, max_depth=16, min_child_weight=2, n_estimators=500; total time=  50.6s
[CV] END learning_rate=0.04, max_depth=

In [14]:
# Evaluate the model on the test set

from sklearn.metrics import mean_squared_error

# Train a fresh regressor with the best parameters found by the search,
# using the full preprocessed training split.
xgb_reg = XGBRegressor(**random_search.best_params_)
xgb_reg.fit(train_df_grouped_imputed, y_sub_train)

# Predict on the preprocessed hold-out split and report the RMSE.
y_pred = xgb_reg.predict(train_df_grouped_imputed_test)
mse = mean_squared_error(y_sub_test, y_pred)
rmse = mse ** 0.5

print("Test RMSE:", rmse)

Test RMSE: 12.684557049978524
