In [16]:
import pandas as pd

train_df = pd.read_csv('train.csv').drop(columns=['id'])
train_df = train_df.dropna(subset=['Number_of_Ads'])
print(train_df.shape)
print(train_df.columns)
train_df.head()

(749999, 11)
Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes'],
      dtype='object')


Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [17]:
# Split the data into train and test sets randomly and stratified by the target variable
from sklearn.model_selection import train_test_split

sub_train, sub_test = train_test_split(train_df, test_size=0.2, random_state=42)

In [18]:
# get the target variable and features
X_sub_train = sub_train.drop(columns=['Listening_Time_minutes'])
y_sub_train = sub_train['Listening_Time_minutes']
X_sub_test = sub_test.drop(columns=['Listening_Time_minutes'])
y_sub_test = sub_test['Listening_Time_minutes']
print(X_sub_train.shape, y_sub_train.shape)
print(X_sub_test.shape, y_sub_test.shape)

(599999, 10) (599999,)
(150000, 10) (150000,)


In [19]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMedianImputer(BaseEstimator, TransformerMixin):
    """
    Imputes missing values in col_to_impute by taking
    the median of that column within groups defined by group_cols.
    """
    def __init__(self, group_cols, col_to_impute):
        self.group_cols = group_cols
        self.col_to_impute = col_to_impute
        self.group_medians_ = None
        self.global_median_ = None

    def fit(self, X, y=None):
        # Convert to DataFrame if needed
        df = pd.DataFrame(X.copy())
        
        # Compute groupwise median
        self.group_medians_ = (
            df.groupby(self.group_cols)[self.col_to_impute]
              .median()
              .to_dict()
        )
        # Compute overall median as fallback
        self.global_median_ = df[self.col_to_impute].median()
        return self

    def transform(self, X, y=None):
        df = pd.DataFrame(X.copy())
        
        # We'll apply groupwise median for each row
        def fill_with_group_median(row):
            group_key = tuple(row[col] for col in self.group_cols)
            if pd.isna(row[self.col_to_impute]):
                # if group exists in dictionary, use that median
                # otherwise, use global median
                return self.group_medians_.get(group_key, self.global_median_)
            else:
                return row[self.col_to_impute]
        
        df[self.col_to_impute] = df.apply(fill_with_group_median, axis=1)
        
        # Fill any remaining missing with global median
        df[self.col_to_impute].fillna(self.global_median_, inplace=True)
        
        return df



class MissingValueIndcator(BaseEstimator, TransformerMixin):
    """
    Adds a binary indicator for missing values in specified columns.
    """
    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Add a dummy column 'missing_<col>' for each column in self.cols
        df = pd.DataFrame(X.copy())
        for col in self.cols:
            df[f'missing_{col}'] = df[col].isna().astype(int)

        return df

# Model 2 Use median of Podcast name and episode title to predict add a dummy variable indicate whether value is missing.

In [20]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Create a pipeline with the custom and simple imputers

group_imputer = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'], 
    col_to_impute='Episode_Length_minutes'
)

group_imputer2 = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'], 
    col_to_impute='Guest_Popularity_percentage'
)

missing_value_indicator = MissingValueIndcator(
    cols=['Episode_Length_minutes', 'Guest_Popularity_percentage']
)

simple_imputer = SimpleImputer(strategy='median')

imputer_pipeline = Pipeline(
    steps=[
        ('group_imputer', group_imputer),
        ('group_imputer2', group_imputer2)
    ]
)
num_attributes = X_sub_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_attributes = X_sub_train.select_dtypes(include=['object']).columns.tolist()

print(num_attributes)

print(cat_attributes)

num_pipeline = Pipeline([
    ('minmax', MinMaxScaler())
])

cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder())
])

combine_pipeline = ColumnTransformer([
    ('scaler', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes)
])


# Fit the imputer pipeline on the training data
full_pipeline = Pipeline(steps=[
    ('missing_value_indicator', missing_value_indicator),
    ('imputer', imputer_pipeline),
    ('combine', combine_pipeline)
])

['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']


In [21]:
train_df_grouped_imputed = full_pipeline.fit_transform(X_sub_train)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[self.col_to_impute].fillna(self.global_median_, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[self.col_to_impute].fillna(self.global_median_, inplace=True)


In [22]:
print(train_df_grouped_imputed[0])

  (0, 0)	0.5611997660623276
  (0, 1)	0.580991875423155
  (0, 2)	0.7112834625969477
  (0, 40)	1.0
  (0, 147)	1.0
  (0, 161)	1.0
  (0, 163)	1.0
  (0, 172)	1.0
  (0, 173)	1.0


In [23]:
# Use PCA to reduce dimensionality
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)  # Keep 95% of variance
X_sub_train_pca = pca.fit_transform(train_df_grouped_imputed.toarray())

In [24]:
# Cross-validation
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import numpy as np

lin_reg = LinearRegression()

# Evaluate the models using cross-validation

lin_scores = cross_val_score(lin_reg, X_sub_train_pca, 
                             y_sub_train, scoring='neg_mean_squared_error', cv=5)


def display_scores(scores):
    scores = np.sqrt(-scores)  # Convert to RMSE
    #print the variable name
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
display_scores(lin_scores)

Scores: [13.651131   13.57403178 13.60718125 13.50663172 13.56989717]
Mean: 13.581774582721277
Standard deviation: 0.047546369130095446


In [25]:
dec_reg = DecisionTreeRegressor(max_depth=3, random_state=42, min_samples_leaf=1000)

dec_scores = cross_val_score(dec_reg, X_sub_train_pca,
                             y_sub_train, scoring='neg_mean_squared_error', cv=5)

display_scores(dec_scores)

Scores: [14.17242386 14.09494837 14.14241181 14.03929165 14.08862937]
Mean: 14.107541009546347
Standard deviation: 0.0460433633931517
