In [11]:
import pandas as pd

# Load the training set, discard the row identifier, and keep only rows
# that have a value for Number_of_Ads.
train_df = (
    pd.read_csv('train.csv')
    .drop(columns=['id'])
    .dropna(subset=['Number_of_Ads'])
)
print(train_df.shape)
print(train_df.columns)

(749999, 11)
Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment',
       'Listening_Time_minutes'],
      dtype='object')


In [12]:
# Load the test set and split off the id column: the ids are kept for the
# submission file, the remaining columns are the model features.
test_df = pd.read_csv('test.csv')
test_id = test_df.pop('id')

In [13]:
# Separate the prediction target from the feature columns.
y_sub_train = train_df['Listening_Time_minutes']
X_sub_train = train_df.drop(columns=[y_sub_train.name])

# Shapes of features and target, then the test columns for comparison.
print(X_sub_train.shape, y_sub_train.shape, sep='\n')
print(test_df.columns)

(749999, 10)
(749999,)
Index(['Podcast_Name', 'Episode_Title', 'Episode_Length_minutes', 'Genre',
       'Host_Popularity_percentage', 'Publication_Day', 'Publication_Time',
       'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment'],
      dtype='object')


In [14]:
# Partition the feature columns by dtype: int64/float64 columns are treated
# as numeric, object columns as categorical.
num_attributes = list(
    X_sub_train.select_dtypes(include=['int64', 'float64']).columns
)
cat_attributes = list(
    X_sub_train.select_dtypes(include=['object']).columns
)

num_attributes

['Episode_Length_minutes',
 'Host_Popularity_percentage',
 'Guest_Popularity_percentage',
 'Number_of_Ads']

In [15]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class GroupMedianImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values of one column with per-group medians.

    ``fit`` learns the median of ``col_to_impute`` for every combination of
    values found in ``group_cols``, plus the overall median of the column.
    ``transform`` replaces each missing value with the median of the row's
    group; rows whose group was not seen during ``fit``, or whose group
    median is itself missing, receive the overall median instead.

    Parameters
    ----------
    group_cols : str or list of str
        Column name(s) that define the groups.
    col_to_impute : str
        Name of the column whose missing values are filled.

    Attributes
    ----------
    group_medians_ : dict
        Maps a group key to the median learned in ``fit``. Keys are tuples
        when several grouping columns are used and plain values when there
        is exactly one grouping column.
    global_median_ : float
        Median of ``col_to_impute`` over all rows seen in ``fit``.
    """
    def __init__(self, group_cols, col_to_impute):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) is created in fit() so that "is fitted" checks work.
        self.group_cols = group_cols
        self.col_to_impute = col_to_impute

    def _group_keys(self):
        """Return ``group_cols`` as a list, accepting a single column name."""
        if isinstance(self.group_cols, str):
            return [self.group_cols]
        return list(self.group_cols)

    def fit(self, X, y=None):
        """Learn the per-group medians and the overall median from X."""
        df = pd.DataFrame(X)

        # Median of the target column within each group.
        self.group_medians_ = (
            df.groupby(self._group_keys())[self.col_to_impute]
              .median()
              .to_dict()
        )
        # Overall median, used as a fallback in transform().
        self.global_median_ = df[self.col_to_impute].median()
        return self

    def transform(self, X, y=None):
        """Return a copy of X (as a DataFrame) with ``col_to_impute`` filled."""
        from sklearn.utils.validation import check_is_fitted
        check_is_fitted(self, ['group_medians_', 'global_median_'])

        df = pd.DataFrame(X).copy()
        target = self.col_to_impute

        # Only rows that are actually missing need a lookup.
        missing = df[target].isna().to_numpy()
        if not missing.any():
            return df

        keys = self._group_keys()
        key_frame = df.loc[missing, keys]
        if len(keys) == 1:
            # A single grouping column yields scalar dict keys, not 1-tuples.
            lookup_keys = key_frame[keys[0]].tolist()
        else:
            lookup_keys = key_frame.itertuples(index=False, name=None)

        fills = pd.Series(
            [self.group_medians_.get(key, self.global_median_)
             for key in lookup_keys]
        )
        # A group whose training values were all missing has a NaN median;
        # fall back to the overall median for those rows as well.
        fills = fills.fillna(self.global_median_)

        # Assign positionally and directly on the frame (no chained
        # ``df[col].fillna(..., inplace=True)``, which may act on a copy).
        df.loc[missing, target] = fills.to_numpy()

        return df



In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


# Positions are resolved against the list of numeric feature names.
col_names = num_attributes

# Look up the position of each column used to build the derived features:
# episode length, guest popularity and host popularity (in that order).
episode_length_index, guest_popularity_index, host_popularity_index = (
    col_names.index(column_name)
    for column_name in (
        'Episode_Length_minutes',
        'Guest_Popularity_percentage',
        'Host_Popularity_percentage',
    )
)

class NewAttributesAdder(BaseEstimator, TransformerMixin):
    """
    Append two derived columns to a 2-D numeric feature array.

    The appended columns are, in order:

    1. episode length * host popularity * 0.01
    2. episode length * guest popularity * 0.01

    Column positions are read from the module-level variables
    ``episode_length_index``, ``host_popularity_index`` and
    ``guest_popularity_index``.

    Parameters
    ----------
    add_new_attributes : int, default=1
        Stored on the instance; ``transform`` does not consult it, so the
        two columns are always appended.
    """
    def __init__(self, add_new_attributes=1):
        self.add_new_attributes = add_new_attributes

    def fit(self, X, y=None):
        """Nothing is learned; present for the transformer interface."""
        return self

    def transform(self, X, y=None):
        """Return X with the two derived columns appended on the right."""
        # Accept array-likes (e.g. a DataFrame) as well as ndarrays.
        X = np.asarray(X)

        episode_length = X[:, episode_length_index]
        length_by_host = episode_length * X[:, host_popularity_index] * 0.01
        length_by_guest = episode_length * X[:, guest_popularity_index] * 0.01

        return np.c_[X, length_by_host, length_by_guest]
    





# Model 5: Impute missing values with the median of the whole (global) data set.

Also add new attributes that multiply episode length by host and guest popularity.

In [17]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Create a pipeline with the custom and simple imputers

from sklearn.impute import KNNImputer

# KNN-based imputer; constructed here but not wired into any pipeline below.
knn_imputer = KNNImputer(n_neighbors=5)


# Group-wise median imputers: both group rows by (Podcast_Name, Episode_Title)
# and each fills one numeric column.
group_imputer = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'],
    col_to_impute='Episode_Length_minutes'
)

group_imputer2 = GroupMedianImputer(
    group_cols=['Podcast_Name', 'Episode_Title'],
    col_to_impute='Guest_Popularity_percentage'
)

# Stand-alone instances kept under their original names. The numeric
# pipelines below build their own steps so no estimator is shared.
simple_imputer = SimpleImputer(strategy='median')
attr_adder = NewAttributesAdder(add_new_attributes=1)


# Applies the two group-wise imputers one after the other.
imputer_pipeline = Pipeline(
    steps=[
        ('group_imputer', group_imputer),
        ('group_imputer2', group_imputer2)
    ]
)


# Rebinds col_names to all feature columns. The *_index variables read by
# NewAttributesAdder were computed earlier from the numeric column list and
# are not recomputed here.
col_names = X_sub_train.columns.tolist()


num_attributes = X_sub_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_attributes = X_sub_train.select_dtypes(include=['object']).columns.tolist()

print(num_attributes)

print(cat_attributes)


def make_num_pipeline(scaler):
    """
    Build the numeric pipeline: scale, median-impute, then append the
    derived columns.

    Every call creates fresh imputer and attribute-adder steps, because
    ``Pipeline.fit`` fits its steps in place and a shared instance would be
    overwritten when a second pipeline is fitted.

    The first step keeps the name 'minmax' for every scaler so existing
    parameter paths (e.g. ``scaler__minmax__...``) stay valid.
    """
    # NOTE(review): the derived columns are products of already-scaled
    # values because the adder runs after the scaler - confirm intended.
    return Pipeline([
        ('minmax', scaler),
        ('imputer', SimpleImputer(strategy='median')),
        ('attr_adder', NewAttributesAdder(add_new_attributes=1))
    ])


num_pipeline = make_num_pipeline(MinMaxScaler())
num_pipeline2 = make_num_pipeline(RobustScaler())


# handle_unknown='ignore' encodes categories that were not seen during fit
# as all-zero rows instead of raising when transforming new data.
cat_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Numeric columns go through the numeric pipeline, categorical columns
# through one-hot encoding.
combine_pipeline = ColumnTransformer([
    ('scaler', num_pipeline, num_attributes),
    ('cat', cat_pipeline, cat_attributes)
])

combine_pipeline2 = ColumnTransformer([
    ('scaler', num_pipeline2, num_attributes),
    ('cat', cat_pipeline, cat_attributes)
])

# Group-wise imputation followed by the MinMax-based column transformer.
# The pipeline is only constructed here; nothing is fitted in this cell.
full_pipeline = Pipeline(steps=[
    ('imputer', imputer_pipeline),
    ('combine', combine_pipeline)
])

['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']
['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']


In [18]:
# Fit the RobustScaler-based column transformer on the training features and
# transform them, then apply the already-fitted transformer to the test
# features. The fit must run before the test transform.
# Note: despite the "_grouped_imputed" names, combine_pipeline2 contains no
# GroupMedianImputer step.
train_df_grouped_imputed = combine_pipeline2.fit_transform(X_sub_train)
test_df_grouped_imputed = combine_pipeline2.transform(test_df)


In [19]:
# Inspect the first transformed training row; the recorded output lists it
# as (row, column) -> value entries.
print(train_df_grouped_imputed[0])

  (0, 1)	0.3678963110667997
  (0, 3)	-0.5
  (0, 40)	1.0
  (0, 152)	1.0
  (0, 163)	1.0
  (0, 168)	1.0
  (0, 174)	1.0
  (0, 177)	1.0


In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor. random_state is pinned so that reruns of this
# cell are reproducible.
xgb_reg = XGBRegressor(
    n_estimators=500,
    min_child_weight=2,
    max_depth=16,
    learning_rate=0.03,
    random_state=42,
)


# Train on the transformed training features, then predict the test rows.
xgb_reg.fit(train_df_grouped_imputed, y_sub_train)
y_pred = xgb_reg.predict(test_df_grouped_imputed)
print(y_pred[0])

# Combine the id with the predictions into a DataFrame
submission_df = pd.DataFrame({'id': test_id, 'Listening_Time_minutes': y_pred})
submission_df.to_csv('submission.csv', index=False)
print(submission_df.head())
print(submission_df.shape)

SyntaxError: invalid syntax (2965981469.py, line 4)