In [188]:
import pandas as pd
import sklearn
import xgboost

In [237]:
def map_episode_title(x):
    episode_nr = int(x.split()[-1])
    if episode_nr <= 25:
        return 'Early'
    if episode_nr <= 50:
        return 'Mid'
    if episode_nr <= 75:
        return 'Late'
    if episode_nr <= 100:
        return 'End'

In [238]:
train_df = pd.read_csv('train.csv')
train_df = train_df.drop(columns=['id'])
# train_df.loc[56597, ['Episode_Length_minutes']] = None
# train_df.loc[54434, ['Episode_Length_minutes']] = None
# train_df.loc[178393, ['Number_of_Ads']] = None
# train_df.loc[37939, ['Number_of_Ads']] = None
train_df['Episode_Title'] = train_df['Episode_Title'].map(lambda x: map_episode_title(x))

In [239]:
train_df, y = train_df.drop(columns=['Listening_Time_minutes']), train_df['Listening_Time_minutes'].tolist()

In [240]:
train_df.count()

Podcast_Name                   750000
Episode_Title                  750000
Episode_Length_minutes         662907
Genre                          750000
Host_Popularity_percentage     750000
Publication_Day                750000
Publication_Time               750000
Guest_Popularity_percentage    603970
Number_of_Ads                  749999
Episode_Sentiment              750000
dtype: int64

In [241]:
train_df

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,Mystery Matters,End,,True Crime,74.81,Thursday,Night,,0.0,Positive
1,Joke Junction,Mid,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative
2,Study Sessions,Early,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative
3,Digital Digest,Mid,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive
4,Mind & Body,End,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,Early,75.66,Education,69.36,Saturday,Morning,,0.0,Negative
749996,Business Briefs,Early,75.75,Business,35.21,Saturday,Night,,2.0,Neutral
749997,Lifestyle Lounge,Late,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative
749998,Style Guide,Mid,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative


In [307]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numerical_cols = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads']
categorical_cols = ['Podcast_Name', 'Episode_Title', 'Genre', 'Publication_Time', 'Episode_Sentiment']

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer()),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    # ("regressor", sklearn.neural_network.MLPRegressor(hidden_layer_sizes=16, max_iter=500, early_stopping=True))
    #("regressor", xgboost.XGBRegressor(eval_metric='rmse'))
    ("regressor", keras_regressor)
])

X = full_pipeline.fit(train_df, y)

Epoch 1/5
[1m5860/5860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 7ms/step - loss: 350.1501 - root_mean_squared_error: 17.8870 
Epoch 2/5
[1m  17/5860[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m39s[0m 7ms/step - loss: 194.7102 - root_mean_squared_error: 13.9503  

  current = self.get_monitor_value(logs)


[1m5860/5860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 7ms/step - loss: 189.6345 - root_mean_squared_error: 13.7707
Epoch 3/5
[1m5860/5860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 7ms/step - loss: 182.7729 - root_mean_squared_error: 13.5193
Epoch 4/5
[1m5860/5860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 7ms/step - loss: 182.0299 - root_mean_squared_error: 13.4918
Epoch 5/5
[1m5860/5860[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 7ms/step - loss: 180.4366 - root_mean_squared_error: 13.4326


In [243]:
test_df = pd.read_csv('test.csv')
test_df = test_df.drop(columns=['id'])
test_df['Episode_Title'] = test_df['Episode_Title'].map(lambda x: map_episode_title(x))

In [244]:
preds = full_pipeline.predict(test_df)

In [245]:
ans_df = pd.DataFrame({
    'id': [x + 750000 for x in range(len(preds))],
    'Listening_Time_minutes': preds
})

In [246]:
ans_df.set_index('id')

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,55.627402
750001,18.348289
750002,49.523658
750003,83.904832
750004,47.786964
...,...
999995,11.572395
999996,58.317651
999997,5.403034
999998,78.581919


In [247]:
ans_df.to_csv('ans.csv',index=False)

In [249]:
import os
os.environ['KERAS_BACKEND'] = 'torch'
import keras

In [306]:
def build_keras_model(X, y):
    # The meta parameter contains information about your data
    # like number of features after preprocessing
    n_features_in = 72
    
    model = keras.models.Sequential([
        keras.layers.Input(shape=(n_features_in,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1)
    ])
    
    model.compile(
        optimizer='adam',
        loss='mse',  # Use 'mse' instead of 'rmse'
        metrics=[keras.metrics.RootMeanSquaredError()]
    )
    
    return model

keras_regressor = keras.wrappers.SKLearnRegressor(
    model=build_keras_model,
    fit_kwargs={"callbacks": [keras.callbacks.EarlyStopping()], "batch_size":128, "epochs":5}
)

In [299]:
personal_regressor