In [49]:
import pandas as pd
import sklearn
import xgboost

In [50]:
def map_episode_title(x):
    """Extract the episode number from an episode title.

    The title is split on whitespace and its last token is parsed as an
    integer, e.g. ``"Episode 98"`` -> ``98``.

    An 'Early'/'Mid'/'Late'/'End' bucketing chain used to follow the
    ``return`` statement; it could never run and has been removed. The
    integer result is kept because ``Episode_Title`` is consumed as a
    numerical column by the modelling pipeline.

    Args:
        x: Title string whose last whitespace-separated token is an integer.

    Returns:
        int: The episode number.

    Raises:
        ValueError: If the last token is not an integer literal.
        IndexError: If ``x`` is empty or contains only whitespace.
    """
    return int(x.split()[-1])

In [51]:
# Load the training set; the 'id' column is discarded immediately.
train_df = pd.read_csv('train.csv').drop(columns=['id'])

# Replace every episode title with the value returned by map_episode_title.
train_df['Episode_Title'] = train_df['Episode_Title'].map(map_episode_title)

In [52]:
# Separate the target from the features: y becomes a plain Python list of
# listening times and train_df keeps every other column.
# Note: this cell is not re-runnable on its own, because the target column
# no longer exists in train_df afterwards.
y = train_df['Listening_Time_minutes'].tolist()
train_df = train_df.drop(columns=['Listening_Time_minutes'])

In [53]:
# Non-null count per column (a quick look at where values are missing).
train_df.count()

Podcast_Name                   750000
Episode_Title                  750000
Episode_Length_minutes         662907
Genre                          750000
Host_Popularity_percentage     750000
Publication_Day                750000
Publication_Time               750000
Guest_Popularity_percentage    603970
Number_of_Ads                  749999
Episode_Sentiment              750000
dtype: int64

In [54]:
# Display the feature frame as it stands after the target was split off.
train_df

Unnamed: 0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment
0,Mystery Matters,98,,True Crime,74.81,Thursday,Night,,0.0,Positive
1,Joke Junction,26,119.80,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative
2,Study Sessions,16,73.90,Education,69.97,Tuesday,Evening,8.97,0.0,Negative
3,Digital Digest,45,67.17,Technology,57.22,Monday,Morning,78.70,2.0,Positive
4,Mind & Body,86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral
...,...,...,...,...,...,...,...,...,...,...
749995,Learning Lab,25,75.66,Education,69.36,Saturday,Morning,,0.0,Negative
749996,Business Briefs,21,75.75,Business,35.21,Saturday,Night,,2.0,Neutral
749997,Lifestyle Lounge,51,30.98,Lifestyle,78.58,Thursday,Morning,84.89,0.0,Negative
749998,Style Guide,47,108.98,Lifestyle,45.39,Thursday,Morning,93.27,0.0,Negative


In [56]:
# Encode sentiment as a signed integer; labels outside this mapping become NaN.
sentiment_codes = {'Positive': 1, 'Negative': -1, 'Neutral': 0}
train_df['Episode_Sentiment'] = train_df['Episode_Sentiment'].map(sentiment_codes)

# Mean of host and guest popularity (NaN whenever either side is missing).
host_popularity = train_df['Host_Popularity_percentage']
guest_popularity = train_df['Guest_Popularity_percentage']
train_df['Popularity_Average'] = (host_popularity + guest_popularity) / 2

# NOTE(review): despite the column name, this is ads divided by episode
# length (ads per minute), not length per ad.
# NOTE(review): neither derived column is listed in numerical_cols or
# categorical_cols in the pipeline cell, so the model does not appear to
# use them — confirm whether that is intended.
train_df['Episode_Length/Ads'] = train_df['Number_of_Ads'].div(train_df['Episode_Length_minutes'])

In [57]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns handed to the model. ColumnTransformer drops every column that is
# not listed in one of its transformers (remainder defaults to 'drop').
# NOTE(review): Host_Popularity_percentage, Publication_Day and the derived
# Popularity_Average / Episode_Length/Ads columns are therefore not seen by
# the regressor — confirm that this is intended.
numerical_cols = ['Episode_Length_minutes', 'Guest_Popularity_percentage', 'Number_of_Ads', 'Episode_Sentiment', 'Episode_Title']
categorical_cols = ['Podcast_Name', 'Genre', 'Publication_Time']

# Numeric branch: mean-impute missing values (SimpleImputer's default
# strategy, spelled out here), then scale with RobustScaler.
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", sklearn.preprocessing.RobustScaler())
])

# Categorical branch: dense one-hot encoding. handle_unknown="ignore" makes a
# category that was not present during fit encode as all zeros instead of
# raising an error at predict time.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

full_pipeline = Pipeline(steps=[
    ("preprocessing", preprocessor),
    # Alternative regressors that were tried in this slot:
    # ("regressor", sklearn.neural_network.MLPRegressor(hidden_layer_sizes=16, max_iter=500, early_stopping=True))
    # ("regressor", keras_regressor)
    ("regressor", xgboost.XGBRegressor(
        n_estimators=930,
        learning_rate=0.031629338917482916,
        max_depth=15,
        subsample=0.9468611176708397,
        colsample_bytree=0.8042585491461298,
        reg_alpha=9.193697404267928,
        reg_lambda=2.4499368941014024,
        min_child_weight=3,
    )),
])

# Pipeline.fit returns the fitted pipeline itself, so X is simply another
# name for full_pipeline (it is not a transformed feature matrix).
X = full_pipeline.fit(train_df, y)

In [58]:
# Load the test set the same way as the training set: drop 'id', then replace
# each episode title with the value returned by map_episode_title.
test_df = pd.read_csv('test.csv').drop(columns=['id'])
test_df['Episode_Title'] = test_df['Episode_Title'].map(map_episode_title)

In [59]:
# Blank out individual test cells, identified by (row label, column). Both
# columns go through the pipeline's imputer, which then fills the gaps.
# NOTE(review): presumably these rows held invalid/outlier values — confirm
# why exactly these four cells were chosen.
cells_to_blank = (
    (56597, 'Episode_Length_minutes'),
    (54434, 'Episode_Length_minutes'),
    (178393, 'Number_of_Ads'),
    (37939, 'Number_of_Ads'),
)
for row_label, column_name in cells_to_blank:
    test_df.loc[row_label, [column_name]] = None

In [60]:
# Same feature steps as applied to the training frame.
# Encode sentiment as a signed integer; labels outside this mapping become NaN.
sentiment_codes = {'Positive': 1, 'Negative': -1, 'Neutral': 0}
test_df['Episode_Sentiment'] = test_df['Episode_Sentiment'].map(sentiment_codes)

# Mean of host and guest popularity (NaN whenever either side is missing).
host_popularity = test_df['Host_Popularity_percentage']
guest_popularity = test_df['Guest_Popularity_percentage']
test_df['Popularity_Average'] = (host_popularity + guest_popularity) / 2

# NOTE(review): despite the column name, this is ads divided by episode
# length (ads per minute), not length per ad.
test_df['Episode_Length/Ads'] = test_df['Number_of_Ads'].div(test_df['Episode_Length_minutes'])

In [61]:
# Run the fitted pipeline (preprocessing + regressor) on the test features.
preds = full_pipeline.predict(test_df)

In [62]:
# Build the submission frame. The ids are generated rather than read from
# test.csv: they start at 750000 and increase by one per prediction, so this
# relies on the test rows being in id order without gaps.
first_test_id = 750000
submission_ids = list(range(first_test_id, first_test_id + len(preds)))
ans_df = pd.DataFrame({
    'id': submission_ids,
    'Listening_Time_minutes': preds,
})

In [63]:
# Display only: the result of set_index is not assigned, so ans_df itself
# keeps 'id' as a regular column.
ans_df.set_index('id')

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,53.036953
750001,19.765808
750002,49.292244
750003,75.722664
750004,48.389900
...,...
999995,12.220708
999996,62.146507
999997,8.741622
999998,83.679420


In [64]:
# Write the submission file; index=False keeps the row index out of the CSV.
ans_df.to_csv('ans.csv',index=False)

In [31]:
import os
# Select the torch backend for Keras. The variable is set before
# `import keras` on purpose: Keras picks its backend when it is imported.
os.environ['KERAS_BACKEND'] = 'torch'
import keras

In [32]:
def build_keras_model(X, y):
    """Build and compile a small fully connected regression network.

    The network is 64 -> 64 -> 32 -> 16 -> 1 dense units with ReLU on the
    hidden layers and dropout (0.2) after each of the two 64-unit layers.
    It is compiled with the Adam optimizer, MSE loss and an RMSE metric.

    Args:
        X: Feature matrix supplied by the caller. When it exposes a
            two-dimensional ``shape``, its second entry is used as the input
            width; otherwise the previous fixed width of 72 is used.
        y: Targets. Accepted for interface compatibility; not used here.

    Returns:
        The compiled ``keras.models.Sequential`` model.
    """
    # Take the input width from the data instead of hard-coding it, so the
    # model keeps working when the preprocessing emits a different number of
    # columns. Fall back to 72 when X carries no usable 2-D shape.
    shape = getattr(X, "shape", None)
    if shape is not None and len(shape) == 2:
        n_features_in = int(shape[1])
    else:
        n_features_in = 72

    model = keras.models.Sequential([
        keras.layers.Input(shape=(n_features_in,)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(16, activation='relu'),
        keras.layers.Dense(1)
    ])

    model.compile(
        optimizer='adam',
        loss='mse',  # train on MSE; RMSE is reported through the metric below
        metrics=[keras.metrics.RootMeanSquaredError()]
    )

    return model

# scikit-learn compatible wrapper around build_keras_model.
# EarlyStopping monitors "val_loss", which only exists when validation data
# is supplied to fit; without it the callback can never trigger. A 10%
# validation split is therefore held out so the callback has a metric to
# watch.
keras_regressor = keras.wrappers.SKLearnRegressor(
    model=build_keras_model,
    fit_kwargs={
        "callbacks": [keras.callbacks.EarlyStopping(monitor="val_loss")],
        "validation_split": 0.1,
        "batch_size": 128,
        "epochs": 5,
    }
)

In [33]:
# NOTE(review): `personal_regressor` is not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh run. Presumably
# `keras_regressor` was meant — confirm, or delete this cell.
personal_regressor

NameError: name 'personal_regressor' is not defined