In [1]:
import pandas as pd
import numpy as np
import os

In [31]:
# Load the train and test tables from the local ./data directory.
trainingSet = pd.read_csv('./data/train.csv')
testingSet = pd.read_csv('./data/test.csv')

# Report the dimensions of both tables, then leave one blank line.
print("train.csv shape is ", trainingSet.shape)
print("test.csv shape is ", testingSet.shape, end="\n\n")

# First rows of each table; one blank line between them and one after.
print(trainingSet.head(), testingSet.head(), sep="\n\n", end="\n\n")

# describe() summary of the training table.
print(trainingSet.describe())

train.csv shape is  (750000, 12)
test.csv shape is  (250000, 11)

   id     Podcast_Name Episode_Title  Episode_Length_minutes       Genre   
0   0  Mystery Matters    Episode 98                     NaN  True Crime  \
1   1    Joke Junction    Episode 26                  119.80      Comedy   
2   2   Study Sessions    Episode 16                   73.90   Education   
3   3   Digital Digest    Episode 45                   67.17  Technology   
4   4      Mind & Body    Episode 86                  110.51      Health   

   Host_Popularity_percentage Publication_Day Publication_Time   
0                       74.81        Thursday            Night  \
1                       66.95        Saturday        Afternoon   
2                       69.97         Tuesday          Evening   
3                       57.22          Monday          Morning   
4                       80.07          Monday        Afternoon   

   Guest_Popularity_percentage  Number_of_Ads Episode_Sentiment   
0            

In [19]:
# Draw a reproducible 40% sample of the training data and save it to CSV.
# The sample gets its own name instead of overwriting `trainingSet`:
# reassigning `trainingSet` here would shrink it again on every re-run of this
# cell and would silently change the data the later cells train on (the
# recorded LightGBM log reports 431238 training rows, more than a 40% sample
# of 750000 rows could provide, so that run used the full table).
training_sample = trainingSet.sample(frac=0.4, random_state=1)
training_sample.to_csv('./data/train_sample.csv', index=False)

In [32]:
# Preprocessing
# Remove the columns that are not used as model features.
train_data = trainingSet.drop(columns=['id', 'Podcast_Name', 'Episode_Title'])
test_data = testingSet.drop(columns=['id', 'Podcast_Name', 'Episode_Title'])

# Column groups: the categorical ones are one-hot encoded below, the
# continuous ones are standardized by the later scaling / prediction cells.
categorical_columns = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
continuous_columns = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage']

# One-hot encode the categorical variables.
train_data = pd.get_dummies(train_data, columns=categorical_columns)
test_data = pd.get_dummies(test_data, columns=categorical_columns)

# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different (or differently
# ordered) columns. Reindex the test frame to the training feature columns
# (all training columns except the target): dummy columns absent from test are
# added filled with 0, columns unseen in training are dropped, and the column
# order matches. Existing NaN values in test are left as they are.
feature_columns = train_data.columns.drop('Listening_Time_minutes')
test_data = test_data.reindex(columns=feature_columns, fill_value=0)

# Drop training rows that contain missing values.
train_data = train_data.dropna()



In [33]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the features from the regression target.
X = train_data.drop(columns=['Listening_Time_minutes'])
y = train_data['Listening_Time_minutes']

# Split BEFORE fitting the scaler, so the validation rows do not contribute
# to the mean / standard deviation used for preprocessing.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Explicit copies, so the column assignments below write to independent
# frames rather than to slices of X.
X_train = X_train.copy()
X_val = X_val.copy()

# Normalize only the continuous features, not all. The scaler is fitted on
# the training fold and then applied unchanged to the validation fold; the
# same fitted `scaler` is reused for the test set in the prediction cell.
scaler = StandardScaler()
X_train[continuous_columns] = scaler.fit_transform(X_train[continuous_columns])
X_val[continuous_columns] = scaler.transform(X_val[continuous_columns])

In [37]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# LightGBM hyperparameters (values unchanged).
params = {
    "reg_alpha": 0.006873126119178912,
    "reg_lambda": 0.145379437299848,
    "num_leaves": 85,
    "max_bin": 2**8,
    "n_estimators": 483,
    "learning_rate": 0.034522923546535,
    "colsample_bytree": 0.5935293824087292,
    "min_child_samples": 5,
    "random_state": 42
}

model = LGBMRegressor(**params)

model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# RMSE is computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas taking the square root explicitly works on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))
val_rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(f"Train RMSE: {train_rmse:.2f}")
print(f"Validation RMSE: {val_rmse:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 820
[LightGBM] [Info] Number of data points in the train set: 431238, number of used features: 28
[LightGBM] [Info] Start training from score 45.829445
Train RMSE: 10.17
Validation RMSE: 10.38


In [38]:
# Make predictions on the test set
# Work on a copy so the in-place column assignment below never modifies
# `test_data`; re-running this cell therefore cannot scale the data twice.
X_test = test_data.copy()

# Standardize the continuous columns with the scaler fitted on the training
# features. This is done BEFORE filling missing values: StandardScaler keeps
# NaN as NaN in transform, whereas filling with a raw 0 first would turn every
# missing value into a large negative z-score that never occurred in training.
X_test[continuous_columns] = scaler.transform(X_test[continuous_columns])

# Fill missing values. For the standardized continuous columns 0 corresponds
# to the training mean; all other columns are filled with 0.
X_test = X_test.fillna(0)

y_test_pred = model.predict(X_test)

# Save the predictions to CSV
submission = pd.DataFrame({'id': testingSet['id'], 'Listening_Time_minutes': y_test_pred})
submission.to_csv('./data/submission.csv', index=False)