In [3]:
!pip install lightgbm --quiet


In [4]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder


In [5]:
# Adjust paths if running locally. On Kaggle, just use '../input/...'
TRAIN_PATH = "train.csv"
TEST_PATH  = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample = pd.read_csv(SAMPLE_SUB_PATH)

print("Train shape:", train.shape)
print("Test shape :", test.shape)
train.head()


Train shape: (524164, 11)
Test shape : (174722, 10)


Unnamed: 0,id,RhythmScore,AudioLoudness,VocalContent,AcousticQuality,InstrumentalScore,LivePerformanceLikelihood,MoodScore,TrackDurationMs,Energy,BeatsPerMinute
0,0,0.60361,-7.636942,0.0235,5e-06,1e-06,0.051385,0.409866,290715.645,0.826267,147.5302
1,1,0.639451,-16.267598,0.07152,0.444929,0.349414,0.170522,0.65101,164519.5174,0.1454,136.15963
2,2,0.514538,-15.953575,0.110715,0.173699,0.453814,0.029576,0.423865,174495.5667,0.624667,55.31989
3,3,0.734463,-1.357,0.052965,0.001651,0.159717,0.086366,0.278745,225567.4651,0.487467,147.91212
4,4,0.532968,-13.056437,0.0235,0.068687,1e-06,0.331345,0.477769,213960.6789,0.947333,89.58511


In [6]:
print("\n--- Dtypes ---")
print(train.dtypes.value_counts())
print("\nMissing values (train top 20):")
print(train.isna().sum().sort_values(ascending=False).head(20))

if "BeatsPerMinute" in train.columns:
    print("\nTarget summary:")
    print(train["BeatsPerMinute"].describe())



--- Dtypes ---
float64    10
int64       1
Name: count, dtype: int64

Missing values (train top 20):
id                           0
RhythmScore                  0
AudioLoudness                0
VocalContent                 0
AcousticQuality              0
InstrumentalScore            0
LivePerformanceLikelihood    0
MoodScore                    0
TrackDurationMs              0
Energy                       0
BeatsPerMinute               0
dtype: int64

Target summary:
count    524164.000000
mean        119.034899
std          26.468077
min          46.718000
25%         101.070410
50%         118.747660
75%         136.686590
max         206.037000
Name: BeatsPerMinute, dtype: float64


In [7]:
ID_COL = "ID" if "ID" in train.columns else train.columns[0]
TARGET = "BeatsPerMinute"

def prepare_features(df, is_train=True):
    X = df.copy()
    if ID_COL in X.columns:
        X = X.drop(columns=[ID_COL])
    if is_train and TARGET in X.columns:
        X = X.drop(columns=[TARGET])
    return X

X_full = pd.concat([prepare_features(train, True),
                    prepare_features(test, False)], axis=0, ignore_index=True)

numeric_cols = X_full.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_full.select_dtypes(exclude=[np.number]).columns.tolist()

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(cat_cols))


Numeric columns: 9
Categorical columns: 0


In [8]:
# Impute numerics
for c in numeric_cols:
    X_full[c].fillna(X_full[c].median(), inplace=True)

# Impute categoricals
for c in cat_cols:
    X_full[c].fillna("__MISSING__", inplace=True)

if cat_cols:
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_full[cat_cols] = enc.fit_transform(X_full[cat_cols])


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_full[c].fillna(X_full[c].median(), inplace=True)


In [9]:
# Missing counts per row
na_series = pd.concat([train.isna().sum(axis=1),
                       test.isna().sum(axis=1)], ignore_index=True)
X_full["num_missing_orig"] = na_series.values

# Row-level statistics (only if numerics exist)
if len(numeric_cols) > 0:
    X_full["row_mean"] = X_full[numeric_cols].mean(axis=1)
    X_full["row_std"]  = X_full[numeric_cols].std(axis=1).fillna(0)
    X_full["row_sum"]  = X_full[numeric_cols].sum(axis=1)


In [10]:
n_train = train.shape[0]
X = X_full.iloc[:n_train, :].reset_index(drop=True)
X_test = X_full.iloc[n_train:, :].reset_index(drop=True)
y = train[TARGET].values

print("Final X shape:", X.shape)
print("Final X_test shape:", X_test.shape)


Final X shape: (524164, 13)
Final X_test shape: (174722, 13)


In [16]:
from lightgbm import early_stopping, log_evaluation

use_log_target = False
y_train = np.log1p(y) if use_log_target else y.copy()

folds = 5
kf = KFold(n_splits=folds, shuffle=True, random_state=42)

oof_preds = np.zeros(n_train)
test_preds = np.zeros(X_test.shape[0])

lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "n_estimators": 10000,
    "random_state": 42,
    "verbosity": -1
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y_train)):
    print(f"Fold {fold+1}")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    valid_data = lgb.Dataset(X_val, label=y_val)

    clf = lgb.train(
        params=lgb_params,
        train_set=train_data,
        valid_sets=[train_data, valid_data],
        valid_names=["train","valid"],
        callbacks=[
            early_stopping(stopping_rounds=200),
            log_evaluation(period=100)
        ]
    )

    oof_preds[val_idx] = clf.predict(X_val, num_iteration=clf.best_iteration)
    test_preds += clf.predict(X_test, num_iteration=clf.best_iteration) / folds


Fold 1
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 26.3342	valid's rmse: 26.4428
[200]	train's rmse: 26.2171	valid's rmse: 26.4485
Early stopping, best iteration is:
[32]	train's rmse: 26.419	valid's rmse: 26.4393
Fold 2
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 26.3265	valid's rmse: 26.4879
[200]	train's rmse: 26.2098	valid's rmse: 26.492
Early stopping, best iteration is:
[33]	train's rmse: 26.4075	valid's rmse: 26.486
Fold 3
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 26.3106	valid's rmse: 26.5266
[200]	train's rmse: 26.1892	valid's rmse: 26.5313
Early stopping, best iteration is:
[23]	train's rmse: 26.4106	valid's rmse: 26.5258
Fold 4
Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 26.3371	valid's rmse: 26.4477
[200]	train's rmse: 26.2206	valid's rmse: 26.4545
Early stopping, best iteration is:
[60]	train's rmse: 26.3858	valid's rmse:

In [None]:
# Evaluate (compatible with older sklearn versions)
import numpy as np
from sklearn.metrics import mean_squared_error

# If you trained on log target, oof_preds are on log scale; convert back before scoring
if use_log_target:
    y_true = np.expm1(y_train)       # original-scale targets
    oof_true = np.expm1(oof_preds)   # original-scale oof preds
else:
    y_true = y_train
    oof_true = oof_preds

mse = mean_squared_error(y_true, oof_true)   # returns MSE
oof_rmse = np.sqrt(mse)                       # RMSE

print(f"OOF RMSE: {oof_rmse:.5f}")


OOF RMSE: 26.46124


In [18]:
final_test_pred = np.expm1(test_preds) if use_log_target else test_preds
final_test_pred = np.maximum(final_test_pred, 0.0)  # no negatives
final_test_pred = np.clip(final_test_pred, 30, 250) # realistic BPM range

ids = test[ID_COL].values
sub = pd.DataFrame({ID_COL: ids, "BeatsPerMinute": final_test_pred})

sub.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")
sub.head()


Submission file saved as submission.csv


Unnamed: 0,id,BeatsPerMinute
0,524164,119.174534
1,524165,118.735203
2,524166,119.409258
3,524167,119.280198
4,524168,119.556224
