In [95]:
import os
import numpy as np
import pandas as pd
import librosa
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

def extract_features(file_path):
    try:
        y, sr = librosa.load(file_path, sr=22050)
        features = {
            "duration": librosa.get_duration(y=y, sr=sr),
            "rms": np.mean(librosa.feature.rms(y=y)),
            "zcr": np.mean(librosa.feature.zero_crossing_rate(y=y)),
            "tempo": librosa.beat.tempo(y=y, sr=sr)[0]
        }

        # MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f"mfcc_{i}_mean"] = np.mean(mfcc[i])
            features[f"mfcc_{i}_std"] = np.std(mfcc[i])

        # Chroma
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        features["chroma_mean"] = np.mean(chroma)
        features["chroma_std"] = np.std(chroma)

        # Spectral Contrast
        contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
        features["contrast_mean"] = np.mean(contrast)
        features["contrast_std"] = np.std(contrast)

        # Tonnetz
        tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
        features["tonnetz_mean"] = np.mean(tonnetz)
        features["tonnetz_std"] = np.std(tonnetz)

        return features

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Extract train features
train_features = []
for filename in tqdm(train_df['filename'], desc="Extracting train features"):
    feats = extract_features(os.path.join('audios_train', filename))
    if feats is not None:
        train_features.append(feats)


train_feats = pd.DataFrame(train_features)
train_feats["label"] = train_df.loc[train_feats.index, "label"]

# Extract test features
test_features = []
for filename in tqdm(test_df['filename'], desc="Extracting test features"):
    feats = extract_features(os.path.join('audios_test', filename))
    test_features.append(feats)


test_feats = pd.DataFrame(test_features)

# Handle missing values
train_feats.dropna(inplace=True)
test_feats.fillna(0, inplace=True)

# Split features and labels
X = train_feats.drop("label", axis=1)
y = train_feats["label"]
X_test = test_feats.copy()

# Standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Cross-validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
val_scores = []

# LightGBM model parameters
params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.01,
    "num_leaves": 64,
    "max_depth": -1,
    "feature_fraction": 0.85,
    "bagging_fraction": 0.8,
    "verbosity": -1,
    "seed": 42
}

# Train and validate
final_preds = np.zeros(len(X_test_scaled))
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
    params,
    train_data,
    valid_sets=[val_data],
    num_boost_round=2000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)


    val_pred = model.predict(X_val)
    score = r2_score(y_val, val_pred)
    val_scores.append(score)

    # Predict on test set and add to final_preds
    final_preds += model.predict(X_test_scaled) / kf.n_splits

print(f"\n✅ Average R² Score: {np.mean(val_scores):.4f}")

# Create submission
submission = pd.DataFrame({
    "filename": test_df["filename"],
    "label": final_preds.clip(1, 5)
})
submission.to_csv("submission2.csv", index=False)
print("📁 submission2.csv created.")


Extracting train features: 100%|██████████| 444/444 [34:08<00:00,  4.61s/it]
Extracting test features: 100%|██████████| 195/195 [13:54<00:00,  4.28s/it]


Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 1.05849
[200]	valid_0's rmse: 1.02706
[300]	valid_0's rmse: 1.0182
[400]	valid_0's rmse: 1.01707
[500]	valid_0's rmse: 1.01001
[600]	valid_0's rmse: 1.00308
[700]	valid_0's rmse: 1.0007
[800]	valid_0's rmse: 0.997943
[900]	valid_0's rmse: 0.996304
Early stopping, best iteration is:
[893]	valid_0's rmse: 0.996112
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.953623
[200]	valid_0's rmse: 0.935478
Early stopping, best iteration is:
[138]	valid_0's rmse: 0.93119
Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.933744
[200]	valid_0's rmse: 0.859128
[300]	valid_0's rmse: 0.823599
[400]	valid_0's rmse: 0.79913
[500]	valid_0's rmse: 0.789766
[600]	valid_0's rmse: 0.779637
[700]	valid_0's rmse: 0.776042
[800]	valid_0's rmse: 0.773738
[900]	valid_0's rmse: 0.773488
[1000]	valid_0's rmse: 0.7725
[1100]	valid_0's rmse: 0.772691
Early stopp