In [65]:
import os
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from tqdm import tqdm

In [66]:
TARGET_SR = 16000  # Target sample rate
MAX_DURATION = 60  # seconds
N_MFCC = 40        # Number of MFCCs to extract

In [67]:
def load_and_preprocess_audio(file_path, target_sr=TARGET_SR, max_duration=MAX_DURATION):
    try:
        audio, sr = librosa.load(file_path, sr=target_sr)
        max_len = target_sr * max_duration

        # Normalize audio
        audio = librosa.util.normalize(audio)

        # Pad or truncate
        if len(audio) < max_len:
            pad_width = max_len - len(audio)
            audio = np.pad(audio, (0, pad_width))
        else:
            audio = audio[:max_len]

        return audio
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None


In [68]:
def extract_mfcc(audio, sr=TARGET_SR, n_mfcc=N_MFCC):
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    return mfcc

In [69]:
def extract_log_melspec(audio, sr=TARGET_SR, n_mels=64):
    mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=n_mels)
    log_mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    return log_mel_spec

In [70]:
def preprocess_all_audio(dataframe, audio_dir, feature_type='mfcc'):
    features = []

    for fname in tqdm(dataframe['filename']):
        path = os.path.join(audio_dir, fname)
        audio = load_and_preprocess_audio(path)

        if audio is not None:
            if feature_type == 'mfcc':
                feat = extract_mfcc(audio)
            elif feature_type == 'log_mel':
                feat = extract_log_melspec(audio)
            else:
                raise ValueError("Invalid feature type")
            
            # Flatten or aggregate (e.g., mean over time)
            feat = np.mean(feat, axis=1)
            features.append(feat)
        else:
            features.append(np.zeros(N_MFCC))  # Handle errors with zero vector

    return np.array(features)

In [71]:
df_train = pd.read_csv('train.csv')
audio_dir = 'audios_train'  # adjust path accordingly

X = preprocess_all_audio(df_train, audio_dir, feature_type='mfcc')
y = df_train['label'].values

100%|████████████████████████████████████████████████████████████████████████████████| 444/444 [01:08<00:00,  6.53it/s]


In [72]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
import lightgbm as lgb
from tqdm import tqdm

In [73]:
df_test = pd.read_csv('test.csv')
audio_dir_t = 'audios_test'
X_test=preprocess_all_audio(df_test, audio_dir_t, feature_type='mfcc')

100%|████████████████████████████████████████████████████████████████████████████████| 195/195 [00:30<00:00,  6.50it/s]


In [78]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
val_preds = np.zeros(len(X))
test_preds = np.zeros(len(X_test))
fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\n📂 Fold {fold + 1}")
    X_train_fold, y_train_fold = X_scaled[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X_scaled[val_idx], y[val_idx]

    model = lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.01,
        max_depth=3,
        num_leaves=8,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )

    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100)
        ]
    )

    val_pred = model.predict(X_val_fold)
    val_preds[val_idx] = val_pred

    score = pearsonr(y_val_fold, val_pred)[0]
    fold_scores.append(score)
    print(f"✅ Fold {fold + 1} Pearson Correlation: {score:.4f}")

    test_preds += model.predict(X_test_scaled) / kf.n_splits

# Final CV results
final_score = pearsonr(y, val_preds)[0]
rmse = np.sqrt(mean_squared_error(y, val_preds))
print("\n📊 Final Pearson Correlation:", round(final_score, 4))
print("📉 Final RMSE:", round(rmse, 4))
print("Per Fold Scores:", [round(s, 4) for s in fold_scores])


📂 Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4771
[LightGBM] [Info] Number of data points in the train set: 355, number of used features: 40
[LightGBM] [Info] Start training from score 3.635211
Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 1.24206
[200]	valid_0's l2: 1.20868
[300]	valid_0's l2: 1.20726
Early stopping, best iteration is:
[258]	valid_0's l2: 1.20095
✅ Fold 1 Pearson Correlation: 0.3485

📂 Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000880 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4770
[LightGBM] [Info] Number of data points in the train set: 355, number of used features: 40
[LightGBM] [Info] Start training from score 3.622535
Training until validation scores don't improve for 50 r

In [79]:
submission = pd.DataFrame({
    'filename': df_test['filename'],
    'label': test_preds
})
submission['label'] = np.clip(np.round(submission['label']), 0, 5).astype(int)
submission.to_csv('submission.csv', index=False)
print("✅ Submission saved as submission.csv")

✅ Submission saved as submission.csv


In [80]:
from sklearn.metrics import accuracy_score
import numpy as np

# Round predictions and true values
y_pred_rounded = np.clip(np.round(val_preds), 1, 5).astype(int)
y_true_rounded = np.clip(np.round(y), 1, 5).astype(int)

# Classification-style accuracy
acc = accuracy_score(y_true_rounded, y_pred_rounded)
print("✅ Train Accuracy (rounded):", round(acc * 100, 2), "%")

✅ Train Accuracy (rounded): 33.11 %
