## 1. Imports & Setup

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from scipy.stats import pearsonr

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Base raw data directory (notebooks → ../data/raw)
DATA_DIR = os.path.join("..", "data", "raw")

# CSV paths
TRAIN_CSV = os.path.join(DATA_DIR, "csvs", "train.csv")
TEST_CSV  = os.path.join(DATA_DIR, "csvs", "test.csv")

# Audio directories
TRAIN_AUDIO_DIR = os.path.join(DATA_DIR, "audios", "train")
TEST_AUDIO_DIR  = os.path.join(DATA_DIR, "audios", "test")

# Feature file (created in Day 2)
FEATURES_CSV = os.path.join("..", "data", "processed", "train_audio_features.csv")

## 2. Load Features & Labels

In [3]:
# # Load extracted features
# X = pd.read_csv(FEATURES_CSV, index_col=0)

# # Load training labels
# train_df = pd.read_csv(TRAIN_CSV)
# y = train_df["label"]

# print("Feature matrix shape:", X.shape)
# print("Target vector shape:", y.shape)

In [4]:
features_df = pd.read_csv(FEATURES_CSV)
train_df = pd.read_csv(TRAIN_CSV)

print("Features shape:", features_df.shape)
print("Train CSV shape:", train_df.shape)

Features shape: (276, 36)
Train CSV shape: (409, 2)


In [5]:
train_df

Unnamed: 0,filename,label
0,audio_173,3.0
1,audio_138,3.0
2,audio_127,2.0
3,audio_95,2.0
4,audio_73,3.5
...,...,...
404,audio_72,3.0
405,audio_107,3.5
406,audio_271,3.0
407,audio_349,2.5


In [6]:
# Merge to keep only valid audio files
merged_df = features_df.merge(
    train_df,
    on="filename",
    how="inner"
)

print("Merged shape:", merged_df.shape)

Merged shape: (0, 37)


In [7]:
print("Features DF columns:")
print(features_df.columns.tolist())

print("\nTrain DF columns:")
print(train_df.columns.tolist())


Features DF columns:
['Unnamed: 0', 'filename', 'mfcc_0', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'chroma_0', 'chroma_1', 'chroma_2', 'chroma_3', 'chroma_4', 'chroma_5', 'chroma_6', 'chroma_7', 'chroma_8', 'chroma_9', 'chroma_10', 'chroma_11', 'contrast_0', 'contrast_1', 'contrast_2', 'contrast_3', 'contrast_4', 'contrast_5', 'contrast_6', 'zcr', 'rmse']

Train DF columns:
['filename', 'label']


## 3. Train–Validation Split

In [8]:
# X_train, X_val, y_train, y_val = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )