In [25]:
import os
import pandas as pd
import numpy as np
import librosa
import joblib
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [26]:
# ========== Preprocessing ==========
def preprocess_data(train_df, test_df, encoder_path='label_encoder.pkl'):
    """Encode the training labels and add a placeholder label to the test set.

    Both inputs are copied before any column is written, so the frames passed
    in by the caller are left untouched.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training table; must contain a ``label`` column.
    test_df : pandas.DataFrame
        Test table; a ``label`` column is added (or overwritten) with ``-1``.
    encoder_path : str, optional
        File the fitted ``LabelEncoder`` is written to with ``joblib.dump``.
        Defaults to ``'label_encoder.pkl'`` in the working directory.

    Returns
    -------
    tuple
        ``(train_df, test_df, label_encoder)``: the copy of the training frame
        with encoded labels, the copy of the test frame with the placeholder
        column, and the fitted encoder.
    """
    # Work on copies: writing a column into the caller's frame would make the
    # raw labels unrecoverable after the first call.
    train_df = train_df.copy()
    test_df = test_df.copy()

    label_encoder = LabelEncoder()
    train_df['label'] = label_encoder.fit_transform(train_df['label'])
    test_df['label'] = -1  # Placeholder
    joblib.dump(label_encoder, encoder_path)
    return train_df, test_df, label_encoder

In [30]:
# ========== Feature Extraction ==========
def extract_features(file_path, n_mfcc=13):
    """Summarise one audio file as a flat 1-D feature vector.

    The vector is the concatenation, in this order, of:
    row-wise means of the MFCC matrix, row-wise standard deviations of the
    MFCC matrix, row-wise means of the chroma matrix, the overall mean of the
    zero-crossing rate, and row-wise means of the spectral-contrast matrix.

    Any exception raised while loading or analysing the file is caught: the
    error is printed and an all-zero vector of length
    ``n_mfcc*2 + 12 + 1 + 7`` is returned instead.
    """
    try:
        # sr=None is forwarded to librosa.load (presumably to keep each file's
        # native sampling rate — confirm against the librosa docs).
        signal, rate = librosa.load(file_path, sr=None)

        mfcc_matrix = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
        chroma_matrix = librosa.feature.chroma_stft(y=signal, sr=rate)
        zcr_frames = librosa.feature.zero_crossing_rate(signal)
        contrast_matrix = librosa.feature.spectral_contrast(y=signal, sr=rate)

        # Collapse the second axis of every matrix so each file yields a
        # fixed-length vector.
        return np.hstack([
            np.mean(mfcc_matrix, axis=1),
            np.std(mfcc_matrix, axis=1),
            np.mean(chroma_matrix, axis=1),
            [np.mean(zcr_frames)],
            np.mean(contrast_matrix, axis=1),
        ])
    except Exception as err:
        print(f"Error with {file_path}: {err}")
        # Fallback sized as: 2 * n_mfcc (MFCC mean + std) + 12 + 1 + 7.
        return np.zeros(n_mfcc*2 + 12 + 1 + 7)

def extract_all_features(df, base_path="data/audios/", n_mfcc=13, is_train=True):
    """Build the feature matrix for every file listed in ``df["filename"]``.

    Each filename is resolved under ``base_path/train`` when ``is_train`` is
    true and under ``base_path/test`` otherwise, then handed to
    ``extract_features``. The per-file vectors are stacked with ``np.array``
    in the order the filenames appear in ``df``.
    """
    if is_train:
        subdir = "train"
    else:
        subdir = "test"

    rows = [
        extract_features(os.path.join(base_path, subdir, name), n_mfcc)
        for name in df["filename"]
    ]
    return np.array(rows)

In [31]:
# ========== Save/Load ==========
def save_model_and_scaler(model, scaler, model_path="model/random_forest_model.pkl", scaler_path="model/scaler.pkl"):
    """Persist a fitted model and its scaler with ``joblib.dump``.

    The parent directory of each target path is created when missing, so
    custom ``model_path`` / ``scaler_path`` values work as well as the default
    ``model/`` folder.

    Parameters
    ----------
    model : object
        Estimator to persist at ``model_path``.
    scaler : object
        Scaler to persist at ``scaler_path``.
    model_path, scaler_path : str
        Destination files for the two artifacts.
    """
    for artifact, path in ((model, model_path), (scaler, scaler_path)):
        parent = os.path.dirname(path)
        if parent:  # a bare filename has no directory component to create
            os.makedirs(parent, exist_ok=True)
        joblib.dump(artifact, path)

def load_model_and_scaler(model_path="model/random_forest_model.pkl", scaler_path="model/scaler.pkl"):
    """Read the model and the scaler back from disk with ``joblib.load``.

    Returns a ``(model, scaler)`` tuple; the model file is read first.
    """
    # NOTE(review): joblib.load deserialises arbitrary objects — only point
    # these paths at artifacts produced by this notebook.
    return joblib.load(model_path), joblib.load(scaler_path)

In [32]:
# ========== Main Execution ==========
# Load Data
# Read the train/test metadata tables from CSV. Later cells read a `filename`
# column from both frames and a `label` column from the training frame.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [33]:
# Preprocess
# Encode the training labels (preprocess_data also writes the fitted encoder to
# 'label_encoder.pkl'), then turn every training audio file into one feature row.
# NOTE(review): this cell rebinds train_df/test_df to the processed frames, so
# re-running it without reloading the CSVs feeds already-encoded labels back
# into a fresh encoder — re-run the load cell first.
train_df, test_df, label_encoder = preprocess_data(train_df, test_df)
X_features = extract_all_features(train_df, is_train=True)
# Encoded labels as a plain array, aligned row-for-row with X_features.
y_labels = train_df['label'].values

In [34]:
# Split and Normalize
# Hold out 20% of the rows for validation, stratified on the encoded labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_features,
    y_labels,
    test_size=0.2,
    random_state=42,
    stratify=y_labels,
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [35]:
# SMOTE
# Oversample the scaled training split only; the validation split is never
# passed to SMOTE.
# k_neighbors=1 — presumably chosen because the rarest classes have very few
# samples; TODO confirm against the class counts.
# NOTE(review): the grid search in the next cell builds its CV folds from this
# already-resampled data, so its CV scores may be optimistic — confirm whether
# resampling inside each fold is preferable.
smote = SMOTE(random_state=42, k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

In [36]:
# Train with GridSearch
# Base estimator whose hyper-parameters are tuned below.
model = RandomForestClassifier(class_weight='balanced', random_state=42)

# 3 x 3 x 2 x 2 = 36 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Shuffled, stratified 3-fold cross-validation with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_resampled, y_resampled)

# Estimator selected by the search.
best_model = grid_search.best_estimator_

Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [37]:
# Evaluate
def evaluate_model(m, x, y):
    """Print a classification report for ``m``'s predictions on ``x`` against ``y``.

    Parameters
    ----------
    m : object
        Fitted estimator exposing ``predict``.
    x : array-like
        Feature matrix passed to ``m.predict``.
    y : array-like
        True labels the predictions are compared with.
    """
    print(classification_report(y, m.predict(x)))


print("Validation Results:")
evaluate_model(best_model, X_val_scaled, y_val)

Validation Results:
              precision    recall  f1-score   support

           2       0.46      0.67      0.55         9
           3       0.25      0.20      0.22         5
           4       0.46      0.60      0.52        10
           5       0.00      0.00      0.00         4
           6       0.46      0.35      0.40        17
           7       0.14      0.07      0.10        14
           8       0.54      0.67      0.60        30

    accuracy                           0.45        89
   macro avg       0.33      0.37      0.34        89
weighted avg       0.41      0.45      0.42        89



In [38]:
# Save
# Persist the selected estimator together with the scaler fitted on the training split.
save_model_and_scaler(model=best_model, scaler=scaler)

In [39]:
# ========== Test Prediction ==========
def preprocess_test_only(test_df, label_encoder_path='label_encoder.pkl'):
    """Return a copy of ``test_df`` with a placeholder ``label`` column of ``-1``.

    The caller's frame is not modified; an existing ``label`` column is
    overwritten in the returned copy.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Test table to prepare.
    label_encoder_path : str, optional
        Accepted for backward compatibility; this function does not read it.

    Returns
    -------
    pandas.DataFrame
        New frame with ``label`` set to ``-1`` on every row.
    """
    return test_df.assign(label=-1)

In [40]:
# Load Model and Preprocess Test
# Inference uses the persisted artifacts only: the model and scaler come from
# load_model_and_scaler(), and the label encoder is read back from the file
# preprocess_data wrote, rather than relying on whatever is in kernel memory.
model, scaler = load_model_and_scaler()
label_encoder = joblib.load('label_encoder.pkl')
test_df = preprocess_test_only(test_df)
X_test = extract_all_features(test_df, is_train=False)
# Reuse the scaler fitted during training; it must not be refitted on test data.
X_test_scaled = scaler.transform(X_test)
y_pred = model.predict(X_test_scaled)
# Map the encoded class ids back to the original label values.
decoded_predictions = label_encoder.inverse_transform(y_pred)

In [41]:
# Save to CSV
# One row per test file: its filename plus the decoded predicted label.
submission_df = test_df[['filename']].assign(label=decoded_predictions)
submission_df.to_csv('predictions.csv', index=False)
print("✅ predictions.csv saved successfully.")

✅ predictions.csv saved successfully.
