In [None]:
pip install pandas
pip install tqdm

In [None]:
import os
import shutil
import random
from pathlib import Path
import librosa
import soundfile as sf
import pandas as pd
from tqdm import tqdm

# Input locations for the two classes and the root of the generated dataset
# (edit these to match your machine).
real_src = r"D:\Devanshi_Himanshi_SingFox\_6_Tracks\T5\real"
fake_src = r"D:\Devanshi_Himanshi_SingFox\_6_Tracks\T5\fake"
target_root = r"D:\Devanshi_Himanshi_SingFox\SingFox_ICASSP\T5"

# Partition names and class names; together they define the output layout
# <target_root>/<split>/<label>/.
splits = ['train', 'val', 'test']
labels = ['real', 'fake']

# Fractions for train / val / test, in that order.
split_ratio = [0.7, 0.15, 0.15]
# File-name suffixes treated as audio when scanning the source folders.
supported_ext = (
    '.mp3',
    '.wav',
    '.flac',
    '.WAV',
    '.m4a',
    '.aac',
    '.ogg',
)
# Sample rate (Hz) passed to the loader and writer during conversion.
target_sr = 16000

# Make sure every <split>/<label> output folder exists before processing.
for split in splits:
    for label in labels:
        Path(target_root, split, label).mkdir(parents=True, exist_ok=True)

def collect_files(src_root, label):
    """Walk ``src_root`` recursively and describe every supported audio file.

    A file is kept when its lower-cased name ends with one of the suffixes in
    the module-level ``supported_ext`` tuple.

    Args:
        src_root: Directory to scan (sub-directories included).
        label: Class name stored verbatim in each record.

    Returns:
        A list of dicts with the keys ``original_path``,
        ``original_filename``, ``language`` and ``label``.
    """
    records = []
    for dirpath, _subdirs, names in os.walk(src_root):
        for name in names:
            if not name.lower().endswith(supported_ext):
                continue
            # The language tag is the name of the folder that directly
            # contains the file; "unknown" is used when the directory path
            # has fewer than two components.
            components = Path(dirpath).parts
            language = "unknown" if len(components) < 2 else components[-1]
            records.append(dict(
                original_path=os.path.join(dirpath, name),
                original_filename=name,
                language=language,
                label=label,
            ))
    return records

def convert_to_wav_librosa(src_path, dest_path):
    """Load ``src_path`` as mono audio at ``target_sr`` and write it to ``dest_path``.

    Args:
        src_path: Audio file to read.
        dest_path: Output file path handed to ``soundfile.write``.
    """
    # librosa returns (samples, sample_rate); the rate is already known
    # because it was requested explicitly, so only the samples are kept.
    loaded = librosa.load(src_path, sr=target_sr, mono=True)
    samples = loaded[0]
    sf.write(dest_path, samples, target_sr)

def process_and_split(file_list, label, seed=None):
    """Convert one class of files to WAV and distribute them over the splits.

    The records are shuffled, assigned to ``train``/``val``/``test`` according
    to the module-level ``split_ratio`` (the test split receives whatever is
    left after rounding the other two down), converted with
    ``convert_to_wav_librosa`` and written to
    ``<target_root>/<split>/<label>/<stem>.wav``. A numeric suffix is appended
    when a file with the same name already exists in the destination.

    Source files that are missing or fail to convert are printed, skipped and
    listed in ``<target_root>/missing_files_<label>.txt``.

    Args:
        file_list: Records as produced by ``collect_files``; each must have
            the keys ``original_path`` and ``language``. The list itself is
            not modified.
        label: Class name; used for the output folder, the progress bar and
            the ``label`` column of the result.
        seed: Optional seed for a private random generator so the split is
            reproducible. With ``None`` (default) the global ``random`` state
            is used, exactly as before.

    Returns:
        A list of dicts (one per successfully converted file) with the keys
        ``original_filename``, ``language``, ``original_path``, ``label``,
        ``split`` and ``saved_path`` (relative to ``target_root``).
    """
    # Shuffle a copy: reordering the caller's list in place is a surprising
    # side effect for anyone who keeps using it afterwards.
    shuffled = list(file_list)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total = len(shuffled)
    n_train = int(total * split_ratio[0])
    n_val = int(total * split_ratio[1])
    n_test = total - n_train - n_val
    splits_list = ['train'] * n_train + ['val'] * n_val + ['test'] * n_test

    dataset_info = []
    missing_files = []

    print(f"🔄 Processing {label.upper()} files...")
    for file_info, split in tqdm(zip(shuffled, splits_list), total=len(splits_list), desc=label.upper()):
        src_file = file_info['original_path']

        # The file may have disappeared between scanning and processing.
        if not os.path.exists(src_file):
            print(f"⚠️ Missing file: {src_file}")
            missing_files.append(src_file)
            continue

        base_name = Path(src_file).stem
        dest_dir = os.path.join(target_root, split, label)
        dest_path = os.path.join(dest_dir, f"{base_name}.wav")

        # Never overwrite: pick the first free "<stem>_<i>.wav" name instead.
        i = 1
        while os.path.exists(dest_path):
            dest_path = os.path.join(dest_dir, f"{base_name}_{i}.wav")
            i += 1

        try:
            convert_to_wav_librosa(src_file, dest_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"⚠️ Failed to convert {src_file}: {e}")
            missing_files.append(src_file)
            # dest_path did not exist before the attempt, so anything there
            # now is a partial write from the failed conversion; remove it so
            # the split folder only holds files listed in the manifest.
            try:
                if os.path.exists(dest_path):
                    os.remove(dest_path)
            except OSError:
                pass
        else:
            dataset_info.append({
                'original_filename': Path(src_file).name,
                'language': file_info['language'],
                'original_path': src_file,
                'label': label,
                'split': split,
                'saved_path': os.path.relpath(dest_path, target_root)
            })

    if missing_files:
        miss_log_path = os.path.join(target_root, f'missing_files_{label}.txt')
        with open(miss_log_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{path}\n" for path in missing_files)
        print(f"📝 Missing file list saved to: {miss_log_path}")

    return dataset_info

# Pipeline entry point: scan both source trees first, then convert and split
# each class in turn.
real_files = collect_files(real_src, 'real')
fake_files = collect_files(fake_src, 'fake')

info_real = process_and_split(real_files, 'real')
info_fake = process_and_split(fake_files, 'fake')

# Write one manifest covering both classes (real rows first) as an Excel
# sheet in the dataset root.
excel_path = os.path.join(target_root, 'dataset_info.xlsx')
df = pd.DataFrame([*info_real, *info_fake])
df.to_excel(excel_path, index=False)

print("✅ Dataset prepared and Excel saved.")
