Data Cleaning 

In [2]:
import os
import glob
import re
from tqdm import tqdm
import numpy as np

# Folder that holds the raw ABC corpus to be profiled.
ABC_RAW = "C:/Users/Divya Kindi/OneDrive/Desktop/ML_Project/lmd_abc"

# Only *.abc files directly inside ABC_RAW are picked up (glob does not recurse here).
abc_pattern = os.path.join(ABC_RAW, "*.abc")
abc_files = glob.glob(abc_pattern)
print("Total ABC files found:", len(abc_files))

# Character count of every file that gets read; feeds the length statistics.
lengths = []

# How many files lack the "X:", "K:" and "T:" markers, respectively.
missing_X = missing_K = missing_T = 0

# How many files trip each corruption heuristic.
contains_null = contains_bad_unicode = suspicious_repeat = 0

# A "note line" consists solely of note letters (A-G, a-g), rests (z),
# accidentals (^ _ =), octave marks (, '), digits, whitespace, bar lines (|)
# and square brackets.
note_line_re = re.compile(r"^[A-Ga-gz\^_=,\d'\s|\[\]]+$")

def looks_like_music(text):
    """Return True when `text` has at least two lines matching `note_line_re`.

    The text is split on "\n" and each line is stripped of surrounding
    whitespace before matching; scanning stops at the second matching line.
    """
    hits = 0
    for raw_line in text.split("\n"):
        if note_line_re.match(raw_line.strip()) is None:
            continue
        hits += 1
        if hits == 2:
            return True
    return False

music_like = 0
unreadable = 0  # files that could not be read at all (I/O error, out of memory)

for path in tqdm(abc_files):
    try:
        # errors="replace" substitutes U+FFFD for undecodable bytes, so the
        # bad-unicode check below can actually detect them. (errors="ignore"
        # drops those bytes silently, which makes that check a no-op.)
        with open(path, "r", encoding="utf-8", errors="replace") as fh:
            text = fh.read()
    except (OSError, MemoryError):
        # Best effort: count the failure instead of hiding it, then move on.
        unreadable += 1
        continue

    lengths.append(len(text))

    # missing fields
    if "X:" not in text: missing_X += 1
    if "K:" not in text: missing_K += 1
    if "T:" not in text: missing_T += 1

    # corruption tests
    if "\x00" in text: contains_null += 1
    if "\ufffd" in text: contains_bad_unicode += 1
    # 41 or more repetitions of the same character in a row
    if re.search(r"(.)\1{40,}", text): suspicious_repeat += 1

    # music-like test
    if looks_like_music(text):
        music_like += 1

# Summary
print("\n==================== DATASET STATISTICS ====================\n")

print("Total files:", len(abc_files))
print("Unreadable files:", unreadable)
if lengths:
    print("Avg length:", int(np.mean(lengths)))
    print("Median length:", int(np.median(lengths)))
    print("Min length:", min(lengths))
    print("Max length:", max(lengths))
else:
    # np.mean/min/max fail or return NaN on an empty list.
    print("Length statistics: n/a (no readable files)")

print("\nMissing X: ", missing_X)
print("Missing K: ", missing_K)
print("Missing T: ", missing_T)

print("\nContains NULL bytes:", contains_null)
print("Contains � bad unicode:", contains_bad_unicode)
print("Contains repeated chars:", suspicious_repeat)

# Guard against an empty corpus (would otherwise divide by zero).
music_pct = round(100 * music_like / len(abc_files), 2) if abc_files else 0.0
print("\nLooks like music (>=2 note lines):", music_like)
print("Percentage music-like:", music_pct, "%")

print("\n============================================================\n")


Total ABC files found: 178553


100%|██████████| 178553/178553 [38:02<00:00, 78.22it/s]   




Total files: 178553
Avg length: 84686
Median length: 19068
Min length: 0
Max length: 1894875850

Missing X:  2633
Missing K:  2633
Missing T:  2633

Contains NULL bytes: 0
Contains � bad unicode: 0
Contains repeated chars: 25

Looks like music (>=2 note lines): 154242
Percentage music-like: 86.38 %




In [2]:
from tqdm import tqdm
import os
import shutil
import re

# SOURCE: folder with the raw ABC corpus.
# DEST: folder that receives the files that pass cleaning.
SOURCE = "C:/Users/Divya Kindi/OneDrive/Desktop/ML_Project/lmd_abc"
DEST = "C:/Users/Divya Kindi/OneDrive/Desktop/ML_Project/ABC-4_clean"

# Create the output folder (and any missing parents); an existing folder is fine.
os.makedirs(name=DEST, exist_ok=True)

# Line that is an ABC information field ("X:1", "T:Title", "K:C", "w:lyrics", "+:").
# The lookahead keeps music such as "c:|" or "c::" (a note followed by a repeat
# sign) from being mistaken for a field.
_ABC_FIELD_LINE_RE = re.compile(r"^[A-Za-z+]:(?![|:])")
# Inline field embedded in a music line, e.g. "[K:G]" or "[V:1]".
_ABC_INLINE_FIELD_RE = re.compile(r"\[[A-Za-z]:[^\]]*\]")
# Any ABC note letter.
_ABC_NOTE_RE = re.compile(r"[A-Ga-g]")

def is_music_like(content):
    """Return True when `content` has at least two lines that contain notes.

    Header/information-field lines, comment and directive lines (starting with
    "%"), trailing "%" comments and inline fields are ignored before looking
    for note letters. Without this, nearly every line of text (titles, key
    signatures, comments) contains a letter in A-G and the check passes for
    almost any file.

    Args:
        content: Full text of an ABC file.

    Returns:
        bool: True if two or more music lines contain a note letter.
    """
    note_lines = 0
    for line in content.splitlines():
        body = line.strip()
        if not body or body.startswith("%") or _ABC_FIELD_LINE_RE.match(body):
            continue
        # Drop a trailing comment, then inline fields, so only music remains.
        body = body.split("%", 1)[0]
        body = _ABC_INLINE_FIELD_RE.sub("", body)
        if _ABC_NOTE_RE.search(body):
            note_lines += 1
            if note_lines >= 2:
                return True
    return False

def is_corrupted(path, min_chars=50, max_chars=200000):
    """Decide whether the ABC file at `path` should be discarded.

    A file is treated as corrupted when any of these holds:
      * it cannot be read as strict UTF-8 (or cannot be opened at all);
      * it has fewer than `min_chars` characters after stripping whitespace;
      * it lacks an "X:" or a "K:" marker;
      * it contains a NUL character;
      * it has more than `max_chars` characters.

    Args:
        path: Path of the file to inspect.
        min_chars: Minimum stripped length for a usable file (default 50).
        max_chars: Maximum length for a usable file (default 200000).

    Returns:
        bool: True if the file should be rejected, False otherwise.
    """
    try:
        # UTF-8 needs at most 4 bytes per character, so a file this large on
        # disk is guaranteed to exceed max_chars once decoded. Rejecting it
        # here avoids loading multi-gigabyte files into memory; the verdict is
        # the same as reading it first.
        if os.path.getsize(path) > 4 * max_chars:
            return True
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except Exception:
        # Deliberately broad: any failure to stat, open or decode means "unusable".
        return True

    if len(content) > max_chars:
        return True
    if len(content.strip()) < min_chars:
        return True
    if "X:" not in content or "K:" not in content:
        return True
    if "\x00" in content:
        return True
    return False

count_kept = 0
count_removed = 0
corrupted_files = []
non_music_files = []
renamed_files = []  # kept files whose basename clashed with an earlier kept file

# Collect every .abc file under SOURCE, including subfolders.
all_files = []
for root, dirs, files in os.walk(SOURCE):
    for file in files:
        if file.endswith(".abc"):
            all_files.append(os.path.join(root, file))
# Fixed processing order, so any collision suffixes are assigned reproducibly.
all_files.sort()

# Destination names already used in this run. DEST is flat while the walk is
# recursive, so two sources may share a basename; without this bookkeeping the
# later copy would silently overwrite the earlier one. normcase makes the
# comparison case-insensitive on Windows.
claimed_names = set()

# Use tqdm to show progress
for path in tqdm(all_files, desc="Cleaning ABC files"):
    if is_corrupted(path):
        count_removed += 1
        corrupted_files.append(path)
        continue

    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    if not is_music_like(content):
        count_removed += 1
        non_music_files.append(path)
        continue

    dest_name = os.path.basename(path)
    if os.path.normcase(dest_name) in claimed_names:
        # Disambiguate with a numeric suffix: tune.abc -> tune_1.abc, tune_2.abc, ...
        stem, ext = os.path.splitext(dest_name)
        suffix = 1
        while os.path.normcase(stem + "_" + str(suffix) + ext) in claimed_names:
            suffix += 1
        dest_name = stem + "_" + str(suffix) + ext
        renamed_files.append(path)
    claimed_names.add(os.path.normcase(dest_name))

    shutil.copy2(path, os.path.join(DEST, dest_name))
    count_kept += 1

# Guard against an empty SOURCE (would otherwise divide by zero).
total_seen = count_kept + count_removed
kept_pct = round(count_kept / total_seen * 100, 2) if total_seen else 0.0

print("\n====== CLEANING COMPLETE ======")
print("Kept:", count_kept)
print("Removed:", count_removed)
print("Final size:", kept_pct, "%")

print("\nCorrupted files:", len(corrupted_files))
print("Non-music-like files:", len(non_music_files))
if renamed_files:
    print("Renamed to avoid name collisions:", len(renamed_files))


Cleaning ABC files: 100%|██████████| 178553/178553 [15:55<00:00, 186.88it/s] 


Kept: 175609
Removed: 2944
Final size: 98.35 %

Corrupted files: 2944
Non-music-like files: 0



