In [3]:
# ================================
# COMBINE TRAIN AND TEST DATA CLEAN FILES
# Final Data Combination for Sinhala ASR Dataset
# ================================

import pandas as pd
import os
from datetime import datetime

print("🔗 ASR Data Combination Tool")
print("=" * 50)

🔗 ASR Data Combination Tool


In [4]:
# ================================
# FILE PATHS CONFIGURATION
# ================================

# Define input and output paths
train_file = "processed_asr_data/train_data_clean.csv"
test_file = "processed_asr_data/test_data_clean.csv"
output_dir = "processed_asr_data"
output_file = os.path.join(output_dir, "combined_asr_data.csv")

print(f"📁 Configuration:")
print(f"   📊 Train file: {train_file}")
print(f"   📊 Test file: {test_file}")
print(f"   📁 Output directory: {output_dir}")
print(f"   💾 Output file: {output_file}")

📁 Configuration:
   📊 Train file: processed_asr_data/train_data_clean.csv
   📊 Test file: processed_asr_data/test_data_clean.csv
   📁 Output directory: processed_asr_data
   💾 Output file: processed_asr_data\combined_asr_data.csv


In [5]:
# ================================
# DATA LOADING AND VALIDATION
# ================================

def load_and_validate_files(train_path, test_path):
    """Load and validate input files"""
    
    print("\n📊 Loading data files...")
    
    # Check if files exist
    if not os.path.exists(train_path):
        raise FileNotFoundError(f"❌ Train file not found: {train_path}")
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"❌ Test file not found: {test_path}")
    
    # Load the files
    try:
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        
        print(f"✅ Train data loaded: {len(train_df):,} samples")
        print(f"✅ Test data loaded: {len(test_df):,} samples")
        
        return train_df, test_df
        
    except Exception as e:
        raise Exception(f"❌ Error loading files: {e}")

# Load the data
train_df, test_df = load_and_validate_files(train_file, test_file)


📊 Loading data files...
✅ Train data loaded: 72,155 samples
✅ Test data loaded: 18,039 samples
✅ Train data loaded: 72,155 samples
✅ Test data loaded: 18,039 samples


In [6]:
# ================================
# DATA STRUCTURE ANALYSIS
# ================================

print(f"\n🔍 Data Structure Analysis:")
print(f"=" * 40)

print(f"📋 Train Data Structure:")
print(f"   Shape: {train_df.shape}")
print(f"   Columns: {list(train_df.columns)}")
print(f"   Data types: {train_df.dtypes.to_dict()}")

print(f"\n📋 Test Data Structure:")
print(f"   Shape: {test_df.shape}")
print(f"   Columns: {list(test_df.columns)}")
print(f"   Data types: {test_df.dtypes.to_dict()}")

# Check column compatibility
print(f"\n🔍 Column Compatibility Check:")
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)

if train_cols == test_cols:
    print("✅ Column names match perfectly")
    common_cols = list(train_df.columns)
else:
    print("⚠️ Column mismatch detected")
    common_cols = list(train_cols & test_cols)
    train_only = train_cols - test_cols
    test_only = test_cols - train_cols
    
    print(f"   🔗 Common columns: {common_cols}")
    if train_only:
        print(f"   🏋️ Train-only columns: {list(train_only)}")
    if test_only:
        print(f"   🧪 Test-only columns: {list(test_only)}")
    
    print(f"   💡 Will use common columns only")


🔍 Data Structure Analysis:
📋 Train Data Structure:
   Shape: (72155, 2)
   Columns: ['file', 'sentence_cleaned']
   Data types: {'file': dtype('O'), 'sentence_cleaned': dtype('O')}

📋 Test Data Structure:
   Shape: (18039, 2)
   Columns: ['file', 'sentence_cleaned']
   Data types: {'file': dtype('O'), 'sentence_cleaned': dtype('O')}

🔍 Column Compatibility Check:
✅ Column names match perfectly


In [7]:
# ================================
# SAMPLE DATA PREVIEW
# ================================

print(f"\n📝 Sample Data Preview:")
print(f"=" * 30)

print(f"\n🏋️ Train Data Sample:")
print(train_df.head(3))

print(f"\n🧪 Test Data Sample:")
print(test_df.head(3))


📝 Sample Data Preview:

🏋️ Train Data Sample:
                                  file  \
0  asr_sinhala/data/98/983e3c6613.flac   
1  asr_sinhala/data/29/29ab15c6d4.flac   
2  asr_sinhala/data/b0/b0072f9ac0.flac   

                                sentence_cleaned  
0   එය මිලිටරිය තුළ ති‍යන ප්‍රධානම ප්‍රතිමානයක්.  
1  සාහිත්‍යකරුවාට ඊට වැඩිය ලොකු වගකීමක් තියෙනවා.  
2                        ඕගොල්ලන්ට දකින්න ලැබෙයි  

🧪 Test Data Sample:
                                  file  \
0  asr_sinhala/data/b6/b63bfb7a34.flac   
1  asr_sinhala/data/f1/f134378295.flac   
2  asr_sinhala/data/d3/d3bf1e10b0.flac   

                                  sentence_cleaned  
0                                මෙකී සිව් මහ ධාතු  
1                           එදිනෙදා ජීවිතයේ සිදුවන  
2  අධිවේගී මාර්ගයේ මැද සේවා ස්ථානයක් ඉදි කළ පමණින්  


In [8]:
# ================================
# DATA PREPARATION AND COMBINATION
# ================================

def prepare_and_combine_data(train_data, test_data, include_split_column=False):
    """Prepare and combine train and test data"""
    
    print(f"\n🔗 Preparing data for combination...")
    
    # Use common columns only
    train_cols = set(train_data.columns)
    test_cols = set(test_data.columns)
    common_cols = list(train_cols & test_cols)
    
    train_prepared = train_data[common_cols].copy()
    test_prepared = test_data[common_cols].copy()
    
    # Add split indicator column (disabled by default)
    if include_split_column:
        train_prepared['data_split'] = 'train'
        test_prepared['data_split'] = 'test'
        print("✅ Added 'data_split' column")
    else:
        print("ℹ️ Skipping 'data_split' column - combining without split indicator")
    
    # Combine datasets
    print("🔗 Combining datasets...")
    combined_df = pd.concat([train_prepared, test_prepared], ignore_index=True)
    
    print(f"📊 Combination Results:")
    print(f"   🏋️ Train samples: {len(train_prepared):,}")
    print(f"   🧪 Test samples: {len(test_prepared):,}")
    print(f"   📈 Combined total: {len(combined_df):,}")
    print(f"   📋 Final columns: {list(combined_df.columns)}")
    
    return combined_df

# Combine the data WITHOUT split column
combined_df = prepare_and_combine_data(train_df, test_df, include_split_column=False)


🔗 Preparing data for combination...
ℹ️ Skipping 'data_split' column - combining without split indicator
🔗 Combining datasets...
📊 Combination Results:
   🏋️ Train samples: 72,155
   🧪 Test samples: 18,039
   📈 Combined total: 90,194
   📋 Final columns: ['sentence_cleaned', 'file']


In [9]:
# ================================
# DATA QUALITY CHECKS
# ================================

print(f"\n🔍 Data Quality Assessment:")
print(f"=" * 35)

# Check for duplicates
initial_count = len(combined_df)
duplicates = combined_df.duplicated().sum()
print(f"🔄 Duplicate Analysis:")
print(f"   Total rows: {initial_count:,}")
print(f"   Duplicate rows: {duplicates:,}")

if duplicates > 0:
    print(f"   🧹 Removing {duplicates:,} duplicates...")
    combined_df = combined_df.drop_duplicates()
    final_count = len(combined_df)
    print(f"   ✅ After deduplication: {final_count:,} samples")
    print(f"   📉 Removed: {initial_count - final_count:,} duplicates")
else:
    print(f"   ✅ No duplicates found")

# Check for missing values
print(f"\n🔍 Missing Value Analysis:")
missing_data = combined_df.isnull().sum()
total_missing = missing_data.sum()

if total_missing > 0:
    print(f"   ⚠️ Missing values detected:")
    for col, count in missing_data.items():
        if count > 0:
            percentage = (count / len(combined_df)) * 100
            print(f"      📊 {col}: {count:,} missing ({percentage:.1f}%)")
else:
    print(f"   ✅ No missing values found")

# Data split distribution (only if column exists)
if 'data_split' in combined_df.columns:
    print(f"\n📊 Data Split Distribution:")
    split_counts = combined_df['data_split'].value_counts()
    for split, count in split_counts.items():
        percentage = (count / len(combined_df)) * 100
        print(f"   {split.title()}: {count:,} samples ({percentage:.1f}%)")
else:
    print(f"\n📊 Combined Dataset:")
    print(f"   🔗 Total samples: {len(combined_df):,} (train + test combined)")
    print(f"   ℹ️ No split indicator column included")


🔍 Data Quality Assessment:
🔄 Duplicate Analysis:
   Total rows: 90,194
   Duplicate rows: 0
   ✅ No duplicates found

🔍 Missing Value Analysis:
   ✅ No missing values found

📊 Combined Dataset:
   🔗 Total samples: 90,194 (train + test combined)
   ℹ️ No split indicator column included


In [10]:
# ================================
# FINAL COMBINED DATA PREVIEW
# ================================

print(f"\n📝 Final Combined Dataset Preview:")
print(f"=" * 40)

print(f"📊 Dataset Shape: {combined_df.shape}")
print(f"📋 Columns: {list(combined_df.columns)}")

print(f"\n🔍 Sample Combined Data:")
print(combined_df.head(5))

print(f"\n📈 Dataset Statistics:")
print(f"   📊 Total samples: {len(combined_df):,}")
print(f"   📋 Features: {len(combined_df.columns)}")
print(f"   💾 Memory usage: {combined_df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")


📝 Final Combined Dataset Preview:
📊 Dataset Shape: (90194, 2)
📋 Columns: ['sentence_cleaned', 'file']

🔍 Sample Combined Data:
                                sentence_cleaned  \
0   එය මිලිටරිය තුළ ති‍යන ප්‍රධානම ප්‍රතිමානයක්.   
1  සාහිත්‍යකරුවාට ඊට වැඩිය ලොකු වගකීමක් තියෙනවා.   
2                        ඕගොල්ලන්ට දකින්න ලැබෙයි   
3                     බොදු බල සේනා හිමිවරුන් අතර   
4                            ඔබතුමා දන්නා විදියට   

                                  file  
0  asr_sinhala/data/98/983e3c6613.flac  
1  asr_sinhala/data/29/29ab15c6d4.flac  
2  asr_sinhala/data/b0/b0072f9ac0.flac  
3  asr_sinhala/data/85/85fc577f7b.flac  
4  asr_sinhala/data/7e/7e6bd7795f.flac  

📈 Dataset Statistics:
   📊 Total samples: 90,194
   📋 Features: 2
   💾 Memory usage: 23.81 MB


In [11]:
# ================================
# SAVE COMBINED DATA
# ================================

def save_combined_data(data, output_path):
    """Save combined data to file"""
    
    print(f"\n💾 Saving combined data...")
    
    # Create output directory if needed
    output_directory = os.path.dirname(output_path)
    if output_directory and not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)
        print(f"📁 Created output directory: {output_directory}")
    
    try:
        # Save to CSV
        data.to_csv(output_path, index=False)
        
        # Verify saved file
        file_size = os.path.getsize(output_path) / (1024**2)  # MB
        
        print(f"✅ Data saved successfully!")
        print(f"📁 File location: {output_path}")
        print(f"📊 File size: {file_size:.2f} MB")
        print(f"📈 Rows saved: {len(data):,}")
        
        return True
        
    except Exception as e:
        print(f"❌ Error saving file: {e}")
        return False

# Save the combined data
save_success = save_combined_data(combined_df, output_file)


💾 Saving combined data...
✅ Data saved successfully!
📁 File location: processed_asr_data\combined_asr_data.csv
📊 File size: 9.50 MB
📈 Rows saved: 90,194
✅ Data saved successfully!
📁 File location: processed_asr_data\combined_asr_data.csv
📊 File size: 9.50 MB
📈 Rows saved: 90,194


In [12]:
# ================================
# CREATE METADATA FILE
# ================================

def create_metadata_file(data, output_directory):
    """Create detailed metadata file"""
    
    metadata_path = os.path.join(output_directory, "dataset_metadata.txt")
    
    print(f"\n📄 Creating metadata file...")
    
    try:
        with open(metadata_path, 'w', encoding='utf-8') as f:
            f.write("COMBINED SINHALA ASR DATASET METADATA\n")
            f.write("=" * 60 + "\n\n")
            
            # Basic information
            f.write(f"Creation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Samples: {len(data):,}\n")
            f.write(f"Total Columns: {len(data.columns)}\n")
            f.write(f"Dataset Type: Combined (train + test without split indicator)\n\n")
            
            # Column information
            f.write("COLUMN INFORMATION:\n")
            f.write("-" * 30 + "\n")
            for i, col in enumerate(data.columns, 1):
                dtype = str(data[col].dtype)
                non_null = data[col].count()
                f.write(f"{i:2d}. {col:<20} | Type: {dtype:<10} | Non-null: {non_null:,}\n")
            
            # Data split information (only if column exists)
            if 'data_split' in data.columns:
                f.write(f"\nDATA SPLIT DISTRIBUTION:\n")
                f.write("-" * 30 + "\n")
                split_counts = data['data_split'].value_counts()
                for split, count in split_counts.items():
                    percentage = (count / len(data)) * 100
                    f.write(f"{split.title():<10}: {count:>8,} samples ({percentage:5.1f}%)\n")
            else:
                f.write(f"\nDATASET STRUCTURE:\n")
                f.write("-" * 30 + "\n")
                f.write(f"Combined Dataset: {len(data):,} total samples\n")
                f.write(f"Split Indicator: Not included (unified dataset)\n")
                f.write(f"Source: train_data_clean.csv + test_data_clean.csv\n")
            
            # Missing values
            f.write(f"\nMISSING VALUES SUMMARY:\n")
            f.write("-" * 30 + "\n")
            missing_summary = data.isnull().sum()
            for col, missing_count in missing_summary.items():
                if missing_count > 0:
                    percentage = (missing_count / len(data)) * 100
                    f.write(f"{col:<20}: {missing_count:>6,} missing ({percentage:5.1f}%)\n")
            if missing_summary.sum() == 0:
                f.write("No missing values found.\n")
            
            # Sample data
            f.write(f"\nSAMPLE DATA (First 5 rows):\n")
            f.write("-" * 50 + "\n")
            f.write(data.head(5).to_string())
            
            f.write(f"\n\nEND OF METADATA\n")
        
        print(f"✅ Metadata file created: {metadata_path}")
        return True
        
    except Exception as e:
        print(f"❌ Error creating metadata: {e}")
        return False

# Create metadata file
if save_success:
    create_metadata_file(combined_df, output_dir)


📄 Creating metadata file...
✅ Metadata file created: processed_asr_data\dataset_metadata.txt


In [13]:
# ================================
# VALIDATION AND SUMMARY
# ================================

def validate_output_file(file_path):
    """Validate the saved output file"""
    
    print(f"\n🔍 Validating output file...")
    
    if not os.path.exists(file_path):
        print(f"❌ Output file not found: {file_path}")
        return False
    
    try:
        # Load and verify
        validation_df = pd.read_csv(file_path)
        
        print(f"✅ File validation successful!")
        print(f"   📊 Loaded shape: {validation_df.shape}")
        print(f"   📋 Columns match: {list(validation_df.columns) == list(combined_df.columns)}")
        print(f"   🔢 Row count match: {len(validation_df) == len(combined_df)}")
        
        return True
        
    except Exception as e:
        print(f"❌ Validation failed: {e}")
        return False

# Validate the output
if save_success:
    validation_success = validate_output_file(output_file)


🔍 Validating output file...
✅ File validation successful!
   📊 Loaded shape: (90194, 2)
   📋 Columns match: True
   🔢 Row count match: True
✅ File validation successful!
   📊 Loaded shape: (90194, 2)
   📋 Columns match: True
   🔢 Row count match: True


In [14]:
# ================================
# FINAL SUMMARY REPORT
# ================================

print(f"\n" + "=" * 60)
print(f"🎉 DATA COMBINATION COMPLETED!")
print(f"=" * 60)

print(f"📊 PROCESSING SUMMARY:")
print(f"   📁 Input Files: 2 (train + test)")
print(f"   🏋️ Train Samples: {len(train_df):,}")
print(f"   🧪 Test Samples: {len(test_df):,}")
print(f"   📈 Combined Total: {len(combined_df):,}")
print(f"   📋 Final Columns: {len(combined_df.columns)}")

print(f"\n💾 OUTPUT FILES:")
print(f"   📊 Main Dataset: {output_file}")
print(f"   📄 Metadata: {os.path.join(output_dir, 'dataset_metadata.txt')}")

print(f"\n🔍 DATA QUALITY:")
duplicates_removed = initial_count - len(combined_df) if 'initial_count' in locals() else 0
print(f"   🧹 Duplicates Removed: {duplicates_removed:,}")
print(f"   ✅ Missing Values: {'None' if combined_df.isnull().sum().sum() == 0 else 'Present'}")
print(f"   💯 Data Integrity: {'✅ Verified' if save_success else '❌ Issues Found'}")

print(f"\n🚀 READY FOR ASR TRAINING!")
print(f"   Use '{output_file}' for your Sinhala ASR model training")
print(f"=" * 60)

# Display file locations for easy access
print(f"\n📁 Quick Access:")
print(f"   Combined Data: ./{output_file}")
print(f"   Metadata: ./{os.path.join(output_dir, 'dataset_metadata.txt')}")


🎉 DATA COMBINATION COMPLETED!
📊 PROCESSING SUMMARY:
   📁 Input Files: 2 (train + test)
   🏋️ Train Samples: 72,155
   🧪 Test Samples: 18,039
   📈 Combined Total: 90,194
   📋 Final Columns: 2

💾 OUTPUT FILES:
   📊 Main Dataset: processed_asr_data\combined_asr_data.csv
   📄 Metadata: processed_asr_data\dataset_metadata.txt

🔍 DATA QUALITY:
   🧹 Duplicates Removed: 0
   ✅ Missing Values: None
   💯 Data Integrity: ✅ Verified

🚀 READY FOR ASR TRAINING!
   Use 'processed_asr_data\combined_asr_data.csv' for your Sinhala ASR model training

📁 Quick Access:
   Combined Data: ./processed_asr_data\combined_asr_data.csv
   Metadata: ./processed_asr_data\dataset_metadata.txt


In [19]:
# ================================
# EXTRACT 10,000 SAMPLES FOR SINHALA ASR
# ================================

print(f"\n" + "=" * 60)
print(f"📊 EXTRACTING 10,000 SAMPLES")
print(f"=" * 60)

# Check available samples
total_samples = len(combined_df)
target_samples = 15000

print(f"📈 Dataset Info:")
print(f"   📊 Total available samples: {total_samples:,}")
print(f"   🎯 Target samples: {target_samples:,}")

if total_samples < target_samples:
    print(f"⚠️ Warning: Only {total_samples:,} samples available, less than requested {target_samples:,}")
    sample_count = total_samples
    print(f"   💡 Will extract all {total_samples:,} samples")
else:
    sample_count = target_samples
    print(f"   ✅ Sufficient samples available")

# Extract samples (random sampling for diversity)
print(f"\n🔀 Sampling Strategy:")
print(f"   📊 Using random sampling for diversity")

# Set random seed for reproducibility
import random
random.seed(42)

# Sample the data
sampled_df = combined_df.sample(n=sample_count, random_state=42).reset_index(drop=True)

print(f"   ✅ Extracted {len(sampled_df):,} samples")

# Define output file path
sample_output_file = os.path.join(output_dir, "15000-sinhala-asr-data.csv")

print(f"\n💾 Saving sampled data...")
print(f"   📁 Output file: {sample_output_file}")

# Save the sampled data
try:
    sampled_df.to_csv(sample_output_file, index=False)
    
    # Verify saved file
    file_size = os.path.getsize(sample_output_file) / (1024**2)  # MB
    
    print(f"✅ Sampled data saved successfully!")
    print(f"📊 File size: {file_size:.2f} MB")
    print(f"📈 Rows saved: {len(sampled_df):,}")
    print(f"📋 Columns: {list(sampled_df.columns)}")
    
    # Show sample preview
    print(f"\n📝 Sample Preview:")
    print(sampled_df.head(3))
    
    print(f"\n🎉 SUCCESS!")
    print(f"   📁 File saved: {sample_output_file}")
    print(f"   🚀 Ready for ASR training with {len(sampled_df):,} samples!")
    
except Exception as e:
    print(f"❌ Error saving sampled data: {e}")

print(f"=" * 60)


📊 EXTRACTING 10,000 SAMPLES
📈 Dataset Info:
   📊 Total available samples: 90,194
   🎯 Target samples: 15,000
   ✅ Sufficient samples available

🔀 Sampling Strategy:
   📊 Using random sampling for diversity
   ✅ Extracted 15,000 samples

💾 Saving sampled data...
   📁 Output file: processed_asr_data\15000-sinhala-asr-data.csv
✅ Sampled data saved successfully!
📊 File size: 1.58 MB
📈 Rows saved: 15,000
📋 Columns: ['sentence_cleaned', 'file']

📝 Sample Preview:
         sentence_cleaned                                 file
0       හැමෝම අනිවාර්යෙන්  asr_sinhala/data/a2/a22f6ae045.flac
1  ඇති හැකි ඇත්තෝ පෙරට ආහ  asr_sinhala/data/b9/b9666683a1.flac
2         බෞද්ධයන් වන අප,  asr_sinhala/data/66/66160f5aa8.flac

🎉 SUCCESS!
   📁 File saved: processed_asr_data\15000-sinhala-asr-data.csv
   🚀 Ready for ASR training with 15,000 samples!


In [20]:
# ================================
# SPLIT 15,000 SAMPLES INTO TRAIN/TEST (80/20)
# ================================

print(f"\n" + "=" * 60)
print(f"📊 SPLITTING 15,000 SAMPLES INTO TRAIN/TEST")
print(f"=" * 60)

# Check if sampled_df exists
if 'sampled_df' not in locals():
    print(f"❌ Error: sampled_df not found. Please run the sampling cell first.")
else:
    total_samples = len(sampled_df)
    
    print(f"📈 Split Configuration:")
    print(f"   📊 Total samples: {total_samples:,}")
    print(f"   🏋️ Train split: 80%")
    print(f"   🧪 Test split: 20%")
    
    # Calculate split sizes
    train_size = int(0.8 * total_samples)
    test_size = total_samples - train_size
    
    print(f"\n📊 Split Sizes:")
    print(f"   🏋️ Train samples: {train_size:,}")
    print(f"   🧪 Test samples: {test_size:,}")
    
    # Shuffle the data before splitting for better randomization
    shuffled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Split the data
    train_10k_df = shuffled_df[:train_size].reset_index(drop=True)
    test_10k_df = shuffled_df[train_size:].reset_index(drop=True)
    
    print(f"\n🔀 Data Split:")
    print(f"   ✅ Train data: {len(train_10k_df):,} samples")
    print(f"   ✅ Test data: {len(test_10k_df):,} samples")
    
    # Define output file paths
    train_10k_file = os.path.join(output_dir, "15-train.csv")
    test_10k_file = os.path.join(output_dir, "15-test.csv")
    
    print(f"\n💾 Saving split files...")
    print(f"   📁 Train file: {train_10k_file}")
    print(f"   📁 Test file: {test_10k_file}")
    
    # Save train file
    try:
        train_10k_df.to_csv(train_10k_file, index=False)
        train_file_size = os.path.getsize(train_10k_file) / (1024**2)  # MB
        print(f"   ✅ Train file saved: {train_file_size:.2f} MB")
        train_success = True
    except Exception as e:
        print(f"   ❌ Error saving train file: {e}")
        train_success = False
    
    # Save test file
    try:
        test_10k_df.to_csv(test_10k_file, index=False)
        test_file_size = os.path.getsize(test_10k_file) / (1024**2)  # MB
        print(f"   ✅ Test file saved: {test_file_size:.2f} MB")
        test_success = True
    except Exception as e:
        print(f"   ❌ Error saving test file: {e}")
        test_success = False
    
    if train_success and test_success:
        print(f"\n📝 Sample Previews:")
        print(f"\n🏋️ Train Data Sample:")
        print(train_10k_df.head(3))
        
        print(f"\n🧪 Test Data Sample:")
        print(test_10k_df.head(3))
        
        print(f"\n🎉 SPLIT COMPLETED SUCCESSFULLY!")
        print(f"   📁 Train file: {train_10k_file}")
        print(f"   📁 Test file: {test_10k_file}")
        print(f"   🏋️ Training samples: {len(train_10k_df):,} (80%)")
        print(f"   🧪 Testing samples: {len(test_10k_df):,} (20%)")
        print(f"   🚀 Ready for ASR model training and evaluation!")
    else:
        print(f"\n❌ Some files failed to save. Please check the errors above.")

print(f"=" * 60)


📊 SPLITTING 15,000 SAMPLES INTO TRAIN/TEST
📈 Split Configuration:
   📊 Total samples: 15,000
   🏋️ Train split: 80%
   🧪 Test split: 20%

📊 Split Sizes:
   🏋️ Train samples: 12,000
   🧪 Test samples: 3,000

🔀 Data Split:
   ✅ Train data: 12,000 samples
   ✅ Test data: 3,000 samples

💾 Saving split files...
   📁 Train file: processed_asr_data\15-train.csv
   📁 Test file: processed_asr_data\15-test.csv
   ✅ Train file saved: 1.27 MB
   ✅ Test file saved: 0.32 MB

📝 Sample Previews:

🏋️ Train Data Sample:
               sentence_cleaned                                 file
0         නමුත් කතන්දර කියන ඒවා  asr_sinhala/data/db/db176e4b7c.flac
1         අපට තිබුණේ පැළ සිටුවා  asr_sinhala/data/30/3070cc3c17.flac
2  නීති විරෝධී ලෙස ඔට්ටු ඇල්ලීම  asr_sinhala/data/1c/1cb59c2460.flac

🧪 Test Data Sample:
           sentence_cleaned                                 file
0        කවුරුවත් හොයා ගන්න  asr_sinhala/data/ab/ab01f6eea4.flac
1      එතුමාට දැන් තියෙන්නේ  asr_sinhala/data/24/2492478f22.flac