In [None]:
import pandas as pd
import os
import shutil

# ==============================================================================
# Configuration: List of files to clean
# ==============================================================================
# Directory that holds the prepared CSV files. The trailing slash matters:
# file names are appended to it directly.
base_path = '../../data/ready/'

# Every file that will be de-duplicated.
# Note: if the sg_..._onset/diss/dur/daily files are generated from
# sg_..._all.csv, it is better to clean only 'all' and regenerate the subsets.
# Here each listed file is assumed to need independent cleaning.
files_to_clean = [
    base_path + csv_name
    for csv_name in (
        'flare_1975_2017.csv',
        'sg_1874_2025_onset.csv',
        'sg_1874_2025_diss.csv',
        'sg_1874_2025_dur.csv',
        'sg_1874_2025_daily.csv',
        'sg_1874_2025_all.csv',
    )
]

def clean_csv(file_path, float_format='%.6f'):
    """Remove exact-duplicate rows from a CSV file in place, backing it up first.

    The file is read with pandas' default encoding and, if that raises a
    decoding error, re-read as GBK. When no duplicate rows are found the file
    is left untouched and no backup is created. Otherwise the original is
    copied to ``<file_path>.bak`` and the de-duplicated data is written over
    the original without the index column.

    Progress and failures are reported with ``print``. Read, backup and save
    errors are caught and reported rather than raised, so a batch run can
    carry on with the next file.

    Args:
        file_path: Path of the CSV file to clean (``str`` or ``os.PathLike``).
        float_format: Format string applied to floating-point values on
            write. The default ``'%.6f'`` rounds every float to six decimal
            places; pass ``None`` to write floats with pandas' default
            formatting instead.

    Returns:
        None.
    """
    if not os.path.exists(file_path):
        print(f"Skipping (file not found): {file_path}")
        return

    print(f"Processing: {os.path.basename(file_path)}")

    # 1. Attempt to read. Only a decoding failure triggers the GBK retry;
    #    any other read error is reported as-is. KeyboardInterrupt/SystemExit
    #    are deliberately not caught.
    try:
        try:
            df = pd.read_csv(file_path)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='gbk')
    except Exception as e:
        print(f"  !!! Read failed: {e}")
        return

    original_count = len(df)

    # 2. Remove duplicates (exact match across all columns)
    df_clean = df.drop_duplicates()
    diff = original_count - len(df_clean)

    if diff == 0:
        print("  -> Data is clean, no changes needed.")
        return

    # 3. Backup and Save
    # os.fspath lets pathlib.Path inputs work with string concatenation.
    backup_path = os.fspath(file_path) + '.bak'
    try:
        # Create backup (an existing .bak with the same name is overwritten)
        shutil.copy2(file_path, backup_path)
        print(f"  -> Backup created: {os.path.basename(backup_path)}")

        # Overwrite the original file without the index column.
        # NOTE: to_csv uses pandas' default output encoding (UTF-8), so a file
        # that was read through the GBK fallback is re-saved as UTF-8.
        df_clean.to_csv(file_path, index=False, float_format=float_format)
        print(f"  -> Cleaned and saved. Removed {diff} duplicate rows.")

    except Exception as e:
        print(f"  !!! Save/Backup failed: {e}")

# ==============================================================================
# Execution
# ==============================================================================
# Run clean_csv on every configured path, in list order. clean_csv prints its
# own progress and reports missing files and read/save failures itself, so one
# bad file does not stop the remaining ones from being processed.
print("=== Starting raw data source cleaning ===\n")
for f in files_to_clean:
    clean_csv(f)
print("\n=== All tasks completed ===")
# This reminder is printed unconditionally; a .bak file only exists for files
# in which duplicates were actually found and removed.
print("Note: Original files backed up as *.csv.bak. Delete manually after verification.")