In [13]:
import pandas as pd
import numpy as np
import os
import csv

def optimize_and_convert_dataset(input_csv_path, output_path=None):
    """
    Optimizes memory usage of a pandas DataFrame by downcasting numeric types
    and converting low-cardinality string columns to categories.
    Handles variable separators (;, \t, ,) automatically.
    """
    
    print(f"--- Starting optimization for: {input_csv_path} ---")
    
    if not os.path.exists(input_csv_path):
        print(f"ERROR: File not found at {input_csv_path}")
        return

    # 1. Load Data with Robust Settings
    print("Attempting to load CSV with auto-detection of separator...")
    try:
        # 'sep=None' forces python engine to sniff the delimiter
        # 'on_bad_lines' skips lines that are truly broken (and warns us)
        # 'encoding' handles special German characters (umlauts like ä, ö, ü)
        df = pd.read_csv(
            input_csv_path, 
            sep=None, 
            engine='python', 
            on_bad_lines='warn', 
            encoding='utf-8', # Try 'latin-1' if this fails
            low_memory=False
        )
        print(f"Load successful! Found {len(df)} rows and {len(df.columns)} columns.")
        
    except Exception as e:
        print(f"Critical Error loading CSV: {e}")
        # Fallback: Try reading strictly with semicolon if auto-detect fails
        print("Retrying with explicit semicolon separator...")
        try:
            df = pd.read_csv(input_csv_path, sep=';', on_bad_lines='skip', encoding='latin-1', low_memory=False)
            print(f"Retry successful! Found {len(df)} rows.")
        except Exception as e2:
             print(f"Retry failed: {e2}")
             return

    # Calculate initial memory usage
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Initial memory usage: {start_mem:.2f} MB')

    # 2. Iterate through columns to optimize types
    for col in df.columns:
        col_type = df[col].dtype

        # A. Optimize Numeric Columns
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32) 
                else:
                    df[col] = df[col].astype(np.float64)

        # B. Optimize Object (String) Columns
        else:
            num_unique = len(df[col].unique())
            num_total = len(df[col])
            
            # If unique values are less than 50% of total rows, convert to category
            if num_unique / num_total < 0.5:
                df[col] = df[col].astype('category')

    # Calculate final memory usage
    end_mem = df.memory_usage().sum() / 1024**2
    print(f'Optimized memory usage: {end_mem:.2f} MB')
    print(f'Reduction: {100 * (start_mem - end_mem) / start_mem:.1f}%')

    # 3. Save to Parquet
    if output_path is None:
        output_path = input_csv_path.replace('.csv', '.parquet')
        
    print(f"Saving optimized data to: {output_path}...")
    try:
        df.to_parquet(output_path, engine='pyarrow', compression='snappy')
        print("Success! File saved.")
    except Exception as e:
        print(f"Error saving to Parquet: {e}")

# Get the current working directory of this notebook
current_dir = os.getcwd()

# REPLACE 'Dataset_Berlin_Marathon_1999-2025_original.csv' WITH YOUR ACTUAL FILENAME
filename = 'Dataset_Berlin_Marathon_1999-2025_original.csv' 

# Create the full absolute path safely
input_path = os.path.join(current_dir, filename)

# Run the optimization
optimize_and_convert_dataset(input_path)

--- Starting optimization for: C:\Users\netos\Downloads\Berlin\Dataset_Berlin_Marathon_1999-2025_original.csv ---
Attempting to load CSV with auto-detection of separator...
Critical Error loading CSV: The 'low_memory' option is not supported with the 'python' engine
Retrying with explicit semicolon separator...
Retry successful! Found 880779 rows.
Initial memory usage: 107.52 MB
Optimized memory usage: 40.31 MB
Reduction: 62.5%
Saving optimized data to: C:\Users\netos\Downloads\Berlin\Dataset_Berlin_Marathon_1999-2025_original.parquet...
Success! File saved.
