In [1]:
%pip install pandas 

import os
import pandas as pd

def split_csv_in_subfolders(root_dir, chunk_size_mb=25):
    """Split every oversized CSV under ``root_dir`` into smaller chunk files.

    Walks ``root_dir`` recursively. Each ``*.csv`` file larger than
    ``chunk_size_mb`` is loaded with pandas, written next to the original as
    ``<name>.csv_chunk_<i>.csv`` files, and the original is then deleted.

    Files whose name already ends in ``_chunk_<digits>.csv`` are skipped, so
    running the function again never splits earlier output a second time.

    Caveats:
        * Rows are divided by count, not by bytes, so the size of each chunk
          on disk is only approximately ``file size / number of chunks``.
        * Lines pandas cannot parse are skipped (``on_bad_lines='skip'``);
          because the original is removed afterwards, those lines are lost.

    Args:
        root_dir: Directory to walk.
        chunk_size_mb: Size threshold in MiB (1024 * 1024 bytes). Also used
            to decide how many chunks a file is cut into.

    Returns:
        None. Progress is reported with ``print``.
    """
    bytes_per_mb = 1024 * 1024
    csv_suffix = ".csv"

    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if not file.endswith(csv_suffix):
                continue

            # Skip chunks written by a previous run: they also end in ".csv"
            # and may be larger than the threshold, but must not be re-split.
            _stem, marker, index = file[:-len(csv_suffix)].rpartition("_chunk_")
            if marker and index.isascii() and index.isdigit():
                continue

            file_path = os.path.join(subdir, file)
            file_size = os.path.getsize(file_path) / bytes_per_mb  # Convert to MB
            if file_size <= chunk_size_mb:
                continue

            print(f"Splitting {file_path} ({file_size:.2f} MB)")

            # Read CSV with error handling; an unreadable file is left untouched.
            try:
                df = pd.read_csv(file_path, low_memory=False, on_bad_lines='skip')
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
                continue

            total_rows = len(df)
            num_chunks = int(file_size // chunk_size_mb) + 1
            # Never plan more chunks than there are rows; otherwise the
            # per-chunk row count is 0 and all but the last chunk are empty.
            num_chunks = max(1, min(num_chunks, total_rows))
            chunk_size = total_rows // num_chunks

            for i in range(num_chunks):
                start_idx = i * chunk_size
                # The last chunk absorbs the remainder of the integer division.
                end_idx = (i + 1) * chunk_size if i < num_chunks - 1 else total_rows
                chunk_file = os.path.join(subdir, f"{file}_chunk_{i}.csv")
                df.iloc[start_idx:end_idx].to_csv(chunk_file, index=False)
                print(f"Saved {chunk_file} ({os.path.getsize(chunk_file) / bytes_per_mb:.2f} MB)")

            # Only reached when every chunk was written without raising.
            os.remove(file_path)  # Remove original file after splitting
            print(f"Original file {file_path} removed.")

def merge_csv_in_subfolders(root_dir):
    """Merge ``<name>_chunk_<i>.csv`` files back into ``<name>`` per folder.

    Walks ``root_dir`` recursively. In every directory, files named
    ``<name>_chunk_<digits>.csv`` are grouped by ``<name>``; each group is
    concatenated in numeric chunk order and written to ``<name>`` in the same
    directory, after which the chunk files of that group are deleted.

    If any chunk of a group cannot be read, that group is left untouched
    (nothing is written and no chunk is deleted) so no data is lost.

    Args:
        root_dir: Directory to walk.

    Returns:
        None. Progress is reported with ``print``.
    """
    csv_suffix = ".csv"
    chunk_marker = "_chunk_"

    for subdir, _, files in os.walk(root_dir):
        # Map original file name -> [(chunk index, chunk path), ...] so that
        # chunks of different originals in one folder are merged separately.
        groups = {}
        for name in files:
            if not name.endswith(csv_suffix):
                continue
            base, marker, index = name[:-len(csv_suffix)].rpartition(chunk_marker)
            if not marker or not base:
                continue
            if not (index.isascii() and index.isdigit()):
                continue
            groups.setdefault(base, []).append((int(index), os.path.join(subdir, name)))

        for base in sorted(groups):
            # Sort by the integer index: a plain string sort would put
            # "_chunk_10" before "_chunk_2" and scramble the row order.
            chunk_files = [path for _index, path in sorted(groups[base])]
            merged_filename = os.path.join(subdir, base)

            df_list = []
            read_failed = False
            for chunk in chunk_files:
                try:
                    df_list.append(pd.read_csv(chunk, low_memory=False, on_bad_lines='skip'))
                except Exception as e:
                    print(f"Error reading {chunk}: {e}")
                    read_failed = True
                    break

            if read_failed:
                # Merging a partial set and deleting the chunks would lose rows.
                print(f"Skipping merge of {merged_filename}; chunk files kept.")
                continue

            merged_df = pd.concat(df_list, ignore_index=True)
            merged_df.to_csv(merged_filename, index=False)
            print(f"Merged file saved as {merged_filename}")

            for chunk_file in chunk_files:
                os.remove(chunk_file)
            print("Chunk files removed.")
                
 # Example usage: directory tree that the split/merge calls walk recursively.
root_directory = "airlines_raw"



Note: you may need to restart the kernel to use updated packages.


In [4]:

# Split large CSV files in subfolders.
# Uses the default 25 MB threshold; each oversized CSV is replaced by
# <name>.csv_chunk_<i>.csv files and the original file is deleted.
split_csv_in_subfolders(root_directory)


Splitting airlines_raw/hu-chh/hu-chh_aircraft_data.csv (58.71 MB)
Saved airlines_raw/hu-chh/hu-chh_aircraft_data.csv_chunk_0.csv (19.08 MB)
Saved airlines_raw/hu-chh/hu-chh_aircraft_data.csv_chunk_1.csv (19.73 MB)
Saved airlines_raw/hu-chh/hu-chh_aircraft_data.csv_chunk_2.csv (19.89 MB)
Original file airlines_raw/hu-chh/hu-chh_aircraft_data.csv removed.
Splitting airlines_raw/6e-igo/6e-igo_aircraft_data.csv (152.63 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_0.csv (22.00 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_1.csv (21.96 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_2.csv (21.95 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_3.csv (22.04 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_4.csv (21.94 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_5.csv (21.51 MB)
Saved airlines_raw/6e-igo/6e-igo_aircraft_data.csv_chunk_6.csv (21.24 MB)
Original file airlines_raw/6e-igo/6e-igo_aircraft_data.csv 

In [3]:
# Merge split CSV files back in subfolders.
# The chunk files are deleted once the merged CSV has been written.
merge_csv_in_subfolders(root_directory)

Merged file saved as airlines_raw/hu-chh/hu-chh_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/6e-igo/6e-igo_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/tk-thy/tk-thy_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/ba-baw/ba-baw_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/id-btk/id-btk_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/ua-ual/ua-ual_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/gs-gcr/gs-gcr_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/qr-qtr/qr-qtr_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/mf-cxa/mf-cxa_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/ek-uae/ek-uae_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/sc-cdg/sc-cdg_aircraft_data.csv
Chunk files removed.
Merged file saved as airlines_raw/lh-dlh/lh