## Splitting a DataFrame into multiple files

Split by file count (the number of output files is given; the rows per file are derived).

In [183]:
import pandas as pd
import math

def split_dataframe(df, file_prefix, file_count, start=1):
    """Write ``df`` to ``file_count`` Feather files of near-equal size.

    Rows keep their original order and are assigned by position, so reading
    the files back in numeric order and concatenating them rebuilds ``df``.
    Files are named ``{file_prefix}_{n}.feather`` with ``n`` counting up from
    ``start``. File sizes differ by at most one row; the larger files come
    first.

    Args:
        df: DataFrame to split.
        file_prefix: Path prefix shared by every output file.
        file_count: Number of files to write; must be at least 1. When ``df``
            has fewer rows than this, one file per row is written instead so
            that no empty files are produced (an empty ``df`` writes nothing).
        start: Number used in the name of the first file.

    Raises:
        ValueError: If ``file_count`` is smaller than 1.
    """
    if file_count < 1:
        raise ValueError(f'file_count must be at least 1, got {file_count!r}')

    total_rows = len(df)
    # Cap the number of parts at the number of rows so no file ends up empty.
    parts = min(file_count, total_rows)
    if parts == 0:
        return

    # Every part gets `base_rows` rows; the first `remainder` parts get one
    # extra. Unlike ceil-sized chunks this always yields exactly `parts` files.
    base_rows, remainder = divmod(total_rows, parts)

    begin = 0
    for offset in range(parts):
        end = begin + base_rows + (1 if offset < remainder else 0)
        filename = f'{file_prefix}_{start + offset}.feather'
        # iloc keeps the slice positional regardless of the index type.
        df.iloc[begin:end].to_feather(filename)
        begin = end
        
# Load the source frame and split it using a file count of 5.
df = pd.read_feather('hh.feather')
split_dataframe(df, file_prefix='hh_split', file_count=5)

Split by row count (the number of rows per file is given; the number of files is derived).

In [23]:
import pandas as pd
import math

def split_dataframe_by_row_count(df, file_prefix, row_count, start=1):
    """Write ``df`` to Feather files holding at most ``row_count`` rows each.

    Rows keep their original order and are sliced by position. Every file
    except possibly the last holds exactly ``row_count`` rows. Files are named
    ``{file_prefix}_{n}.feather`` with ``n`` counting up from ``start``. An
    empty ``df`` writes no files.

    Args:
        df: DataFrame to split.
        file_prefix: Path prefix shared by every output file.
        row_count: Maximum number of rows per file; must be at least 1.
        start: Number used in the name of the first file.

    Raises:
        ValueError: If ``row_count`` is smaller than 1.
    """
    if row_count < 1:
        raise ValueError(f'row_count must be at least 1, got {row_count!r}')

    # Step through the frame in strides of `row_count`; the final slice is
    # simply shorter when the length is not an exact multiple.
    for n, begin in enumerate(range(0, len(df), row_count), start=start):
        filename = f'{file_prefix}_{n}.feather'
        # iloc keeps the slice positional regardless of the index type.
        df.iloc[begin:begin + row_count].to_feather(filename)
        
# Load the source frame and split it into files of up to one million rows each.
df = pd.read_feather('hh.feather')
split_dataframe_by_row_count(df, file_prefix='hh_rows', row_count=1_000_000)

## Joining multiple files into one DataFrame

In [187]:
from pathlib import Path

def join_files(file_prefix, start=1):
    """Read consecutively numbered Feather files and concatenate them.

    Files named ``{file_prefix}_{n}.feather`` are read for ``n = start,
    start + 1, ...`` until the first number whose file does not exist. The
    frames are concatenated in that order and the result gets a fresh
    0-based index.

    Args:
        file_prefix: Path prefix shared by the files to join.
        start: Number of the first file; use the same value that was passed
            to the function that wrote the files.

    Returns:
        A single DataFrame holding the rows of every file that was read.

    Raises:
        ValueError: If the first expected file does not exist, so there is
            nothing to join.
    """
    dfs = []
    n = start

    while (file := Path(f'{file_prefix}_{n}.feather')).exists():
        dfs.append(pd.read_feather(file))
        n += 1

    if not dfs:
        # `file` still names the first path that was probed and found missing.
        raise ValueError(f'no files to join: {file} does not exist')

    return pd.concat(dfs, ignore_index=True)

# Rebuild one frame from the files that share the 'hh_split' prefix.
ndf = join_files(file_prefix='hh_split')


Compare the rejoined DataFrame with the original.

In [188]:
# Row counts of the rejoined frame and the original frame, displayed as a tuple.
len(ndf), len(df)

(2075259, 2075259)

In [189]:
# Print the index range, column dtypes and memory usage of the original frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 142.5+ MB


In [190]:
# Print the same summary for the rejoined frame so it can be checked against the original.
ndf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075259 entries, 0 to 2075258
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Date                   object 
 1   Time                   object 
 2   Global_active_power    float64
 3   Global_reactive_power  float64
 4   Voltage                float64
 5   Global_intensity       float64
 6   Sub_metering_1         float64
 7   Sub_metering_2         float64
 8   Sub_metering_3         float64
dtypes: float64(7), object(2)
memory usage: 142.5+ MB


In [191]:
# DataFrame.compare returns only the cells that differ; an empty result means the frames match.
ndf.compare(df)

## Reading a CSV file in chunks

In [192]:
import pandas as pd

# Maximum number of rows per chunk.
rows = 100_000
chunks = []

filename = 'household_power_consumption.csv'
# With chunksize set, read_csv returns an iterator of DataFrames instead of a
# single frame. The with-block closes the underlying file even if the
# per-chunk processing raises part-way through.
with pd.read_csv(filename, chunksize=rows) as reader:
    for chunk in reader:
        # add operations here

        chunks.append(chunk)

print(f'Number of chunks: {len(chunks)}')
# Stitch the chunks back together with a fresh 0-based index.
df = pd.concat(chunks, ignore_index=True)

print(f'Total rows: {len(df):,}')

Number of chunks: 21
Total rows: 2,075,259
