In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import fastparquet

In [2]:
# Optimizing already at the stage of loading data into pandas DataFrame
# loading data from a file "at once" - an optimal solution for small files
df = pd.read_csv('zamowienia.csv', header=0, sep=';')
display(df.head())
df.info()

Unnamed: 0,Kraj,Sprzedawca,Data zamowienia,idZamowienia,Utarg
0,Polska,Kowalski,2003-07-16,10248,440.0
1,Polska,Sowiński,2003-07-10,10249,1863.4
2,Niemcy,Peacock,2003-07-12,10250,1552.6
3,Niemcy,Leverling,2003-07-15,10251,654.06
4,Niemcy,Peacock,2003-07-11,10252,3597.9


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Kraj             799 non-null    object 
 1   Sprzedawca       799 non-null    object 
 2   Data zamowienia  799 non-null    object 
 3   idZamowienia     799 non-null    int64  
 4   Utarg            799 non-null    float64
dtypes: float64(1), int64(1), object(3)
memory usage: 31.3+ KB


In [3]:
df.memory_usage()  # returned data type - pandas series

Index               132
Kraj               6392
Sprzedawca         6392
Data zamowienia    6392
idZamowienia       6392
Utarg              6392
dtype: int64

In [4]:
sum(df.memory_usage())  # default deep = False

32092

In [5]:
sum(df.memory_usage(deep=True))

150452

In [6]:
def size_of_format(num, suffix="B"):
    for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if abs(num) < 1024.0:
            return f"{num:3.1f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.1f}Yi{suffix}"

In [7]:
size_of_format(sum(df.memory_usage(deep=True)))

'146.9KiB'

## Optimization of file loading in pandas library

In [8]:
new_df = pd.concat([df.sample(frac=1) for n in range(10_000)])

In [9]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7990000 entries, 85 to 93
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Kraj             object 
 1   Sprzedawca       object 
 2   Data zamowienia  object 
 3   idZamowienia     int64  
 4   Utarg            float64
dtypes: float64(1), int64(1), object(3)
memory usage: 365.8+ MB


In [10]:
size_of_format(sum(new_df.memory_usage(deep=True)))

'1.5GiB'

In [11]:
new_df.to_csv('zamowienia_expanded.csv', header=True, index=False)

In [12]:
start = datetime.now()
new_df = pd.read_csv('zamowienia_expanded.csv', header=0)
print(f"Reading time - case 1: {datetime.now() - start} seconds")

Reading time - case 1: 0:00:05.249842 seconds


In [13]:
def count_time(func):
    def wrapper(*args, **kwargs):
        start = datetime.now()
        func(*args, **kwargs)
        print(f"Czas wczytywania {func.__name__}: {datetime.now() - start} sekund")
        return func(*args, **kwargs)
    return wrapper

In [14]:
@count_time
def read_file_1():
    return pd.read_csv('zamowienia_expanded.csv', header=0)
    
@count_time
def read_file_2():
    chunks = pd.read_csv('zamowienia_expanded.csv', header=0, chunksize=800_000)
    return pd.concat(chunks)

In [15]:
df1 = read_file_1()
df2 = read_file_2()

Czas wczytywania read_file_1: 0:00:04.956843 sekund
Czas wczytywania read_file_2: 0:00:05.015193 sekund


In [16]:
size_of_format(sum(df1.memory_usage(deep=True))), size_of_format(sum(df2.memory_usage(deep=True)))

('1.4GiB', '1.4GiB')

## Other files formats

In [17]:
df1.to_parquet('zamowienia_expanded.parquet', engine='fastparquet')

In [18]:
@count_time
def read_parquet_1():
    df = pd.read_parquet('zamowienia_expanded.parquet', engine='fastparquet')
    return df

In [19]:
df3 = read_parquet_1()

Czas wczytywania read_parquet_1: 0:00:03.003315 sekund


In [21]:
size_of_format(sum(df3.memory_usage(deep=True)))

'1.4GiB'

## Multiprocessing

In [ ]:
# from itertools import repeat
# import pandas as pd
# from datetime import datetime
# from filesplit.split import Split
# from multiprocessing import Pool
# import os
# 
# 
# def apply_args_and_kwargs(func, args, kwargs):
#     return func(*args, **kwargs)
# 
# 
# def starmap_with_kwargs(pool, func, args_iter, kwargs_iter):
#     args_for_starmap = zip(repeat(func), args_iter, kwargs_iter)
#     return pool.starmap(apply_args_and_kwargs, args_for_starmap)
# 
# 
# def split_file(filepath, chunksize, destination):
#     split = Split(filepath, destination)
#     split.bylinecount(linecount=chunksize, includeheader=True)
# 
# 
# @count_time
# def load_files(directory):
# 
#     files = [[f"{directory}/{f}"] for f in os.listdir(directory) if f.endswith(".csv")]
# 
#     kwargs_list = [
#         {
#             'on_bad_lines': "skip",
#         }
#         for n in range(len(files))
#     ]
# 
#     pool = Pool(processes=5)
#     args_iter = files
# 
#     results = starmap_with_kwargs(pool, pd.read_csv, args_iter, kwargs_list)
#     results = pd.concat(results)
# 
#     return results
# 
# 
# if __name__ == '__main__':
#     split_file('zamowienia_expanded.csv', 1_600_000, 'data')
#     df4 = load_files('data')
#     df4.info()

Czas wczytywania load_files: 0:00:04.820281 sekund
<class 'pandas.core.frame.DataFrame'>
Index: 7990000 entries, 0 to 1590003
Data columns (total 5 columns):
 \#   Column           Dtype  
\---  ------           -----  
 0   Kraj             object 
 1   Sprzedawca       object 
 2   Data zamowienia  object 
 3   idZamowienia     int64  
 4   Utarg            float64
dtypes: float64(1), int64(1), object(3)
memory usage: 365.8+ MB

## Optimizing RAM usage of pandas library frames

In [23]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7990000 entries, 0 to 7989999
Data columns (total 5 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Kraj             object 
 1   Sprzedawca       object 
 2   Data zamowienia  object 
 3   idZamowienia     int64  
 4   Utarg            float64
dtypes: float64(1), int64(1), object(3)
memory usage: 304.8+ MB


In [24]:
df1.describe()

Unnamed: 0,idZamowienia,Utarg
count,7990000.0,7990000.0
mean,10647.18,1537.331
std,230.9473,1859.426
min,10248.0,12.5
25%,10447.0,465.7
50%,10647.0,956.67
75%,10847.0,1892.25
max,11057.0,16387.5


In [25]:
pd.options.display.float_format = '{:.5f}'.format
df1.describe()

Unnamed: 0,idZamowienia,Utarg
count,7990000.0,7990000.0
mean,10647.17522,1537.33091
std,230.94727,1859.42618
min,10248.0,12.5
25%,10447.0,465.7
50%,10647.0,956.67
75%,10847.0,1892.25
max,11057.0,16387.5


In [26]:
for column in df1.columns:
    print(f'{column}: {size_of_format(df1[column].memory_usage(deep=True))}')

Kraj: 419.1MiB
Sprzedawca: 443.0MiB
Data zamowienia: 449.6MiB
idZamowienia: 61.0MiB
Utarg: 61.0MiB


In [27]:
size_of_format(df1['idZamowienia'].astype(np.int16).memory_usage(deep=True))

'15.2MiB'

In [29]:
size_of_format(df1['Kraj'].astype('category').memory_usage(deep=True))

'7.6MiB'

In [30]:
size_of_format(df1['Sprzedawca'].astype('category').memory_usage(deep=True))

'7.6MiB'

In [31]:
size_of_format(pd.to_datetime(df1['Data zamowienia']).memory_usage(deep=True))

'61.0MiB'

In [32]:
df2 = pd.DataFrame()

In [33]:
df2['Kraj'] = df1['Kraj'].astype('category')
df2['Sprzedawca'] = df1['Sprzedawca'].astype('category')
df2['Data zamowienia'] = pd.to_datetime(df1['Data zamowienia'])
df2['idZamowienia'] = df1['idZamowienia'].astype(np.int16)
df2['Utarg'] = df1['Utarg']

In [34]:
size_of_format(sum(df2.memory_usage(deep=True)))

'152.4MiB'

In [35]:
utarg_downcast = pd.to_numeric(df2["Utarg"], downcast='float')
size_of_format(utarg_downcast.memory_usage(deep=True)), utarg_downcast.dtype

('30.5MiB', dtype('float32'))

In [36]:
df2['Utarg'] =  pd.to_numeric(df1["Utarg"], downcast='float')
size_of_format(sum(df2.memory_usage(deep=True)))

'121.9MiB'

In [37]:
for column in df2.columns:
    print(f'{column}: {size_of_format(df2[column].memory_usage(deep=True))}')

Kraj: 7.6MiB
Sprzedawca: 7.6MiB
Data zamowienia: 61.0MiB
idZamowienia: 15.2MiB
Utarg: 30.5MiB


In [38]:
df2.info()
df2.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7990000 entries, 0 to 7989999
Data columns (total 5 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Kraj             category      
 1   Sprzedawca       category      
 2   Data zamowienia  datetime64[ns]
 3   idZamowienia     int16         
 4   Utarg            float32       
dtypes: category(2), datetime64[ns](1), float32(1), int16(1)
memory usage: 121.9 MB


Unnamed: 0,Data zamowienia,idZamowienia,Utarg
count,7990000,7990000.0,7990000.0
mean,2004-08-05 22:13:40.025031424,10647.17522,1537.33032
min,2003-07-10 00:00:00,10248.0,12.5
25%,2004-02-26 00:00:00,10447.0,465.70001
50%,2004-09-03 00:00:00,10647.0,956.66998
75%,2005-02-02 00:00:00,10847.0,1892.25
max,2005-05-01 00:00:00,11057.0,16387.5
std,,230.94727,1859.42615


In [41]:
start = datetime.now()
display(df1.groupby(['Sprzedawca']).agg({'Utarg': ['mean']}))
print(f'Time: {datetime.now() - start}')

Unnamed: 0_level_0,Utarg
Unnamed: 0_level_1,mean
Sprzedawca,Unnamed: 1_level_2
Callahan,1242.75424
Davolio,1559.82983
Dudek,1830.44
Fuller,1766.34543
King,1745.71627
Kowalski,1637.91071
Leverling,1609.57016
Peacock,1495.12371
Sowiński,1115.80969


Time: 0:00:00.945330


In [42]:
start = datetime.now()
display(df2.groupby(['Sprzedawca']).agg({'Utarg': ['mean']}))
print(f'Time: {datetime.now() - start}')

  display(df2.groupby(['Sprzedawca']).agg({'Utarg': ['mean']}))


Unnamed: 0_level_0,Utarg
Unnamed: 0_level_1,mean
Sprzedawca,Unnamed: 1_level_2
Callahan,1242.75415
Davolio,1559.82983
Dudek,1830.43994
Fuller,1766.34546
King,1745.71631
Kowalski,1637.91064
Leverling,1609.57019
Peacock,1495.12378
Sowiński,1115.80969


Time: 0:00:00.076219
