### Optimising memory and disk usage

Strategies have been taken from:

https://www.kaggle.com/frankherfert/tips-tricks-for-working-with-large-datasets

### Classes and Methods

In [None]:
class dataframe_optimiser():
    def __init__(self, conversion):
        self.conversion = conversion
        
    def convert_series(self,array_of_series):
        returned_series=[]
        for series in array_of_series:
            print("converting: ",series.name,"\t\t\tsize (MB):", round(series.memory_usage(deep=True) * 1e-6,2),end="\t")
#             series=eval("series"+test.conversion)
            series=eval(self.conversion)
            print("->\t", round(series.memory_usage(deep=True) * 1e-6,2))
            returned_series.append(series)
        return returned_series


### Import Statements


In [3]:
import pandas as pd
import os

### Load Raw Data

https://stackoverflow.com/questions/14262433/large-data-work-flows-using-pandas/14268804#14268804

In [4]:
%%time
i=1
chunksize = 10 ** 6
for chunk in pd.read_csv("../data/raw/train.csv", chunksize=chunksize):
    i+=1
    print(i)
    print(chunk.head())
    if i==2:
        break
        

2
   event_id      game_session                 timestamp  \
0  27253bdc  45bb1e1b6b50c07b  2019-09-06T17:53:46.937Z   
1  27253bdc  17eeb7f223665f53  2019-09-06T17:54:17.519Z   
2  77261ab5  0848ef14a8dc6892  2019-09-06T17:54:56.302Z   
3  b2dba42b  0848ef14a8dc6892  2019-09-06T17:54:56.387Z   
4  1bb5fbdb  0848ef14a8dc6892  2019-09-06T17:55:03.253Z   

                                          event_data installation_id  \
0             {"event_code": 2000, "event_count": 1}        0001e90f   
1             {"event_code": 2000, "event_count": 1}        0001e90f   
2  {"version":"1.0","event_count":1,"game_time":0...        0001e90f   
3  {"description":"Let's build a sandcastle! Firs...        0001e90f   
4  {"description":"Let's build a sandcastle! Firs...        0001e90f   

   event_count  event_code  game_time                          title  \
0            1        2000          0        Welcome to Lost Lagoon!   
1            1        2000          0           Magma Peak - Level

In [5]:
sample_data=chunk

In [6]:
sample_data.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


### Investigating memory usage

In [7]:
%%time
sample_data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
event_id           1000000 non-null object
game_session       1000000 non-null object
timestamp          1000000 non-null object
event_data         1000000 non-null object
installation_id    1000000 non-null object
event_count        1000000 non-null int64
event_code         1000000 non-null int64
game_time          1000000 non-null int64
title              1000000 non-null object
type               1000000 non-null object
world              1000000 non-null object
dtypes: int64(3), object(8)
memory usage: 734.4 MB
CPU times: user 4.59 s, sys: 31.2 ms, total: 4.62 s
Wall time: 4.1 s


In [8]:
%%time
sample_data.memory_usage(deep=True) * 1e-6

CPU times: user 3.66 s, sys: 10.3 ms, total: 3.67 s
Wall time: 3.68 s


Index                0.000128
event_id            65.000000
game_session        73.000000
timestamp           81.000000
event_data         256.591707
installation_id     65.000000
event_count          8.000000
event_code           8.000000
game_time            8.000000
title               75.011046
type                63.142324
world               67.372441
dtype: float64

In [9]:
initial_memory_usage=sample_data.memory_usage(deep=True).sum() * 1e-6
print("The space on disk is (MB): ",round(initial_memory_usage,2))

The space on disk is (MB):  770.12


### Timestamp conversion

Converting the timestamp to a numpy datetime type reduces the memory usage by a factor of 10.

In [10]:

#instantiate category_converter object from dataframe_optimiser class
category_converter=dataframe_optimiser("pd.to_datetime(series)")

#convert array of series from dataframe to optimised datatype
[sample_data["timestamp"]]=category_converter.convert_series([sample_data["timestamp"]])


converting:  timestamp 			size (MB): 81.0	->	 8.0


### Categorical value conversion

Converting the categorical values from strings to a numerical index leads to reductions of up to 75x

In [11]:
#instantiate category_converter object from dataframe_optimiser class
category_converter=dataframe_optimiser("series.astype('category')")

#convert array of series from dataframe to optimised datatype
[sample_data["title"],sample_data["world"],sample_data["event_code"],sample_data["event_count"],sample_data["installation_id"]]=category_converter.convert_series([sample_data["title"],sample_data["world"],sample_data["event_code"],sample_data["event_count"],sample_data["installation_id"]])


converting:  title 			size (MB): 75.01	->	 1.0
converting:  world 			size (MB): 67.37	->	 1.0
converting:  event_code 			size (MB): 8.0	->	 1.0
converting:  event_count 			size (MB): 8.0	->	 2.1
converting:  installation_id 			size (MB): 65.0	->	 2.19


### Final Data Frame

In [18]:
sample_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 11 columns):
event_id           1000000 non-null object
game_session       1000000 non-null object
timestamp          1000000 non-null datetime64[ns, UTC]
event_data         1000000 non-null object
installation_id    1000000 non-null category
event_count        1000000 non-null category
event_code         1000000 non-null category
game_time          1000000 non-null int64
title              1000000 non-null category
type               1000000 non-null object
world              1000000 non-null category
dtypes: category(5), datetime64[ns, UTC](1), int64(1), object(4)
memory usage: 52.6+ MB


### Final size in memory

The size in memory has been reduced by almost a factor of 2.

It will be determined whether the conversion to category impacts the machine learning models.

In [12]:
final_memory_usage=sample_data.memory_usage(deep=True).sum() * 1e-6
print("The space on disk is (MB): ",round(initial_memory_usage,2))

The space on disk is (MB):  770.12


In [13]:
print("The initial space on disk was (MB): ",round(initial_memory_usage,2)," and the final space on disk is (MB): ",round(final_memory_usage,2))

The initial space on disk was (MB):  770.12  and the final space on disk is (MB):  481.03


### Store data as a pickle

By storing the data as a pickle, the size on disk is reduced to 67% of the original size, and loads 10x faster.

In [14]:
sample_data.to_csv("../data/processed/sample_data.csv")
sample_data.to_pickle("../data/processed/sample_data.pkl")

In [15]:
# size is shown in bytes again and needs to be converted to megabytes
print("sample_data.csv (MB):", round(os.stat('../data/processed/sample_data.csv').st_size * 1e-6,2))
print("sample_data.pkl (MB):", round(os.stat('../data/processed/sample_data.pkl').st_size * 1e-6,2))

sample_data.csv (MB): 356.31
sample_data.pkl (MB): 239.06


In [16]:
%%time
train = pd.read_pickle("../data/processed/sample_data.pkl")

CPU times: user 285 ms, sys: 148 ms, total: 433 ms
Wall time: 434 ms


In [17]:
%%time
train = pd.read_csv("../data/processed/sample_data.csv")

CPU times: user 4.61 s, sys: 375 ms, total: 4.98 s
Wall time: 4.43 s
