# Pickle

Pickle is used for serializing and de-serializing Python objects.
Serialization is the process of encoding an object as bytes that can be easily stored and later de-serialized back to a Python object.
<br>
When an object is pickled, it contains all of the information needed to reconstruct that object later on.

**Pros**
- Easy to use & lightweight.
- Allows data to be moved across a network.
- Is useful if you want to pause and resume a long-running script (you can dump the system's state and resume it later on).
- Is useful for persistence across program runs.

**Cons**
- Pickle is Python-specific: other programming languages cannot readily reconstruct pickled objects.
- Not human readable.
- Security: unpickling can execute arbitrary code, so you can accidentally run malicious code if you unpickle data from a source you don't trust.

## Pickling a dictionary of a data frame

In [1]:
import numpy as np
import pandas as pd
import pickle

Load a CSV file.

In [2]:
# Remote location of the training CSV; later cells use it to download and load the data.
url = 'https://ddc-datascience.s3.amazonaws.com/Projects/Project.1-Transactions/Data/Transaction.train.csv'


In [3]:
!curl -O {url}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 70.0M  100 70.0M    0     0  14.8M      0  0:00:04  0:00:04 --:--:-- 17.0M


In [4]:
# Read the copy that the curl cell saved into the working directory instead of
# downloading the ~70 MB file again; `curl -O` names the local file after the
# last path segment of `url`.
local_csv = url.rsplit('/', 1)[-1]
data = pd.read_csv(local_csv)
# 'Unnamed: 0' is the label pandas gives a header-less column (presumably a
# saved row index - TODO confirm); it is not needed, so drop it.
data = data.drop(columns='Unnamed: 0')

In [5]:
data.shape

(180000, 52)

In [6]:
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,0,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,0,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,0,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,0,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,0,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974


In [7]:
data.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  int64  
 2   var_0    180000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.1+ MB


In [8]:
data['target'].value_counts()


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,161960
1,18040


In [9]:
# target only takes the values 0 and 1 (see value_counts above), so bool loses
# nothing; the next .info() shows the reported memory drop from 4.1+ MB to 2.9+ MB.
data['target'] = data['target'].astype(bool)


In [10]:
data.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  bool   
 2   var_0    180000 non-null  float64
dtypes: bool(1), float64(1), object(1)
memory usage: 2.9+ MB


Convert the data frame to a dictionary of Series, then pickle the dictionary.

In [11]:
# orient='series' maps each column name to that column as a pandas Series,
# so every entry keeps its own dtype (visible in the output below).
dict_data = data.to_dict(orient = 'series')
# NOTE(review): echoing the whole dictionary prints an entry for every column;
# consider displaying only a few keys to keep the output short.
dict_data


{'ID_code': 0              train_0
 1              train_1
 2              train_2
 3              train_3
 4              train_4
               ...     
 179995    train_179995
 179996    train_179996
 179997    train_179997
 179998    train_179998
 179999    train_179999
 Name: ID_code, Length: 180000, dtype: object,
 'target': 0         False
 1         False
 2         False
 3         False
 4         False
           ...  
 179995    False
 179996    False
 179997    False
 179998    False
 179999    False
 Name: target, Length: 180000, dtype: bool,
 'var_0': 0          8.9255
 1         11.5006
 2          8.6093
 3         11.0604
 4          9.8369
            ...   
 179995     8.9112
 179996     8.2722
 179997    12.4902
 179998     8.4894
 179999    11.6252
 Name: var_0, Length: 180000, dtype: float64,
 'var_1': 0        -6.7863
 1        -4.1473
 2        -2.7457
 3        -2.1518
 4        -1.4834
            ...  
 179995   -4.4900
 179996    7.1923
 179997   -2.9379
 1

In [12]:
ls -la


total 71740
drwxr-xr-x 1 root root     4096 Nov 14 17:45 [0m[01;34m.[0m/
drwxr-xr-x 1 root root     4096 Nov 14 17:36 [01;34m..[0m/
drwxr-xr-x 4 root root     4096 Nov 12 14:24 [01;34m.config[0m/
drwxr-xr-x 1 root root     4096 Nov 12 14:25 [01;34msample_data[0m/
-rw-r--r-- 1 root root 73445257 Nov 14 17:45 Transaction.train.csv


In [13]:
# Serialize the dictionary to dict_data.p. Pickle output is bytes, so the file
# is opened in binary write mode; the with-block closes it after the dump.
with open('dict_data.p', 'wb') as file:
    pickle.dump(dict_data, file)


In [14]:
ls -la

total 144772
drwxr-xr-x 1 root root     4096 Nov 14 17:49 [0m[01;34m.[0m/
drwxr-xr-x 1 root root     4096 Nov 14 17:36 [01;34m..[0m/
drwxr-xr-x 4 root root     4096 Nov 12 14:24 [01;34m.config[0m/
-rw-r--r-- 1 root root 74782378 Nov 14 17:49 dict_data.p
drwxr-xr-x 1 root root     4096 Nov 12 14:25 [01;34msample_data[0m/
-rw-r--r-- 1 root root 73445257 Nov 14 17:45 Transaction.train.csv


In [15]:
# --si prints a human-readable size: the 74,782,378-byte pickle shows as 75M.
!ls -la --si ./dict_data.p


-rw-r--r-- 1 root root 75M Nov 14 17:49 ./dict_data.p


Read the pickle file.

In [16]:
# Load the dictionary back from disk ('rb' because pickle data is binary).
# Unpickling can run arbitrary code; it is acceptable here because this
# notebook wrote dict_data.p itself a few cells earlier.
with open('dict_data.p', 'rb') as file:
    dict_data_read = pickle.load(file)

# The round trip returns a plain dict again.
type(dict_data_read)

dict

In [17]:
# Rebuild a DataFrame from the {column name: Series} dictionary, then preview
# the first rows to confirm the data survived the pickle round trip.
df_dict_data_read = pd.DataFrame.from_dict(dict_data_read)
df_dict_data_read.head()


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,False,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,False,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,False,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,False,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,False,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974


In [18]:
df_dict_data_read.shape

(180000, 52)

In [19]:
type(df_dict_data_read)

In [20]:
df_dict_data_read.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  bool   
 2   var_0    180000 non-null  float64
dtypes: bool(1), float64(1), object(1)
memory usage: 2.9+ MB


In [21]:
# Restore the integer dtype the column had when it was read from the CSV.
# np.int64 is spelled out because plain `int` is resolved by NumPy to a
# platform-dependent width, which would make the dtype differ between machines.
df_dict_data_read['target'] = df_dict_data_read['target'].astype(np.int64)


In [22]:
df_dict_data_read.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  int64  
 2   var_0    180000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.1+ MB


In [23]:
df_dict_data_read.head()


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,0,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,0,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,0,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,0,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,0,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974


## Pickling a data frame

In [24]:
# Start again from the raw data. Read the copy that the curl cell saved into
# the working directory instead of downloading the ~70 MB file once more;
# `curl -O` names the local file after the last path segment of `url`.
data = pd.read_csv(url.rsplit('/', 1)[-1])
# 'Unnamed: 0' is the label pandas gives a header-less column (presumably a
# saved row index - TODO confirm); it is not needed, so drop it.
data = data.drop(columns='Unnamed: 0')


In [25]:
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,0,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,0,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,0,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,0,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,0,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974


In [26]:
data.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  int64  
 2   var_0    180000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 4.1+ MB


In [27]:
# Same conversion as in the dictionary section: target is stored as bool, which
# the next .info() reports as 2.9+ MB instead of 4.1+ MB.
data['target'] = data['target'].astype(bool)


In [28]:
data.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  bool   
 2   var_0    180000 non-null  float64
dtypes: bool(1), float64(1), object(1)
memory usage: 2.9+ MB


Pickle the data frame.

In [29]:
# Serialize the DataFrame itself to data.p; no conversion to a dictionary is
# needed. Pickle output is bytes, so the file is opened in binary write mode.
with open('data.p', 'wb') as file:
    pickle.dump(data, file)


In [30]:
# List both pickle files with human-readable sizes so they can be compared.
!ls -la --si ./*.p


-rw-r--r-- 1 root root 75M Nov 14 17:53 ./data.p
-rw-r--r-- 1 root root 75M Nov 14 17:49 ./dict_data.p


Read the pickle file.

In [31]:
# Load the DataFrame back from disk ('rb' because pickle data is binary).
# Unpickling can run arbitrary code; it is acceptable here because this
# notebook wrote data.p itself.
with open('data.p', 'rb') as file:
    data_read = pickle.load(file)
# The object can be used as a DataFrame straight away, with no rebuild step.
data_read.head()


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,False,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,False,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,False,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,False,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,False,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974


In [32]:
type(data_read)

In [33]:
data_read.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  bool   
 2   var_0    180000 non-null  float64
dtypes: bool(1), float64(1), object(1)
memory usage: 2.9+ MB


In [34]:
# Convert the bool column back to integers. int8 is used here, and the next
# .info() still reports 2.9+ MB, whereas the int64 column in the previous
# section reported 4.1+ MB.
data_read['target'] = data_read['target'].astype(np.int8)


In [35]:
data_read.iloc[:,0:3].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180000 entries, 0 to 179999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   ID_code  180000 non-null  object 
 1   target   180000 non-null  int8   
 2   var_0    180000 non-null  float64
dtypes: float64(1), int8(1), object(1)
memory usage: 2.9+ MB


In [36]:
data_read.head()


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_40,var_41,var_42,var_43,var_44,var_45,var_46,var_47,var_48,var_49
0,train_0,0,8.9255,-6.7863,11.9081,5.1187,5.747,14.0137,0.5745,4.284,...,5.4879,-4.7645,3.1531,18.5618,1.7202,25.8398,4.4354,3.9642,3.1364,12.7803
1,train_1,0,11.5006,-4.1473,13.8588,5.6208,8.0851,14.0239,8.4135,7.8,...,5.7999,5.5378,5.5134,30.2645,11.0752,22.5441,7.6421,7.7214,2.5837,18.356
2,train_2,0,8.6093,-2.7457,12.0805,6.9427,5.9525,14.1929,7.3124,4.7011,...,5.769,-7.0927,-5.8234,25.682,2.4013,23.0866,2.9057,9.7905,1.6704,14.7222
3,train_3,0,11.0604,-2.1518,8.9522,5.8428,8.245,13.8463,11.9704,15.9426,...,5.343,-7.1541,11.7134,14.7483,5.6961,-0.4639,4.4666,4.7433,0.7178,17.9697
4,train_4,0,9.8369,-1.4834,12.8746,5.9405,7.6784,13.8481,7.8895,6.5263,...,5.5518,1.4493,2.3705,18.4685,5.1743,11.8503,-1.4905,9.5214,-0.1508,17.9974
