In [1]:
from pathlib import Path

import pandas as pd
import sparkpickle

### Load the CSV file as a Spark DataFrame
Note: `books.csv` is stored in HDFS.

In [5]:
# Read books.csv into a Spark DataFrame, using the first line as column names.
# `spark` is not created in this notebook; presumably the PySpark kernel
# supplies the session — confirm before running on a plain Python kernel.
# `header` is a boolean flag, so pass a real bool rather than the string "True".
df_csv = spark.read.csv("books.csv", header=True)

In [6]:
# Inspect the (column name, type) pairs Spark assigned on load.
# NOTE: the recorded output shows every column typed as string, and the
# `  num_pages` column name carries two leading spaces from the CSV header.
df_csv.dtypes

[('bookID', 'string'),
 ('title', 'string'),
 ('authors', 'string'),
 ('average_rating', 'string'),
 ('isbn', 'string'),
 ('isbn13', 'string'),
 ('language_code', 'string'),
 ('  num_pages', 'string'),
 ('ratings_count', 'string'),
 ('text_reviews_count', 'string'),
 ('publication_date', 'string'),
 ('publisher', 'string')]

In [7]:
# Count the rows loaded from books.csv.
df_csv.count()

11127

In [4]:
# Echo the `spark` session object. It is not defined anywhere in this
# notebook, so it is presumably injected by the kernel — TODO confirm.
spark

### Convert Spark Dataframe to Pickled File
Note: 

- This operation is only available on RDDs (hence `df_csv.rdd`) and is resource intensive.
- Pickled files are currently stored locally, not in HDFS.


In [8]:
# saveAsPickleFile is an RDD method, hence the `.rdd` hop from the DataFrame.
# The target is the local `pickled_file` directory relative to the working
# directory (the same directory the later cells list and read). It is resolved
# to an absolute file:// URI so the output lands on the local filesystem
# rather than in HDFS, without hard-coding a user-specific path.
# NOTE(review): Spark normally refuses to write into an output directory that
# already exists, so re-running this cell likely requires removing
# `pickled_file` first — confirm.
pickle_dir = Path("pickled_file").resolve()
df_csv.rdd.saveAsPickleFile(f"file://{pickle_dir}")

### View the Pickled File

In [14]:
# List what was written to the output directory. The recorded listing shows an
# empty `_SUCCESS` file plus a single data file, `part-00000`.
! ls -lh pickled_file/

total 4048
-rw-r--r--  1 ruksvaithy  staff     0B Jul 17 16:01 _SUCCESS
-rw-r--r--  1 ruksvaithy  staff   2.0M Jul 17 16:01 part-00000


In [21]:
# Peek at the raw bytes of the part file. The recorded output starts with a
# `SEQ` header naming Hadoop's NullWritable/BytesWritable classes, so this is a
# binary container and most of what is printed is not human-readable.
! head -n 10 pickled_file/part-00000

SEQ!org.apache.hadoop.io.NullWritable"org.apache.hadoop.io.BytesWritable      �����Au�����F�  L7      L3�� ur [[BK�gg�7  xp   
ur [B���T�  xp  
��
      ]�(�pyspark.sql.types��_create_row���]�(�bookID��title��authors��average_rating��isbn��isbn13��language_code��  num_pages��ratings_count��text_reviews_count��publication_date��	publisher�e(�1��9Harry Potter and the Half-Blood Prince (Harry Potter  #6)��J.K. Rowling/Mary GrandPré��4.57��
0439785960��9780439785969��eng��652��2095690��27591��	9/16/2006��Scholastic Inc.�t���R�h]�(�bookID��title��authors��average_rating��isbn��isbn13��language_code��  num_pages��ratings_count��text_reviews_count��publication_date��	publisher�e(�2��<Harry Potter and the Order of the Phoenix (Harry Potter  #5)��J.K. Rowling/Mary GrandPré��4.49��
0439358078��9780439358071��eng��870��2153167��29221��9/1/2004��Scholastic Inc.�t���R�hh (�4��:Harry Potter and the Chamber of Secrets (Harry Potter  #

### Convert Pickled file to Pandas Dataframe
Note: This will **FAIL** because Spark's pickled output is not compatible with pandas' `read_pickle`.

In [None]:
# Demonstrate that pandas cannot read Spark's pickled output.
# The call targets the actual part file: pandas does not expand a `*` glob,
# so a wildcard path would fail on path resolution instead of on the file
# format, which is the incompatibility this cell is meant to show.
try:
    pd_df_from_pickle = pd.read_pickle("pickled_file/part-00000")
except Exception as exc:  # broad on purpose: the failure itself is the demonstration
    print(f"pd.read_pickle failed: {type(exc).__name__}: {exc}")

### Workaround: use the `sparkpickle` Python library
- Allows accessing Spark's DataFrame objects from a non-Spark environment.
- The pickled file has to be opened in binary mode (read as a bytes object).
- Supports three operations: `load`, `loads`, and `load_gen`.
- `load_gen` returns a generator of objects, which can be collected and converted into a pandas DataFrame.


In [41]:
# Collect every object that sparkpickle yields from the part file.
# Only `part-00000` is read; the earlier directory listing shows it is the
# single data file that was written.
row_list = []
# Open in binary mode ("rb"): the part file is not text.
with open("./pickled_file/part-00000","rb") as pkl_file:
    # load_gen is iterated directly, one object per pass through the loop.
    for obj in sparkpickle.load_gen(pkl_file):
        row_list.append(obj)
# NOTE(review): if at least one object was read, `obj` remains bound to the
# last one after the loop finishes.

In [54]:
# Build the pandas DataFrame from the collected rows.
# Column names come from the first collected row instead of the loop variable
# `obj` left over from the loading cell, so this cell depends only on
# `row_list`. An empty `row_list` yields an empty DataFrame instead of an error.
pd.DataFrame(row_list, columns=row_list[0].__fields__ if row_list else None)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic
...,...,...,...,...,...,...,...,...,...,...,...,...
11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,1560254416,9781560254416,eng,512,156,20,12/21/2004,Da Capo Press
11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,0140110879,9780140110876,eng,635,783,56,12/1/1988,Penguin Books
11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,0140131965,9780140131963,eng,415,820,95,8/1/1993,Penguin Books
11125,45639,Poor People,William T. Vollmann,3.72,0060878827,9780060878825,eng,434,769,139,2/27/2007,Ecco
