In [7]:
# Application name handed to SparkSession.builder.appName() when the session is created.
APP_NAME = 'hacking'

In [8]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# Build the notebook's Spark session with Hive support enabled; getOrCreate()
# hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .enableHiveSupport()
    .getOrCreate()
)

### Helper functions

In [14]:
from IPython.display import display
import pandas as pd


def display_df(df, rows=20, transpose=False):
    """Pretty-print the first rows of a Spark DataFrame in Jupyter.

    Up to ``rows`` rows are pulled from ``df`` with ``limit(...).toPandas()``,
    optionally transposed, and handed to IPython's ``display`` so the notebook
    renders them as a formatted table.

    Relies on ``display`` being available at module level
    (``from IPython.display import display``).

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input Spark DataFrame.

    rows : int or str, optional
        Number of rows to show, clamped to the range 1..30; a notice is
        printed whenever the request is clamped.
        The string 'all' (case-insensitive) shows every row up to the 30-row
        cap. Any other string is reported and treated as a request for 1 row.
        Non-string values are converted with ``int()``, so integer-like
        objects such as numpy integers are accepted.
        Default value is 20.

    transpose : bool, optional
        If true, the pandas frame is transposed before display. Default is False.

    Returns
    -------
    None

    Raises
    ------
    TypeError, ValueError
        Propagated from ``int(rows)`` when ``rows`` is neither a string nor
        convertible to an integer (for example ``None``).
    """
    min_rows, max_rows = 1, 30
    want_all = False

    if isinstance(rows, str):
        if rows.lower() == 'all':
            want_all = True
            # Ask for one row past the cap instead of calling df.count():
            # that is enough to tell whether the frame was truncated, and it
            # avoids counting the entire DataFrame just to clamp to 30.
            x = max_rows + 1
        else:
            # Previously this fell back to a single row without saying so.
            print('Unrecognised rows value %r, displaying 1 row' % rows)
            x = min_rows
    else:
        # Hand Spark a plain Python int even if the caller passed e.g. a numpy integer.
        x = int(rows)
        if x < min_rows:
            # At least one row is needed for a meaningful table.
            print('%s rows requested, displaying 1 row' % rows)
            x = min_rows
        elif x > max_rows:
            # Cap the output so the notebook display is not flooded.
            print('%s rows requested, displaying 30 rows' % rows)
            x = max_rows

    # Pull the selected rows into pandas for rich rendering.
    p_df = df.limit(x).toPandas()

    if want_all and len(p_df) > max_rows:
        # The extra probe row came back, so the frame has more than 30 rows.
        print('%s rows requested, displaying 30 rows' % rows)
        p_df = p_df.head(max_rows)

    if transpose:
        p_df = p_df.transpose()

    # Have Jupyter render the pandas frame.
    display(p_df)

In [15]:
# Load the semicolon-separated CSV export; cells holding the literal text
# 'NULL' are read as nulls and column types are inferred from the data.
base_df = spark.read.csv(
    'query-hive-8.csv',
    sep=';',
    header=True,
    inferSchema=True,
    nullValue='NULL',
)

# Preview the loaded frame using the helper's default arguments.
display_df(base_df)

Unnamed: 0,0,1,2,3,4
uuid,b5d84b94-87d4-4678-bc38-ea68dbb3dabc,8f2e15b5-73e6-4759-9788-7e7420788052,00a6a27c-69e4-43f6-b709-8ae2fea5155b,a22a4dc1-563a-419d-a76b-55645da61a77,90083032-1fc6-4462-9b3c-d3e70fd1a950
store_code,ISH027,ISH027,ISH027,ISH027,ISH027
trading_date,2019-06-30 00:00:00,2019-06-30 00:00:00,2019-06-30 00:00:00,2019-06-30 00:00:00,2019-06-30 00:00:00
product_gtin,91201051595,91201051625,91201051779,91201051830,91201051847
currency_code,ILS,ILS,ILS,ILS,ILS
vat_percentage,17,17,17,17,17
sales_quantity,0,0,0,0,0
sales_amount,0,0,0,0,0
sales_discount,,,,,
cost_of_goods_sold,,,,,
