# Resources
**1) pandas API on Spark – user guide:** https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/index.html

**2) pandas API on Spark – `pyspark.pandas` API reference (Databricks):** https://api-docs.databricks.com/python/pyspark/latest/pyspark.pandas/index.html

# Part 1: Migration from pandas to pandas API on Spark

## Object creation - Series

In [0]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps

In [0]:
# Create pandas series
pser = pd.Series([1, 3, 5, np.nan, 6, 8])

In [0]:
print(pser)

In [0]:
type(pser)

In [0]:
# Create pandas-on-spark series
psser = ps.Series([1, 3, 5, np.nan, 6, 8])

In [0]:
print(psser)

In [0]:
type(psser)

In [0]:
# Create a pandas-on-spark series by passing a pandas series
psser_1 = ps.Series(pser)

In [0]:
psser_1

In [0]:
type(psser_1)

In [0]:
# Create a pandas-on-spark series from a pandas series using ps.from_pandas
psser_2 = ps.from_pandas(pser)

In [0]:
psser_2

In [0]:
type(psser_2)

In [0]:
# sort_index method
print(psser_1.sort_index())

## Object creation - DataFrame

In [0]:
my_dict = {"A": np.random.rand(5),
           "B": np.random.rand(5)}

In [0]:
my_dict

In [0]:
# Create a pandas dataframe
pdf = pd.DataFrame(my_dict)

In [0]:
pdf

In [0]:
type(pdf)

In [0]:
# Create a pandas-on-spark dataframe
psdf = ps.DataFrame(my_dict)

In [0]:
psdf

In [0]:
type(psdf)

In [0]:
# Create a pandas-on-spark dataframe by passing a pandas dataframe
psdf_1 = ps.DataFrame(pdf)
psdf_2 = ps.from_pandas(pdf)

In [0]:
psdf_1

In [0]:
psdf_2

In [0]:
print(type(psdf_1))
print(type(psdf_2))

In [0]:
# sort_index method
psdf_1.sort_index()

## Viewing data

In [0]:
# Create pandas-on-spark series
psser = ps.Series([1, 3, 5, np.nan, 6, 8])

In [0]:

# Create a pandas-on-spark dataframe
psdf = ps.DataFrame(my_dict)

In [0]:
psser

In [0]:
psdf

In [0]:
psser.head(2)

In [0]:
psdf.head(3)

In [0]:
# Summary statistics
psser.describe()

In [0]:
# Summary statistics
psdf.describe()

In [0]:
# Sort values method
psser.sort_values()

In [0]:
psdf

In [0]:
# Sort values method
psdf.sort_values(by="A")

In [0]:
psdf

In [0]:
# Transpose method
psdf.transpose()

In [0]:
# Transpose method
psser.transpose()

In [0]:
ps.get_option('compute.max_rows')

In [0]:
ps.set_option('compute.max_rows', 2000)

In [0]:
ps.get_option('compute.max_rows')

## Selection

In [0]:
# Create pandas-on-spark series
psser = ps.Series([1, 3, 5, np.nan, 6, 8])

In [0]:
# Create a pandas-on-spark dataframe
psdf = ps.DataFrame(my_dict)

In [0]:
psser

In [0]:
psdf

In [0]:
psdf['A']

In [0]:
psdf['B']

In [0]:
psdf[['A', 'B']]

In [0]:
psdf.B

In [0]:
psdf

In [0]:
psdf.loc[0:3]

In [0]:
psser

In [0]:
psser.loc[0:2]

In [0]:
psser.loc[4:5]

In [0]:
psdf

In [0]:
# Slicing
psdf.iloc[0:5, 0:2]

In [0]:
psdf.iloc[0:3, 0:2]

In [0]:
psser = ps.Series([100, 200, 300, 400, 500], index=[0, 1, 2, 3, 4])

In [0]:
psser

In [0]:
# NOTE: `psser` was created independently of `psdf`. While
# compute.ops_on_diff_frames is disabled (presumably the default on this
# Spark version -- confirm), this assignment is expected to raise an error;
# the next cell enables the option and retries the same assignment.
psdf["C"] = psser

In [0]:
# Those are needed for managing options
from pyspark.pandas.config import set_option, reset_option

# Combining objects that come from different DataFrames requires
# compute.ops_on_diff_frames; enable it only for this one assignment.
set_option("compute.ops_on_diff_frames", True)
try:
    psdf['C'] = psser
finally:
    # Reset to default to avoid potential expensive operation in the future.
    # Done in `finally` so the option is restored even if the assignment fails.
    reset_option("compute.ops_on_diff_frames")
print(psdf)

## Applying Python function with pandas-on-Spark object

In [0]:
psdf

In [0]:
psdf.apply(np.cumsum)

In [0]:
psdf

In [0]:
psdf.apply(np.cumsum, axis=1)

In [0]:
psdf.apply(lambda x: x ** 2)

In [0]:
def square(x) -> ps.Series[np.float64]:
    """Return the element-wise square of ``x``.

    The return annotation declares the result as a pandas-on-Spark Series of
    float64. It is presumably what lets ``DataFrame.apply`` take the output
    type from the hint instead of inferring it -- see the pandas API on Spark
    type-hints guide to confirm.
    """
    squared = x ** 2
    return squared

In [0]:
psdf.apply(square)

In [0]:
psdf_5 = ps.DataFrame({"A": range(1000)})

In [0]:
print(psdf_5)

In [0]:
len(psdf_5)

In [0]:
# Working properly since size of data <= compute.shortcut_limit (1000)
ps.DataFrame({'A': range(1000)}).apply(lambda col: col.max())

In [0]:
# Not working properly since size of data > compute.shortcut_limit (1000)
ps.DataFrame({'A': range(1200)}).apply(lambda col: col.max())

In [0]:
# Set compute.shortcut_limit = 1200
ps.set_option('compute.shortcut_limit', 1200)

In [0]:
# Working properly now since size of data <= compute.shortcut_limit (1200)
ps.DataFrame({'A': range(1200)}).apply(lambda col: col.max())

## Grouping Data

In [0]:
# Create a pandas-on-Spark DataFrame
psdf = ps.DataFrame({'A': [1, 2, 2, 3, 4],
                    'B': [10, 20, 30, 30, 50],
                    'C': [5, 7, 9, 11, 13]})

In [0]:
psdf

In [0]:
psdf.groupby("A").sum()

In [0]:
psdf.groupby(["A", "B"]).sum()

## Plotting

In [0]:
# This is needed for visualizing plot on notebook
%matplotlib inline

In [0]:
# bar plot

speed = [0.1, 17.5, 40, 48, 52, 69, 88]
lifespan = [2, 8, 70, 1.5, 25, 12, 28]
index = ['snail', 'pig', 'elephant',
         'rabbit', 'giraffe', 'coyote', 'horse']
         
psdf = ps.DataFrame({'speed': speed,
                     'lifespan': lifespan}, index=index)
psdf.plot.bar()

In [0]:
psdf

In [0]:
# horizontal bar plot
psdf.plot.barh()

In [0]:
#  pie chart

psdf = ps.DataFrame({'mass': [0.330, 4.87, 5.97],
                     'radius': [2439.7, 6051.8, 6378.1]},
                    index=['Mercury', 'Venus', 'Earth'])
psdf.plot.pie(y='mass')

In [0]:
# area plot

psdf = ps.DataFrame({
    'sales': [3, 2, 3, 9, 10, 6, 3],
    'signups': [5, 5, 6, 12, 14, 13, 9],
    'visits': [20, 42, 28, 62, 81, 50, 90],
}, index=pd.date_range(start='2019/08/15', end='2020/03/09',
                       freq='M'))
psdf.plot.area()

In [0]:
# line plot

psdf = ps.DataFrame({'rabbit': [20, 18, 489, 675, 1776],
                     'horse': [4, 25, 281, 600, 1900]},
                    index=[1990, 1997, 2003, 2009, 2014])
psdf.plot.line()

In [0]:
# Histogram

pdf = pd.DataFrame(
    np.random.randint(1, 7, 6000),
    columns=['one'])
pdf['two'] = pdf['one'] + np.random.randint(1, 7, 6000)
psdf = ps.from_pandas(pdf)
psdf.plot.hist(bins=12, alpha=0.5)

In [0]:
# scatter plot

psdf = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
                   columns=['length', 'width', 'species'])
psdf.plot.scatter(x='length',
                  y='width',
                  c='species')

# Part 2: Missing Functionalities and Workarounds in pandas API on Spark

## Directly use pandas APIs through type conversion

In [0]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps

In [0]:
psdf = ps.DataFrame([[5.1, 3.5, 0], [4.9, 3.0, 0], [7.0, 3.2, 1],
                    [6.4, 3.2, 1], [5.9, 3.0, 2]],
                   columns=['length', 'width', 'species'])

In [0]:
psdf

In [0]:
type(psdf)

In [0]:
psidx = psdf.index

In [0]:
psidx

In [0]:
type(psidx)

In [0]:
ps_list = psidx.to_list()

In [0]:
ps_list

In [0]:
type(ps_list)

## Native Support for pandas Objects

In [0]:
psdf = ps.DataFrame({'A': 1.,
                     'B': pd.Timestamp('20130102'),
                     'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                     'D': np.array([3] * 4, dtype='int32'),
                     'F': 'foo'})

In [0]:
psdf

In [0]:
type(psdf)

## Distributed execution for pandas functions

In [0]:
i = pd.date_range('2018-04-09', periods=2000, freq='1D1min')
ts = ps.DataFrame({'A': ['timestamp']}, index=i)

In [0]:
print(ts)

In [0]:
len(ts)

In [0]:
ts.between_time('0:15', '0:16')

In [0]:
ts.to_pandas().between_time('0:15', '0:16')

In [0]:
ts.pandas_on_spark.apply_batch(func=lambda pdf: pdf.between_time('0:15', '0:16'))

## Using SQL in pandas API on Spark

In [0]:
psdf = ps.DataFrame({'year': [1990, 1997, 2003, 2009, 2014],
                     'rabbit': [20, 18, 489, 675, 1776],
                     'horse': [4, 25, 281, 600, 1900]})

In [0]:
pdf = pd.DataFrame({'year': [1990, 1997, 2003, 2009, 2014],
                    'sheep': [22, 50, 121, 445, 791],
                    'chicken': [250, 326, 589, 1241, 2118]})

In [0]:
psdf

In [0]:
pdf

In [0]:
# Query the pandas-on-Spark DataFrame with SQL; the `{psdf}` placeholder is
# bound to the DataFrame through the keyword argument of the same name.
query = "SELECT * FROM {psdf} WHERE rabbit > 100"
ps.sql(query, psdf=psdf)

In [0]:
ps.sql('''
    SELECT ps.rabbit, pd.chicken
    FROM {psdf} ps INNER JOIN {pdf} pd
    ON ps.year = pd.year
    ORDER BY ps.rabbit, pd.chicken''', psdf=psdf, pdf=pdf)

# Part 3: Working with PySpark

## Conversion from and to PySpark DataFrame

In [0]:
import numpy as np
import pandas as pd
import pyspark.pandas as ps

In [0]:
# Creating a pandas-on-spark DataFrame
psdf = ps.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50]})

In [0]:
psdf

In [0]:
type(psdf)

In [0]:
# Converting pandas-on-spark DataFrame to Spark DataFrame
sdf = psdf.to_spark()

In [0]:
sdf

In [0]:
sdf.show()

In [0]:
type(sdf)

In [0]:
# NOTE: `to_pandas_on_spark()` is deprecated in recent PySpark releases in
# favor of `pandas_api()` (used in a later cell); it is kept here for
# comparison only -- confirm availability on your Spark version.
psdf_2 = sdf.to_pandas_on_spark()

In [0]:
type(psdf_2)

In [0]:
psdf_3 = sdf.pandas_api()

In [0]:
type(psdf_3)

In [0]:
psdf_3

## Checking Spark execution plans

In [0]:
from pyspark.pandas import option_context

with option_context(
        "compute.ops_on_diff_frames", True,
        "compute.default_index_type", 'distributed'):
    df = ps.range(10) + ps.range(10)
    df.spark.explain()

In [0]:
with option_context(
        "compute.ops_on_diff_frames", False,
        "compute.default_index_type", 'distributed'):
    df = ps.range(10)
    df = df + df
    df.spark.explain()

## Caching DataFrames

In [0]:
# Use the 'distributed' default index type only inside this block.
with option_context("compute.default_index_type", 'distributed'):
    df = ps.range(10)
    new_df = (df + df).spark.cache()  # `(df + df)` is cached here as `new_df`
    new_df.spark.explain()

In [0]:
new_df.spark.unpersist()

In [0]:
# The cached frame is used as a context manager here: it is bound to `df`
# (rebinding the earlier `df`) for the body, and is expected to be uncached
# automatically when the block exits, unlike the explicit `unpersist()` above.
with (df + df).spark.cache() as df:
    df.spark.explain()