# An introduction to Apache Spark

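Spark is a distributed computing engine for large-scale data processing. It lets a user work with data that is too large for a single machine ("Yuge Data") by spreading the work across a cluster of machines, for example in a cloud environment.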

In [1]:
import pyspark
from pyspark.sql.functions import col, expr, desc, asc
from pyspark.sql.functions import concat, sum, avg, count, min, max
from pyspark.sql.functions import lit, regexp_extract, regexp_replace

import pandas as pd
import numpy as np
import multiprocessing as mp

from pydataset import data
from warnings import filterwarnings

# Suppress every warning category so warnings do not clutter the cell output.
# NOTE: this also hides deprecation notices; remove it when debugging.
filterwarnings('ignore')
# Seed NumPy's global random generator so the randomly drawn `group`
# labels in the dummy dataframe below are reproducible.
np.random.seed(456)

In [2]:
# To use pyspark, we first need a `SparkSession`, the entry point used
# to create dataframes that work with spark.
# .getOrCreate() returns the SparkSession that is already running,
# or creates a new one if none exists yet.

# Creates a `Spark Session`
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Build a small pandas dataframe of dummy data: `n` counts from 0 to 19
# and `group` is a random label drawn from "a", "b", "c" for each row.
df_dummy = pd.DataFrame(
    {
        "n": np.arange(20),
        "group": np.random.choice(["a", "b", "c"], 20),
    }
)
df_dummy

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [4]:
# Convert the pandas dataframe into a spark dataframe by passing it
# to `spark.createDataFrame()`.

df = spark.createDataFrame(df_dummy)
df

DataFrame[n: bigint, group: string]

In [5]:
# Confirm that `df` is a spark dataframe, not a pandas dataframe.
type(df)

pyspark.sql.dataframe.DataFrame

In [6]:
# Similar to pandas, spark has its own describe function.
# On its own, .describe() only returns a new dataframe (displayed as its
# schema); to see the descriptive stats of each column, call .show() on it.
df.describe()

DataFrame[summary: string, n: string, group: string]

Information about the `describe()` function from the docs.

``` python
'''
Computes basic statistics for numeric and string columns.

This include count, mean, stddev, min, and max. If no columns are
given, this function computes statistics for all numerical or string columns.

.. note:: This function is meant for exploratory data analysis, as we make no
    guarantee about the backward compatibility of the schema of the resulting
    :class:`DataFrame`.
    
>>> df.describe().show()
'''
```

In [7]:
# We are telling spark: for each column, compute descriptive statistics,
# then print the resulting table with .show().
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



## Fundamentals of Spark using the MPG dataset

Let's create a spark dataframe from 'mpg', one of the curated datasets available through the `pydataset` package.

__mpg dataset Documentation__
```python
'''
mpg

PyDataset Documentation (adopted from R Documentation. The displayed examples are in R)

## Fuel economy data from 1999 and 2008 for 38 popular models of car

### Description

This dataset contains a subset of the fuel economy data that the EPA makes
available on http://fueleconomy.gov. It contains only models which had a new
release every year between 1999 and 2008 - this was used as a proxy for the
popularity of the car.

### Usage

    data(mpg)

### Format

A data frame with 234 rows and 11 variables

### Details

  * manufacturer. 

  * model. 

  * displ. engine displacement, in litres 

  * year. 

  * cyl. number of cylinders 

  * trans. type of transmission 

  * drv. f = front-wheel drive, r = rear wheel drive, 4 = 4wd 

  * cty. city miles per gallon 

  * hwy. highway miles per gallon 

  * fl. 

  * class.
'''
```



In [8]:
# Load the mpg dataframe with data('mpg') and pass it to the spark
# session's createDataFrame() to get a spark dataframe,
# then preview the first 5 rows.
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [9]:
# The datatype of the dataframe returned by spark.createDataFrame()
type(mpg)

pyspark.sql.dataframe.DataFrame

In [10]:
# Accessing a column of a spark dataframe as an attribute returns a
# `Column` object (a reference to the column), not the data itself.
mpg.hwy

Column<b'hwy'>

In [11]:
# To get the data from specific columns, use the `.select()` function.
# It returns a new spark dataframe containing only those columns
# (not a pandas-style Series); call `.show()` on it to print the rows.

# By default, the .show() function prints the first 20 rows.
mpg.select('hwy').show()

+---+
|hwy|
+---+
| 29|
| 29|
| 31|
| 30|
| 26|
| 26|
| 27|
| 26|
| 25|
| 28|
| 27|
| 25|
| 25|
| 25|
| 25|
| 24|
| 25|
| 23|
| 20|
| 15|
+---+
only showing top 20 rows



In [12]:
# .select() accepts several column names at once; the columns are
# returned in the order they are listed.
mpg.select('hwy', 'cty', 'model').show()

+---+---+------------------+
|hwy|cty|             model|
+---+---+------------------+
| 29| 18|                a4|
| 29| 21|                a4|
| 31| 20|                a4|
| 30| 21|                a4|
| 26| 16|                a4|
| 26| 18|                a4|
| 27| 18|                a4|
| 26| 18|        a4 quattro|
| 25| 16|        a4 quattro|
| 28| 20|        a4 quattro|
| 27| 19|        a4 quattro|
| 25| 15|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 15|        a4 quattro|
| 24| 15|        a6 quattro|
| 25| 17|        a6 quattro|
| 23| 16|        a6 quattro|
| 20| 14|c1500 suburban 2wd|
| 15| 11|c1500 suburban 2wd|
+---+---+------------------+
only showing top 20 rows



## Performing Arithmetic Operations

In [13]:
# Spark can perform mathematical operations on entire columns,
# similar to numpy broadcasting. The computed column is named
# after the expression itself, here `(hwy + 1)`.
mpg.select(mpg.hwy, mpg.hwy + 1).show(5)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
| 31|       32|
| 30|       31|
| 26|       27|
+---+---------+
only showing top 5 rows



In [14]:
# Dividing the integer `hwy` column yields floating point values.
mpg.select(mpg.hwy, mpg.hwy/2).show(5)

+---+---------+
|hwy|(hwy / 2)|
+---+---------+
| 29|     14.5|
| 29|     14.5|
| 31|     15.5|
| 30|     15.0|
| 26|     13.0|
+---+---------+
only showing top 5 rows



In [15]:
# Multiplying the integer `hwy` column by an integer keeps integer values.
mpg.select(mpg.hwy, mpg.hwy*2).show(5)

+---+---------+
|hwy|(hwy * 2)|
+---+---------+
| 29|       58|
| 29|       58|
| 31|       62|
| 30|       60|
| 26|       52|
+---+---------+
only showing top 5 rows



## Renaming Columns

In [16]:
# Similar to `AS` in SQL, `.alias()` renames a column in the selected
# result; the `mpg` dataframe itself keeps its original column names.
mpg.select(mpg.hwy.alias("highway_mileage")).show(5)

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



In [17]:
# Without .show(), .describe() only displays the schema of the summary
# dataframe; note that every summary column is typed as a string.
mpg.describe()

DataFrame[summary: string, manufacturer: string, model: string, displ: string, year: string, cyl: string, trans: string, drv: string, cty: string, hwy: string, fl: string, class: string]

In [18]:
# Column expressions can be stored in variables and reused.
# `col1` is the `hwy` column under a new name.
col1 = mpg.hwy.alias("highway_mileage")
# `col2` is a computed column; the parentheses make sure .alias() is
# applied to the result of the division rather than to the literal 2.
col2 = (mpg.hwy / 2).alias("highway_mileage_halved")
mpg.select(col1, col2).show(5)

+---------------+----------------------+
|highway_mileage|highway_mileage_halved|
+---------------+----------------------+
|             29|                  14.5|
|             29|                  14.5|
|             31|                  15.5|
|             30|                  15.0|
|             26|                  13.0|
+---------------+----------------------+
only showing top 5 rows



## col and expr functions

In [19]:
# expr() builds a column from a SQL expression string, so a column can
# be renamed with SQL's `AS` keyword instead of `.alias()`.
mpg.select( 
    expr('hwy AS highway_mileage')
).show()

+---------------+
|highway_mileage|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
|             26|
|             27|
|             26|
|             25|
|             28|
|             27|
|             25|
|             25|
|             25|
|             25|
|             24|
|             25|
|             23|
|             20|
|             15|
+---------------+
only showing top 20 rows



## Extracting Text with Regular Expressions

In [20]:
# Build a one-column spark dataframe of street addresses to practice
# text extraction on.
textdf = spark.createDataFrame(
    pd.DataFrame(
        [
            "600 Navarro St ste 600, San Antonio, TX 78205",
            "3130 Broadway St, San Antonio, TX 78209",
            "303 Pearl Pkwy, San Antonio, TX 78215",
            "1255 SW Loop 410, San Antonio, TX 78227",
        ],
        columns=["address"],
    )
)

# truncate=False prints each address in full instead of cutting off
# long strings.
textdf.show(truncate=False)

+---------------------------------------------+
|address                                      |
+---------------------------------------------+
|600 Navarro St ste 600, San Antonio, TX 78205|
|3130 Broadway St, San Antonio, TX 78209      |
|303 Pearl Pkwy, San Antonio, TX 78215        |
|1255 SW Loop 410, San Antonio, TX 78227      |
+---------------------------------------------+



In [21]:
# regexp_extract(column, pattern, group) returns the text matched by the
# given capture group of the pattern.
textdf.select(
    "address",
    # Group 1: the run of digits at the very start of the address.
    regexp_extract("address", r"^(\d+)", 1).alias("street_no"),
    # Group 1: the words after the street number and one whitespace
    # character, up to (not including) the first comma. The lazy `+?`
    # stops the match at the first comma.
    regexp_extract("address", r"^\d+\s([\w\s]+?),", 1).alias("street"),
).show(truncate=False)

+---------------------------------------------+---------+------------------+
|address                                      |street_no|street            |
+---------------------------------------------+---------+------------------+
|600 Navarro St ste 600, San Antonio, TX 78205|600      |Navarro St ste 600|
|3130 Broadway St, San Antonio, TX 78209      |3130     |Broadway St       |
|303 Pearl Pkwy, San Antonio, TX 78215        |303      |Pearl Pkwy        |
|1255 SW Loop 410, San Antonio, TX 78227      |1255     |SW Loop 410       |
+---------------------------------------------+---------+------------------+



## Appendix

In [22]:
# Number of logical CPUs (hardware threads) reported for this machine.
mp.cpu_count()

12