# Sampling and other df APIs

In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession, Row
import pyspark.sql.functions as F
import pyspark.sql.window as W

In [2]:
# Create (or reuse) the SparkSession that every cell in this notebook works with.
spark = (
    SparkSession.builder
    .appName("Spark Training - DF APIs")
    .getOrCreate()
)

### Sampling
Analysing a large dataset with millions of rows can take a lot of time, so it is often better to work on a random subset of the data. In this section, we'll discuss the sampling methods available in Spark.

### 1.1 sample(withReplacement=None, fraction=None, seed=None)
- withReplacement: If true, the same row can be drawn more than once, so a value can appear several times in the sample
- fraction: can be between 0 and 1. If 0.3, pyspark will try to return 30% of the records, but the exact count is not guaranteed

In [4]:
# Single-column DataFrame (`id`) holding the integers 0..99; this is the population we sample from.
df = spark.range(100)
df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+
only showing top 20 rows



In [6]:
# without replacement: each id can appear at most once in the sample.
# No seed is passed here, so the rows picked can differ from run to run.
df.sample(fraction=0.2).show()

+---+
| id|
+---+
|  0|
| 10|
| 16|
| 24|
| 27|
| 29|
| 36|
| 47|
| 51|
| 53|
| 56|
| 61|
| 68|
| 75|
| 80|
| 82|
| 88|
| 91|
| 95|
| 99|
+---+



In [11]:
# Sample with replacement, using a fixed seed for the draw.
(
    df.sample(withReplacement=True, fraction=0.2, seed=200)
    .show()
)
# Some ids are drawn more than once, e.g. 14 and 21.

+---+
| id|
+---+
|  0|
|  2|
| 12|
| 14|
| 14|
| 16|
| 21|
| 21|
| 30|
| 33|
| 35|
| 37|
| 38|
| 46|
| 50|
| 51|
| 52|
| 58|
| 63|
| 74|
+---+
only showing top 20 rows



### 1.2 sampleBy(self, col, fractions, seed=None)
- Returns a stratified sample without replacement based on the fraction given on each stratum.
- If a stratum is not specified, we treat its fraction as zero

In [15]:
# Derive a `key` column (id modulo 3) so the rows fall into three strata: 0, 1 and 2.
df = (
    spark.range(100)
    .select((F.col('id') % 3).alias('key'))
)
df.show()

+---+
|key|
+---+
|  0|
|  1|
|  2|
|  0|
|  1|
|  2|
|  0|
|  1|
|  2|
|  0|
|  1|
|  2|
|  0|
|  1|
|  2|
|  0|
|  1|
|  2|
|  0|
|  1|
+---+
only showing top 20 rows



In [19]:
# Confirm the strata: the only keys present are 0, 1 and 2.
df.distinct().show()

+---+
|key|
+---+
|  0|
|  1|
|  2|
+---+



In [25]:
# Stratified sample: keep roughly 10% of the rows with key 0 and roughly 20% of the rows with key 1.
# Key 2 is not listed in `fractions`, so its fraction is treated as zero and none of its rows are kept.
sampled = df.sampleBy('key', fractions={0:0.1, 1:0.2}, seed=10)
sampled.show()

+---+
|key|
+---+
|  0|
|  1|
|  1|
|  1|
|  1|
|  1|
|  1|
+---+



In [26]:
# Let's group by key to see how many rows were drawn from each stratum.
sampled.groupBy('key').count().show()

+---+-----+
|key|count|
+---+-----+
|  0|    1|
|  1|    6|
+---+-----+



In [33]:
# Prepare input data: one tuple per employee, with fields in the same order as `schema` below.
# James's salary is None so that the examples further down have a null value to work with.
data = [
    ('James', 'Sales', 'NY', None, 34),
    ('Alicia', 'Sales', 'NY', 8600, 56),
    ('Robert', 'Sales', 'CA', 8100, 30),
    ('John', 'Sales', 'AZ', 8600, 31),
    ('Rose', 'Sales', 'AZ', 8100, 33),
    ('Lisa', 'Finance', 'CA', 9000, 24),
    ('Deja', 'Finance', 'CA', 9900, 40),
    ('Sugie', 'Finance', 'NY', 8300, 36),
    ('Ram', 'Finance', 'NY', 7900, 53),
    ('Kyle', 'Marketing', 'CA', 8000, 25),
    ('Reid', 'Marketing', 'NY', 9100, 50)
]

# Column names only; no column types are given here.
schema = ['empname', 'dept', 'state', 'salary', 'age']
# Rebinds `df` (until now the sampling frame) to the employee data used from here on.
df = spark.createDataFrame(data=data, schema=schema)

df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  null| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Rose|    Sales|   AZ|  8100| 33|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



### 1.2 first(col, ignorenulls=False) / similarly last()

In [35]:
# Preview the first five rows; note that the very first salary (James) is null.
df.show(5)

+-------+-----+-----+------+---+
|empname| dept|state|salary|age|
+-------+-----+-----+------+---+
|  James|Sales|   NY|  null| 34|
| Alicia|Sales|   NY|  8600| 56|
| Robert|Sales|   CA|  8100| 30|
|   John|Sales|   AZ|  8600| 31|
|   Rose|Sales|   AZ|  8100| 33|
+-------+-----+-----+------+---+
only showing top 5 rows



In [36]:
# ignorenulls is left at its default (False), so the null salary of the first row is returned.
df.select(F.first(df.salary)).show()

+-------------+
|first(salary)|
+-------------+
|         null|
+-------------+



In [37]:
# Let's see the first non-null value instead: nulls are skipped, so 8600 is returned.
df.select(F.first(df.salary, ignorenulls=True)).show()

+-------------+
|first(salary)|
+-------------+
|         8600|
+-------------+



### 1.3 greatest(`*`cols) & similarly least(*)
Returns, for each row, the greatest value among the given columns, skipping null values

In [39]:
# Row-wise maximum of salary and age; James's null salary is skipped, so his age (34) is returned.
df.select(F.greatest(df.salary, df.age)).show()

+---------------------+
|greatest(salary, age)|
+---------------------+
|                   34|
|                 8600|
|                 8100|
|                 8600|
|                 8100|
|                 9000|
|                 9900|
|                 8300|
|                 7900|
|                 8000|
|                 9100|
+---------------------+



### 1.4 skewness(col)

In [40]:
# Preview the rows whose salary column is summarised in the next cell.
df.show(5)

+-------+-----+-----+------+---+
|empname| dept|state|salary|age|
+-------+-----+-----+------+---+
|  James|Sales|   NY|  null| 34|
| Alicia|Sales|   NY|  8600| 56|
| Robert|Sales|   CA|  8100| 30|
|   John|Sales|   AZ|  8600| 31|
|   Rose|Sales|   AZ|  8100| 33|
+-------+-----+-----+------+---+
only showing top 5 rows



In [41]:
# Skewness of the salary column, returned as a single aggregated value.
df.select(F.skewness(df.salary)).show()

+------------------+
|  skewness(salary)|
+------------------+
|0.9433822103481873|
+------------------+



### 1.5 collect_list(col)
Returns a list of objects with duplicates

In [44]:
# Gather every age into one array; truncate=False prints the whole list instead of cutting it off.
df.select(F.collect_list(df.age)).show(truncate=False)

+--------------------------------------------+
|collect_list(age)                           |
+--------------------------------------------+
|[34, 56, 30, 31, 33, 24, 40, 36, 53, 25, 50]|
+--------------------------------------------+



### `1.6 Dataframe built-in functions`
- new column
- encryption
- string
- RegExp
- Date
- Null
- Collection
- Na
- Math & Statistics
- Explode & Flatten
- Formatting
- Json

###  1.6.1 monotonically_increasing_id()
- A column that generates monotonically increasing 64-bit integers
- The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive
- `Use Case`: Create a Primary key/unique column

In [45]:
# Employee data as it stands, before an id column is added.
df.show()

+-------+---------+-----+------+---+
|empname|     dept|state|salary|age|
+-------+---------+-----+------+---+
|  James|    Sales|   NY|  null| 34|
| Alicia|    Sales|   NY|  8600| 56|
| Robert|    Sales|   CA|  8100| 30|
|   John|    Sales|   AZ|  8600| 31|
|   Rose|    Sales|   AZ|  8100| 33|
|   Lisa|  Finance|   CA|  9000| 24|
|   Deja|  Finance|   CA|  9900| 40|
|  Sugie|  Finance|   NY|  8300| 36|
|    Ram|  Finance|   NY|  7900| 53|
|   Kyle|Marketing|   CA|  8000| 25|
|   Reid|Marketing|   NY|  9100| 50|
+-------+---------+-----+------+---+



In [46]:
# Add a unique, increasing `id`; the values are not consecutive (e.g. 0 is followed by 8589934592).
df.withColumn('id', F.monotonically_increasing_id()).show()

+-------+---------+-----+------+---+-----------+
|empname|     dept|state|salary|age|         id|
+-------+---------+-----+------+---+-----------+
|  James|    Sales|   NY|  null| 34|          0|
| Alicia|    Sales|   NY|  8600| 56| 8589934592|
| Robert|    Sales|   CA|  8100| 30|17179869184|
|   John|    Sales|   AZ|  8600| 31|17179869185|
|   Rose|    Sales|   AZ|  8100| 33|25769803776|
|   Lisa|  Finance|   CA|  9000| 24|34359738368|
|   Deja|  Finance|   CA|  9900| 40|42949672960|
|  Sugie|  Finance|   NY|  8300| 36|42949672961|
|    Ram|  Finance|   NY|  7900| 53|51539607552|
|   Kyle|Marketing|   CA|  8000| 25|60129542144|
|   Reid|Marketing|   NY|  9100| 50|60129542145|
+-------+---------+-----+------+---+-----------+



### 1.6.2 lit()
- Create a column with a fixed/static value
- Also useful when concatenating values

In [48]:
# static column: every row gets the same literal value 'USA'
df.withColumn('country', F.lit('USA')).show()

+-------+---------+-----+------+---+-------+
|empname|     dept|state|salary|age|country|
+-------+---------+-----+------+---+-------+
|  James|    Sales|   NY|  null| 34|    USA|
| Alicia|    Sales|   NY|  8600| 56|    USA|
| Robert|    Sales|   CA|  8100| 30|    USA|
|   John|    Sales|   AZ|  8600| 31|    USA|
|   Rose|    Sales|   AZ|  8100| 33|    USA|
|   Lisa|  Finance|   CA|  9000| 24|    USA|
|   Deja|  Finance|   CA|  9900| 40|    USA|
|  Sugie|  Finance|   NY|  8300| 36|    USA|
|    Ram|  Finance|   NY|  7900| 53|    USA|
|   Kyle|Marketing|   CA|  8000| 25|    USA|
|   Reid|Marketing|   NY|  9100| 50|    USA|
+-------+---------+-----+------+---+-------+



In [60]:
# Concatenate salary and age, with literal text in between and after.
# James's salary is null, and his concatenated value comes out null as well.
df.withColumn(
    'sal_n_age',
    F.concat('salary', F.lit('$ & '), 'age', F.lit('yrs')),
).show()

+-------+---------+-----+------+---+-------------+
|empname|     dept|state|salary|age|    sal_n_age|
+-------+---------+-----+------+---+-------------+
|  James|    Sales|   NY|  null| 34|         null|
| Alicia|    Sales|   NY|  8600| 56|8600$ & 56yrs|
| Robert|    Sales|   CA|  8100| 30|8100$ & 30yrs|
|   John|    Sales|   AZ|  8600| 31|8600$ & 31yrs|
|   Rose|    Sales|   AZ|  8100| 33|8100$ & 33yrs|
|   Lisa|  Finance|   CA|  9000| 24|9000$ & 24yrs|
|   Deja|  Finance|   CA|  9900| 40|9900$ & 40yrs|
|  Sugie|  Finance|   NY|  8300| 36|8300$ & 36yrs|
|    Ram|  Finance|   NY|  7900| 53|7900$ & 53yrs|
|   Kyle|Marketing|   CA|  8000| 25|8000$ & 25yrs|
|   Reid|Marketing|   NY|  9100| 50|9100$ & 50yrs|
+-------+---------+-----+------+---+-------------+



### 1.6.3 expr(str)
- Takes an SQL expression as string argument, executes the expression and returns a column Type
- We can use SQL-like functions that are not present in pyspark column type and built-in functions (pyspark.sql.functions) e.g. `CASE WHEN`, `Concat operator` etc

In [62]:
# use case 1: call a SQL function (length) through a string expression
df.withColumn('empname_len', F.expr("length(empname)")).show()

+-------+---------+-----+------+---+-----------+
|empname|     dept|state|salary|age|empname_len|
+-------+---------+-----+------+---+-----------+
|  James|    Sales|   NY|  null| 34|          5|
| Alicia|    Sales|   NY|  8600| 56|          6|
| Robert|    Sales|   CA|  8100| 30|          6|
|   John|    Sales|   AZ|  8600| 31|          4|
|   Rose|    Sales|   AZ|  8100| 33|          4|
|   Lisa|  Finance|   CA|  9000| 24|          4|
|   Deja|  Finance|   CA|  9900| 40|          4|
|  Sugie|  Finance|   NY|  8300| 36|          5|
|    Ram|  Finance|   NY|  7900| 53|          3|
|   Kyle|Marketing|   CA|  8000| 25|          4|
|   Reid|Marketing|   NY|  9100| 50|          4|
+-------+---------+-----+------+---+-----------+



In [63]:
# use case 2: SQL CASE WHEN - ages above 50 become 'senior', everyone else 'adult'
df.withColumn(
    'age_groups',
    F.expr("case when age > 50 then 'senior' else 'adult' end"),
).show()

+-------+---------+-----+------+---+----------+
|empname|     dept|state|salary|age|age_groups|
+-------+---------+-----+------+---+----------+
|  James|    Sales|   NY|  null| 34|     adult|
| Alicia|    Sales|   NY|  8600| 56|    senior|
| Robert|    Sales|   CA|  8100| 30|     adult|
|   John|    Sales|   AZ|  8600| 31|     adult|
|   Rose|    Sales|   AZ|  8100| 33|     adult|
|   Lisa|  Finance|   CA|  9000| 24|     adult|
|   Deja|  Finance|   CA|  9900| 40|     adult|
|  Sugie|  Finance|   NY|  8300| 36|     adult|
|    Ram|  Finance|   NY|  7900| 53|    senior|
|   Kyle|Marketing|   CA|  8000| 25|     adult|
|   Reid|Marketing|   NY|  9100| 50|     adult|
+-------+---------+-----+------+---+----------+



In [69]:
# use case 3: arithmetic on a column written as a SQL expression
df.withColumn('age_plus_10', F.expr("age + 10")).show()

+-------+---------+-----+------+---+-----------+
|empname|     dept|state|salary|age|age_plus_10|
+-------+---------+-----+------+---+-----------+
|  James|    Sales|   NY|  null| 34|         44|
| Alicia|    Sales|   NY|  8600| 56|         66|
| Robert|    Sales|   CA|  8100| 30|         40|
|   John|    Sales|   AZ|  8600| 31|         41|
|   Rose|    Sales|   AZ|  8100| 33|         43|
|   Lisa|  Finance|   CA|  9000| 24|         34|
|   Deja|  Finance|   CA|  9900| 40|         50|
|  Sugie|  Finance|   NY|  8300| 36|         46|
|    Ram|  Finance|   NY|  7900| 53|         63|
|   Kyle|Marketing|   CA|  8000| 25|         35|
|   Reid|Marketing|   NY|  9100| 50|         60|
+-------+---------+-----+------+---+-----------+



### 1.6.4 spark_partition_id()
- Generates a column with partition ids

In [70]:
# Build a small 10-row DataFrame for the partition-id demo.
df1 = spark.range(10)
# Report the partition count of df1 itself (the frame just created),
# not of the unrelated employee DataFrame `df`.
df1.rdd.getNumPartitions()

8

In [71]:
# Redistribute df1 into 5 partitions and confirm the new partition count.
df1 = df1.repartition(5)
df1.rdd.getNumPartitions()

5

In [76]:
# since we have 5 partitions, the partition ids run from 0 to 4
df1.select('id', F.spark_partition_id()).show()

+---+--------------------+
| id|SPARK_PARTITION_ID()|
+---+--------------------+
|  3|                   0|
|  0|                   1|
|  1|                   1|
|  4|                   1|
|  7|                   2|
|  8|                   2|
|  5|                   3|
|  6|                   3|
|  9|                   3|
|  2|                   4|
+---+--------------------+



### 1.6.5 rand(seed) and randn(seed)
- `rand:` generates a column with independent and identically distributed (iid) samples from a uniform distribution over [0.0, 1.0), i.e. every value in that range is equally likely
- `randn:` generates a column with independent and identically distributed (iid) samples from a standard normal distribution (bell shaped)

In [80]:
# rand() draws from a uniform distribution, so only non-negative values appear
# (the PySpark docs give the range as [0.0, 1.0)); a fixed seed is passed.
df.withColumn('rand_col', F.rand(seed=70)).show(truncate=False)

+-------+---------+-----+------+---+-------------------+
|empname|dept     |state|salary|age|rand_col           |
+-------+---------+-----+------+---+-------------------+
|James  |Sales    |NY   |null  |34 |0.9686366478115398 |
|Alicia |Sales    |NY   |8600  |56 |0.7578638408379902 |
|Robert |Sales    |CA   |8100  |30 |0.9142913539241686 |
|John   |Sales    |AZ   |8600  |31 |0.6124331254386841 |
|Rose   |Sales    |AZ   |8100  |33 |0.3576198156706625 |
|Lisa   |Finance  |CA   |9000  |24 |0.28526473713733846|
|Deja   |Finance  |CA   |9900  |40 |0.8358218713663149 |
|Sugie  |Finance  |NY   |8300  |36 |0.7937570767235927 |
|Ram    |Finance  |NY   |7900  |53 |0.3876281875521618 |
|Kyle   |Marketing|CA   |8000  |25 |0.25100516160693154|
|Reid   |Marketing|NY   |9100  |50 |0.8977705819290853 |
+-------+---------+-----+------+---+-------------------+



In [81]:
# randn() draws from a standard normal distribution, so both positive and negative values appear
df.withColumn('rand_norm', F.randn(seed=90)).show()

+-------+---------+-----+------+---+--------------------+
|empname|     dept|state|salary|age|           rand_norm|
+-------+---------+-----+------+---+--------------------+
|  James|    Sales|   NY|  null| 34|  0.5879349573780887|
| Alicia|    Sales|   NY|  8600| 56|-0.24527770770278648|
| Robert|    Sales|   CA|  8100| 30| -0.6046413008367373|
|   John|    Sales|   AZ|  8600| 31|-0.35642077694085117|
|   Rose|    Sales|   AZ|  8100| 33|  1.0779880693130197|
|   Lisa|  Finance|   CA|  9000| 24|-0.41863979724682465|
|   Deja|  Finance|   CA|  9900| 40| -0.6200861214441888|
|  Sugie|  Finance|   NY|  8300| 36|  0.6094312221458198|
|    Ram|  Finance|   NY|  7900| 53|-0.16990689230803258|
|   Kyle|Marketing|   CA|  8000| 25|0.003890795511576...|
|   Reid|Marketing|   NY|  9100| 50| -0.8070887599283788|
+-------+---------+-----+------+---+--------------------+



### 2. String Manipulation Functions

### 2.1 split(str, pattern)
- Splits str around matches of pattern (pattern is a regular expression)
- Can be used to extract positional elements from delimited fields

In [82]:
# Read the comma-separated orders data using an explicit DDL schema string.
# NOTE(review): the name `ord` shadows Python's built-in ord() for the rest of the notebook.
ord = spark.read.load(
    'PracticeFiles/Orders',
    format='csv',
    sep=',',
    schema=('order_id int,order_date timestamp, order_customer_id int, order_status string'),
)
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [86]:
# use case 1: break order_date into a list of pieces at every hyphen
ord.select(
    'order_date',
    F.split(ord.order_date, pattern='-').alias('order_date_list'),
).show(5)

+-------------------+--------------------+
|         order_date|     order_date_list|
+-------------------+--------------------+
|2013-07-25 00:00:00|[2013, 07, 25 00:...|
|2013-07-25 00:00:00|[2013, 07, 25 00:...|
|2013-07-25 00:00:00|[2013, 07, 25 00:...|
|2013-07-25 00:00:00|[2013, 07, 25 00:...|
|2013-07-25 00:00:00|[2013, 07, 25 00:...|
+-------------------+--------------------+
only showing top 5 rows



In [87]:
# take element 0 of the list produced by split, i.e. the year
ord.select(
    'order_date',
    F.split(ord.order_date, pattern='-')[0].alias('order_year'),
).show(5)

+-------------------+----------+
|         order_date|order_year|
+-------------------+----------+
|2013-07-25 00:00:00|      2013|
|2013-07-25 00:00:00|      2013|
|2013-07-25 00:00:00|      2013|
|2013-07-25 00:00:00|      2013|
|2013-07-25 00:00:00|      2013|
+-------------------+----------+
only showing top 5 rows



In [100]:
# use case 2: a string whose letter groups are separated by runs of digits
# (rebinds df1, which previously held the repartitioned range)
df1 = spark.createDataFrame([('abc2cd23fe27kI',)], ['s',])
df1.show()

+--------------+
|             s|
+--------------+
|abc2cd23fe27kI|
+--------------+



In [101]:
# The pattern is a regex: split wherever one or more digits occur, leaving only the letter groups.
df1.select(F.split(df1.s, '[0-9]+')).show()

+--------------------+
|split(s, [0-9]+, -1)|
+--------------------+
|   [abc, cd, fe, kI]|
+--------------------+



### 2.2 length(col)

In [103]:
# Number of characters in each order_status value (e.g. 'CLOSED' -> 6).
ord.withColumn('length_status', F.length(ord.order_status)).show(5)

+--------+-------------------+-----------------+---------------+-------------+
|order_id|         order_date|order_customer_id|   order_status|length_status|
+--------+-------------------+-----------------+---------------+-------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|            6|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|           15|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|            8|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|            6|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|            8|
+--------+-------------------+-----------------+---------------+-------------+
only showing top 5 rows



### 2.3 lower(col), upper(col), initcap(col)
- `initcap():` capitalises the first letter of each word in a string and leaves the remaining letters in lower case. Similar to propcase
- Same as with other languages 

In [104]:
# Only the leading letter ends up upper-case here: 'PENDING_PAYMENT' becomes 'Pending_payment'.
ord.withColumn('init_cap', F.initcap(ord.order_status)).show(5)

+--------+-------------------+-----------------+---------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|       init_cap|
+--------+-------------------+-----------------+---------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|         Closed|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|Pending_payment|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|       Complete|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|         Closed|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|       Complete|
+--------+-------------------+-----------------+---------------+---------------+
only showing top 5 rows



### 2.3 ltrim(col), rtrim(col), trim(col)

In [109]:
# Two sample strings padded with leading and trailing spaces, for the trim examples.
df2 = spark.createDataFrame(
    [('  spark ',), ('   developer    ',)],
    schema=['col1'],
)
df2.show()

+----------------+
|            col1|
+----------------+
|          spark |
|   developer    |
+----------------+



Notice the presence of leading and trailing spaces

In [110]:
# trim() removes the spaces from both ends of col1.
df2.withColumn('trimmed_value', F.trim('col1')).show()

+----------------+-------------+
|            col1|trimmed_value|
+----------------+-------------+
|          spark |        spark|
|   developer    |    developer|
+----------------+-------------+



### 2.4 lpad(col, len, pad), rpad(col, len, pad)
Pads the string columns to width 'len' with 'pad'

`e.g:` in the order df, convert order_id to a field with length 10 by padding zeros to the left (front)

In [115]:
# Preview the orders before order_id is padded.
ord.show(3)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 3 rows



In [114]:
# Left-pad order_id with zeros up to a width of 10 characters.
ord.withColumn(
    'padding',
    F.lpad(ord.order_id, len=10, pad='0'),
).show(3)

+--------+-------------------+-----------------+---------------+----------+
|order_id|         order_date|order_customer_id|   order_status|   padding|
+--------+-------------------+-----------------+---------------+----------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|0000000001|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|0000000002|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|0000000003|
+--------+-------------------+-----------------+---------------+----------+
only showing top 3 rows



### 2.5 reverse(col)
- Returns a reversed string

In [117]:
# Reverse the characters of order_status (e.g. 'CLOSED' -> 'DESOLC').
ord.withColumn('reversed_status', F.reverse(ord.order_status)).show(3)

+--------+-------------------+-----------------+---------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|reversed_status|
+--------+-------------------+-----------------+---------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|         DESOLC|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|TNEMYAP_GNIDNEP|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|       ETELPMOC|
+--------+-------------------+-----------------+---------------+---------------+
only showing top 3 rows



### 2.6 repeat(col, n)
- Repeats a string column n times

In [119]:
# Repeat each order_status three times back to back; truncate=False shows the full strings.
ord.withColumn(
    'repeat_status',
    F.repeat(ord.order_status, n=3),
).show(3, truncate=False)

+--------+-------------------+-----------------+---------------+---------------------------------------------+
|order_id|order_date         |order_customer_id|order_status   |repeat_status                                |
+--------+-------------------+-----------------+---------------+---------------------------------------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |CLOSEDCLOSEDCLOSED                           |
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|PENDING_PAYMENTPENDING_PAYMENTPENDING_PAYMENT|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |COMPLETECOMPLETECOMPLETE                     |
+--------+-------------------+-----------------+---------------+---------------------------------------------+
only showing top 3 rows



### 2.7 hex(col)
- Computes hex value of a given column

In [121]:
# Hex representation of order_status; truncate=False keeps the long hex strings intact.
ord.withColumn('hex', F.hex(ord.order_status)).show(3, truncate=False)

+--------+-------------------+-----------------+---------------+------------------------------+
|order_id|order_date         |order_customer_id|order_status   |hex                           |
+--------+-------------------+-----------------+---------------+------------------------------+
|1       |2013-07-25 00:00:00|11599            |CLOSED         |434C4F534544                  |
|2       |2013-07-25 00:00:00|256              |PENDING_PAYMENT|50454E44494E475F5041594D454E54|
|3       |2013-07-25 00:00:00|12111            |COMPLETE       |434F4D504C455445              |
+--------+-------------------+-----------------+---------------+------------------------------+
only showing top 3 rows



#### 2.8 concat(*cols)
concatenates multiple columns together into a single column

In [122]:
# Preview the orders before concatenating columns.
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [124]:
# Task 1 - concatenate order id and status directly, with nothing in between
ord.withColumn('IDStatus', F.concat('order_id','order_status')).show(5)

+--------+-------------------+-----------------+---------------+----------------+
|order_id|         order_date|order_customer_id|   order_status|        IDStatus|
+--------+-------------------+-----------------+---------------+----------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|         1CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|2PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|       3COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|         4CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|       5COMPLETE|
+--------+-------------------+-----------------+---------------+----------------+
only showing top 5 rows



In [127]:
# Task 2 - concatenate order id and status with a single space between them,
# supplying the space as a literal column
ord.withColumn(
    'IDStatus',
    F.concat('order_id', F.lit(' '), 'order_status'),
).show(5)

+--------+-------------------+-----------------+---------------+-----------------+
|order_id|         order_date|order_customer_id|   order_status|         IDStatus|
+--------+-------------------+-----------------+---------------+-----------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|         1 CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|2 PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|       3 COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|         4 CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|       5 COMPLETE|
+--------+-------------------+-----------------+---------------+-----------------+
only showing top 5 rows



#### 2.9 concat_ws(sep, *cols)
- Similar to concat() but this allows you to provide a separator

In [133]:
# Same output as the concat() + lit(' ') cell, but the separator is passed once as the first argument.
ord.withColumn('IDStatus', F.concat_ws(' ',  'order_id','order_status')).show(5)

+--------+-------------------+-----------------+---------------+-----------------+
|order_id|         order_date|order_customer_id|   order_status|         IDStatus|
+--------+-------------------+-----------------+---------------+-----------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|         1 CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|2 PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|       3 COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|         4 CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|       5 COMPLETE|
+--------+-------------------+-----------------+---------------+-----------------+
only showing top 5 rows



#### 2.10 substring(str, pos, len)
- Starts retrieving substring from pos and is of length len
- Unlike sql and SAS where this only works on columns of type string (and not date fields), here this works on any column types

In [136]:
# Task - extract the date part of order_date
# notice that order_date has type timestamp, not string
ord.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [135]:
# Characters 1-10 of order_date are its yyyy-MM-dd date part.
# NOTE(review): the new column is named 'order_year' but holds the full date, not just the year.
ord.withColumn('order_year', F.substring('order_date',1,10)).show(5)

+--------+-------------------+-----------------+---------------+----------+
|order_id|         order_date|order_customer_id|   order_status|order_year|
+--------+-------------------+-----------------+---------------+----------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|2013-07-25|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|2013-07-25|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|2013-07-25|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|2013-07-25|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|2013-07-25|
+--------+-------------------+-----------------+---------------+----------+
only showing top 5 rows



#### 2.11 substring_index(str, delim, count)
- Returns the substring of str that comes before `count` occurrences of the delimiter `delim`
- If count > 0, everything to the left of the count-th delimiter (counting from the left) is returned
- If count < 0, everything to the right of the final delimiter (counting from the right) is returned
- substring_index performs a case-sensitive match when searching for delim

In [137]:
# Preview the orders before applying substring_index to order_date.
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [138]:
# Task 1: keep everything before the first hyphen of order_date (the year)
ord.withColumn(
    'dummy',
    F.substring_index(ord.order_date, '-', 1),
).show(5)

+--------+-------------------+-----------------+---------------+-----+
|order_id|         order_date|order_customer_id|   order_status|dummy|
+--------+-------------------+-----------------+---------------+-----+
|       1|2013-07-25 00:00:00|            11599|         CLOSED| 2013|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT| 2013|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE| 2013|
|       4|2013-07-25 00:00:00|             8827|         CLOSED| 2013|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE| 2013|
+--------+-------------------+-----------------+---------------+-----+
only showing top 5 rows



In [139]:
# Task 2: keep everything before the second hyphen of order_date (year and month)
ord.withColumn(
    'dummy',
    F.substring_index(ord.order_date, '-', 2),
).show(5)

+--------+-------------------+-----------------+---------------+-------+
|order_id|         order_date|order_customer_id|   order_status|  dummy|
+--------+-------------------+-----------------+---------------+-------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|2013-07|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|2013-07|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|2013-07|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|2013-07|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|2013-07|
+--------+-------------------+-----------------+---------------+-------+
only showing top 5 rows



#### 2.12 instr(str, substr)
- Locates the position of the first occurrence of substr in the given string column (positions are 1-based; 0 is returned when substr is not found)
- Returns null if either of the arguments are null

In [141]:
# Task: Find the position of the first occurrence of '_' in order_status
# (1-based; statuses without an underscore get 0)
ord.withColumn('instr', F.instr('order_status', '_')).show(5)

+--------+-------------------+-----------------+---------------+-----+
|order_id|         order_date|order_customer_id|   order_status|instr|
+--------+-------------------+-----------------+---------------+-----+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|    0|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|    8|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|    0|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|    0|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|    0|
+--------+-------------------+-----------------+---------------+-----+
only showing top 5 rows



#### 2.13 locate(substr, str, pos=1)
- Locates the position of the first occurrence of `substr` in a string column, searching from position `pos` onwards
- 1 based (rather than 0 based) indexing used.
- 0 returned if substr not found

In [143]:
# Search from the start of the string: the first '00' is the hour field at position 12
(
    ord
    .withColumn('locate', F.locate('00', F.col('order_date'), pos=1))
    .show(5)
)

+--------+-------------------+-----------------+---------------+------+
|order_id|         order_date|order_customer_id|   order_status|locate|
+--------+-------------------+-----------------+---------------+------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|    12|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|    12|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|    12|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|    12|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|    12|
+--------+-------------------+-----------------+---------------+------+
only showing top 5 rows



In [144]:
# Start searching at position 15, past the hour field: the next '00' is the minutes field
(
    ord
    .withColumn('locate', F.locate('00', F.col('order_date'), pos=15))
    .show(5)
)

+--------+-------------------+-----------------+---------------+------+
|order_id|         order_date|order_customer_id|   order_status|locate|
+--------+-------------------+-----------------+---------------+------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|    15|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|    15|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|    15|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|    15|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|    15|
+--------+-------------------+-----------------+---------------+------+
only showing top 5 rows



#### 2.14 translate(srcCol, matching, replace):
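Translates every character of `srcCol` that appears in `matching` into the character at the same position in `replace`. Characters of `matching` that have no counterpart in `replace` (because `replace` is shorter) are removed.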

In [148]:
# Single-row, single-column frame holding the literal word 'translate'
df = spark.createDataFrame(data=[('translate',)], schema=['col'])
df.show()

+---------+
|      col|
+---------+
|translate|
+---------+



Task:
- within the col field, map the following characters to the following values: 'r' to '1', 'n' to '2', 'l' to '3'; 't' has no matching replacement character, so it is removed

In [151]:
# 'r' -> '1', 'n' -> '2', 'l' -> '3'; 't' has no counterpart in '123', so it is dropped
df.select(df['col'], F.translate(df['col'], 'rnlt', '123')).show()

+---------+-------------------------+
|      col|translate(col, rnlt, 123)|
+---------+-------------------------+
|translate|                  1a2s3ae|
+---------+-------------------------+



#### 2.15 overlay(src, replace, pos, len)
- New in version 3
- overlays the specified portion of 'src' with 'replace' starting with the byte position 'pos' of 'src' and proceeding for 'len' bytes

# 3. RegExp

### 3.1 regexp_extract(str, pattern, idx)
- Used to extract regex pattern - pattern - from 'str'
- idx = which group to extract. if 1 ==> we extract the first matching group

In [153]:
# Single-row frame with a mixed digits/letters string for the regex examples
df = spark.createDataFrame(
    data=[('11ss1 ab',)],
    schema=['str'],
)
df.show()

+--------+
|     str|
+--------+
|11ss1 ab|
+--------+



In [163]:
# Task 1: retrieve the first run of digits.
# The pattern is a raw string so the regex backslash is passed through literally;
# '\d' inside a normal string literal is an invalid escape sequence in Python.
df.withColumn('extract_digits', F.regexp_extract(df['str'], r'(\d+)', 1)).show()

+--------+--------------+
|     str|extract_digits|
+--------+--------------+
|11ss1 ab|            11|
+--------+--------------+



In [167]:
# Task 2: match a run of digits followed by word characters and return one capture group.
# idx=1 selects the first group, (\d+), so the result is the leading digits '11';
# idx=2 would select the second group, (\w+), instead.
# NOTE: despite the column name, the value extracted here is the digit group.
# Raw string: '\d' / '\w' in a normal string literal are invalid escape sequences.
df.withColumn('extract_letters', F.regexp_extract(df['str'], r'(\d+)(\w+)', 1)).show()

+--------+---------------+
|     str|extract_letters|
+--------+---------------+
|11ss1 ab|             11|
+--------+---------------+



### 3.2 regexp_replace(str, pattern, replacement)
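- Replaces every substring of the column value that matches the regex `pattern` with `replacement`
- If nothing matches, the original string is returned unchanged (it is `regexp_extract` that returns an empty string when there is no match)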

In [170]:
# Task 1: replace every single digit with 'XX'.
# Raw string: '\d' in a normal string literal is an invalid escape sequence in Python.
df.withColumn('regexp_replace', F.regexp_replace(df['str'], r'\d', 'XX')).show()

+--------+--------------+
|     str|regexp_replace|
+--------+--------------+
|11ss1 ab|   XXXXssXX ab|
+--------+--------------+



In [175]:
# Task 2: replace every pair of consecutive digits with 'XX' (a lone digit is left alone).
# Raw string: '\d' in a normal string literal is an invalid escape sequence in Python.
df.withColumn('regexp_replace', F.regexp_replace(df['str'], r'\d\d', 'XX')).show()

+--------+--------------+
|     str|regexp_replace|
+--------+--------------+
|11ss1 ab|      XXss1 ab|
+--------+--------------+



In [178]:
# Two sample addresses: one ends in 'Rd', the other in 'St'
addr = [
    (1, '2625 Oxford University Rd', 'Phoenix'),
    (2, '1234 Thomas St', 'Glendale'),
]
df = spark.createDataFrame(addr, schema=['id', 'addr', 'city'])
df.show(truncate=False)

+---+-------------------------+--------+
|id |addr                     |city    |
+---+-------------------------+--------+
|1  |2625 Oxford University Rd|Phoenix |
|2  |1234 Thomas St           |Glendale|
+---+-------------------------+--------+



In [188]:
# Task: expand the street-type suffix of the address field: a trailing 'Rd' becomes
# 'Road' and a trailing 'St' becomes 'Street'.
# Each branch is selected by endswith(), so the pattern is anchored with '$' to rewrite
# only that suffix; an unanchored 'St' would also rewrite the start of 'Stanford St'.
(
    df
    .withColumn(
        'new_addr',
        F.when(df['addr'].endswith('Rd'), F.regexp_replace(df['addr'], r'Rd$', 'Road'))
         .when(df['addr'].endswith('St'), F.regexp_replace(df['addr'], r'St$', 'Street'))
         .otherwise(df['addr']),
    )
    .show(truncate=False)
)

+---+-------------------------+--------+---------------------------+
|id |addr                     |city    |new_addr                   |
+---+-------------------------+--------+---------------------------+
|1  |2625 Oxford University Rd|Phoenix |2625 Oxford University Road|
|2  |1234 Thomas St           |Glendale|1234 Thomas Street         |
+---+-------------------------+--------+---------------------------+



### 3.3 rlike()
- Not a dataframe function but a column function to check if a pattern is found or not

In [187]:
# Default show() truncates long strings to 20 characters (note the '...' in addr)
df.show()

+---+--------------------+--------+
| id|                addr|    city|
+---+--------------------+--------+
|  1|2625 Oxford Unive...| Phoenix|
|  2|      1234 Thomas St|Glendale|
+---+--------------------+--------+



In [191]:
# True when addr contains at least one digit anywhere in the string.
# Raw string: '\d' in a normal string literal is an invalid escape sequence in Python.
df.select(df['addr'], df['addr'].rlike(r'(\d)')).show(truncate=False)

+-------------------------+---------------+
|addr                     |addr RLIKE (\d)|
+-------------------------+---------------+
|2625 Oxford University Rd|true           |
|1234 Thomas St           |true           |
+-------------------------+---------------+



# 4. Null Functions

In [192]:
# Sample frame mixing nulls (None) and a floating-point NaN for the null-function examples
df = spark.createDataFrame(
    [
        ('Robert', 1, None, 114.0),
        ('John', None, 2577, float('nan')),
    ],
    ('name', 'id', 'phone', 'stAdd'),
)
df.show()

+------+----+-----+-----+
|  name|  id|phone|stAdd|
+------+----+-----+-----+
|Robert|   1| null|114.0|
|  John|null| 2577|  NaN|
+------+----+-----+-----+



### 4.1 isnull(col)
- Returns true if the column is null

In [193]:
# Flag the rows whose phone value is null
df.select('phone', F.isnull('phone')).show()

+-----+---------------+
|phone|(phone IS NULL)|
+-----+---------------+
| null|           true|
| 2577|          false|
+-----+---------------+



### 4.2 isnan(col)
Returns true if the column is NaN

In [194]:
# Flag the rows whose stAdd value is NaN
df.select('stAdd', F.isnan(df['stAdd'])).show()

+-----+------------+
|stAdd|isnan(stAdd)|
+-----+------------+
|114.0|       false|
|  NaN|        true|
+-----+------------+



### `4.3 nanvl(col1, col2)`
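- Similar in spirit to the coalesce function, but it tests for NaN rather than null
- If col1 is not NaN, its value is returned; otherwise the value in col2 is returned (both inputs should be floating-point columns)
- "Not a value" here means NaN only: a null in col1 is not NaN, so it is returned as null rather than replaced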

In [195]:
# Fall back to phone where stAdd is NaN
df.select('stAdd', 'phone', F.nanvl(df['stAdd'], df['phone'])).show()

+-----+-----+-------------------+
|stAdd|phone|nanvl(stAdd, phone)|
+-----+-----+-------------------+
|114.0| null|              114.0|
|  NaN| 2577|             2577.0|
+-----+-----+-------------------+



### 4.4 coalesce(*cols)
- Returns the first column that is not null
- `weird`: NaN is treated as a value (it is not null), so coalesce will return it

In [196]:
# First non-null of (stAdd, phone); NaN counts as non-null, so it is returned as-is
df.select('stAdd', 'phone', F.coalesce(df['stAdd'], df['phone'])).show()

+-----+-----+----------------------+
|stAdd|phone|coalesce(stAdd, phone)|
+-----+-----+----------------------+
|114.0| null|                 114.0|
|  NaN| 2577|                   NaN|
+-----+-----+----------------------+



## 5. na Functions
Used to work with missing data

### 5.1 drop(how='any', thresh=None, subset=None)
- Removes rows with Null values
- Thresh: if specified, drop rows that have fewer than 'thresh' non-null values. This overrides the 'how' parameter
- Subset: optional list of column names to consider

In [207]:
# Sample frame for the na.* examples: one partially-null row (Bob) and one all-null row
data = [
    ('Alice', 80, 10),
    ('Bob', None, 5),
    ('Tom', 50, 50),
    (None, None, None),
    ('Robert', 30, 35),
]
schema = 'name string, Age int, height int'
df = spark.createDataFrame(data, schema=schema)
df.show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



In [209]:
# Drop every row that contains at least one null (how='any' is the default)
df.na.drop(how='any').show()

+------+---+------+
|  name|Age|height|
+------+---+------+
| Alice| 80|    10|
|   Tom| 50|    50|
|Robert| 30|    35|
+------+---+------+



In [210]:
# thresh=1: keep rows with at least 1 non-null value, so only the all-null row is dropped
df.na.drop(thresh=1).show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|Robert|  30|    35|
+------+----+------+



In [211]:
# thresh=3: keep rows with at least 3 non-null values, i.e. rows where all three columns are populated
df.na.drop(thresh=3).show()

+------+---+------+
|  name|Age|height|
+------+---+------+
| Alice| 80|    10|
|   Tom| 50|    50|
|Robert| 30|    35|
+------+---+------+



In [213]:
# Re-display the frame: the na.drop() calls above did not modify df itself
df.show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



In [214]:
# Drop every row (not column) that has a null in Age.
# The column is referenced with its exact name 'Age' so the call does not depend on
# Spark's default case-insensitive column resolution.
df.na.drop(subset=['Age']).show()

+------+---+------+
|  name|Age|height|
+------+---+------+
| Alice| 80|    10|
|   Tom| 50|    50|
|Robert| 30|    35|
+------+---+------+



In [215]:
# Drop every row (not column) that has a null in height
df.na.drop(subset=['height']).show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|Robert|  30|    35|
+------+----+------+



### 5.2 fill(value, subset=None)
- Replace null values
- value: value to replace null values with
- subset: optional list of column names to consider


In [216]:
# Starting point for the fill() examples: df still contains its null values
df.show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



Note:
- if you fill in na for the whole df with integers, then that will only be applied to the numeric fields
- If you fill with string, that will only be applied to the non-numeric fields

In [218]:
# Task 1: an integer fill value is applied to the numeric columns only (name stays null)
df.na.fill(value=50).show()

+------+---+------+
|  name|Age|height|
+------+---+------+
| Alice| 80|    10|
|   Bob| 50|     5|
|   Tom| 50|    50|
|  null| 50|    50|
|Robert| 30|    35|
+------+---+------+



In [219]:
# Task 2: a string fill value is applied to the string columns only (Age/height stay null)
df.na.fill(value='jayjay').show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|jayjay|null|  null|
|Robert|  30|    35|
+------+----+------+



In [220]:
# Task 3: a {column: value} mapping fills only the listed column, here 'Age'
df.na.fill(value={'Age': 50}).show()

+------+---+------+
|  name|Age|height|
+------+---+------+
| Alice| 80|    10|
|   Bob| 50|     5|
|   Tom| 50|    50|
|  null| 50|  null|
|Robert| 30|    35|
+------+---+------+



### 5.3 replace(to_replace, value=<no value>, subset=None)
- can be used to replace null values, but has a wider use case

In [221]:
# Starting point for the replace() examples: the unmodified frame
df.show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  50|    50|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



In [222]:
# task 1: replace 50 with 99 (applied to every column holding 50: both Age and height for Tom)
df.na.replace(50, 99).show()

+------+----+------+
|  name| Age|height|
+------+----+------+
| Alice|  80|    10|
|   Bob|null|     5|
|   Tom|  99|    99|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



In [227]:
# Task 2 (beyond nulls): within the name column, rename Alice -> Alex and Bob -> Cob
df.na.replace({'Alice': 'Alex', 'Bob': 'Cob'}, subset=['name']).show()

+------+----+------+
|  name| Age|height|
+------+----+------+
|  Alex|  80|    10|
|   Cob|null|     5|
|   Tom|  50|    50|
|  null|null|  null|
|Robert|  30|    35|
+------+----+------+



### 6. Mathematics and Statistics Functions
- Import these from pyspark.sql.functions module
- `abs(col)`
- `exp(col)`
- `factorial(col)`
- `sqrt(col)`
- `cbrt(col)`: cube root of a value
- `pow(col, n)`
- `floor(col)`
- `ceil(col)`
- `round(col, scale=0)`
- `trunc(col, format):` can apply on a date or timestamp field
- `signum():`returns 1 if n > 0, 0 if n = 0, -1 if n < 0
- `avg(col), sum(col), sumDistinct(col), mean(col), count(col), min(col), max(col)`
- `countDistinct(col)`
- `corr(col1, col2):` returns Pearson Correlation Coefficient
- `covar_pop(col1, col2):` returns population covariance
- `covar_samp(col1, col2):` Returns the sample covariance
- `var_pop(col):` Returns the population variance of the values in a group
- `var_samp(col):` Returns the unbiased variance of the values in a group
- `variance(col):` Alias for `var_samp`; returns the unbiased (sample) variance of the values in a group
- `stddev(col):` Returns the unbiased sample standard deviation of the expression in a group
- `stddev_pop(col):` Returns the population standard deviation of the expression in a group
- `stddev_samp(col):` Returns the unbiased sample standard deviation of the expression in a group

In [228]:
# Preview the first 5 rows of the orders frame used in the statistics example below
ord.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [231]:
# Number of distinct order statuses, and the Pearson correlation between order_id
# and order_customer_id rounded to 5 decimal places
ord.select(
    F.countDistinct('order_status'),
    F.round(F.corr('order_id', 'order_customer_id'), 5),
).show()

+----------------------------+-------------------------------------------+
|count(DISTINCT order_status)|round(corr(order_id, order_customer_id), 5)|
+----------------------------+-------------------------------------------+
|                           9|                                    0.00159|
+----------------------------+-------------------------------------------+

