# SAMPLE()

In [2]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Create the Spark session for this notebook, or reuse one that is already
# running; the app name is only a label for this example.
spark = SparkSession.builder.appName("example-sample-resample").getOrCreate()

## Using fraction to get a random sample in PySpark

Pass a fraction between 0 and 1 to `sample()` to get back approximately that share of the dataset's rows. For example, 0.1 returns roughly 10% of the rows. However, the result is not guaranteed to contain exactly 10% of the records.

In [3]:
# Build a 100-row DataFrame and request a 6% sample. An exact 6% would be
# 6 rows, but sample() only approximates the fraction, so the number of rows
# returned can differ from run to run (for example 6 on one run, 7 on another).
df = spark.range(100)
six_percent_rows = df.sample(fraction=0.06).collect()
print(six_percent_rows)

[Row(id=30), Row(id=32), Row(id=65), Row(id=76), Row(id=91), Row(id=97)]


## Using seed to reproduce the same Samples in PySpark

Every time you run `sample()` it returns a different set of records. During development and testing, however, you may need to regenerate the same sample on every run so that you can compare results with a previous run. To get a reproducible random sample, pass the same seed value on every run; change the seed value to get a different sample.

In [4]:
# Draw a 10% sample three times: twice with seed 123 (the two printed lists
# should be identical) and once with seed 456 (a different selection).
for seed in (123, 123, 456):
    print(df.sample(fraction=0.1, seed=seed).collect())


[Row(id=35), Row(id=38), Row(id=41), Row(id=45), Row(id=71), Row(id=84), Row(id=87), Row(id=99)]
[Row(id=35), Row(id=38), Row(id=41), Row(id=45), Row(id=71), Row(id=84), Row(id=87), Row(id=99)]
[Row(id=22), Row(id=33), Row(id=35), Row(id=41), Row(id=53), Row(id=80), Row(id=83), Row(id=87), Row(id=92)]


## Sample withReplacement (May contain duplicates)

Sometimes you may need a random sample in which the same row can appear more than once. Passing `True` as the first argument (`withReplacement`) allows repeated values in the result.

In [5]:
# Sample with replacement: the same id may be drawn more than once, so the
# printed list can contain duplicates.
rows_with_duplicates = df.sample(withReplacement=True, fraction=0.3, seed=123).collect()
print(rows_with_duplicates)


[Row(id=0), Row(id=5), Row(id=9), Row(id=11), Row(id=13), Row(id=16), Row(id=17), Row(id=26), Row(id=26), Row(id=37), Row(id=41), Row(id=45), Row(id=49), Row(id=50), Row(id=50), Row(id=57), Row(id=58), Row(id=58), Row(id=65), Row(id=66), Row(id=71), Row(id=74), Row(id=77), Row(id=80), Row(id=81), Row(id=82), Row(id=84), Row(id=88), Row(id=90), Row(id=91), Row(id=91), Row(id=92), Row(id=94), Row(id=96)]


In [6]:
# Same fraction and seed, but without the withReplacement flag: each id can
# appear at most once in the printed list.
unique_rows = df.sample(fraction=0.3, seed=123).collect()
print(unique_rows)


[Row(id=0), Row(id=4), Row(id=12), Row(id=15), Row(id=19), Row(id=21), Row(id=23), Row(id=24), Row(id=25), Row(id=28), Row(id=29), Row(id=34), Row(id=35), Row(id=36), Row(id=38), Row(id=41), Row(id=45), Row(id=47), Row(id=50), Row(id=52), Row(id=59), Row(id=63), Row(id=65), Row(id=71), Row(id=82), Row(id=84), Row(id=87), Row(id=94), Row(id=99)]


## Stratified sampling in PySpark

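You can do stratified sampling without replacement in PySpark with the `sampleBy()` method. It takes a column and a dictionary that maps each stratum (a value of that column) to its sampling fraction, and returns the sampled DataFrame. If a stratum is not listed in the dictionary, its fraction is treated as zero.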

In [7]:
# Derive a "key" column with values 0, 1 and 2 (id modulo 3) to act as strata.
key_column = (df.id % 3).alias("key")
df2 = df.select(key_column)

# Sample roughly 10% of stratum 0 and 20% of stratum 1, with seed 0.
# Stratum 2 has no entry in the mapping, so none of its rows are expected
# in the result.
stratum_fractions = {0: 0.1, 1: 0.2}
print(df2.sampleBy("key", fractions=stratum_fractions, seed=0).collect())


[Row(key=0), Row(key=0), Row(key=1), Row(key=1), Row(key=0), Row(key=1), Row(key=0), Row(key=1), Row(key=0), Row(key=0), Row(key=1), Row(key=1), Row(key=0)]


Which of the following code blocks returns about 150 randomly selected rows from the 1000-row DataFrame transactionsDf, assuming that any row can appear more than once in the returned DataFrame?

- `transactionsDf.resample(0.15, False, 3142)`
- `transactionsDf.sample(0.15, False, 3142)`
- `transactionsDf.sample(0.15)`
- `transactionsDf.sample(0.85, 8429)`
- `transactionsDf.sample(True, 0.15, 8261)`

In [21]:
# Stand-in for the 1000-row DataFrame from the quiz question: ids 0..999.
transactionsDf = spark.range(0, 1000)

In [22]:
# Quiz option `sample(0.15, False, 3142)`, reproduced verbatim.
# NOTE(review): the with-replacement call elsewhere in this notebook uses the
# positional order (withReplacement, fraction, seed); here the fraction comes
# first. PySpark appears to accept this form and sample WITHOUT replacement,
# presumably taking `False` as the seed and ignoring 3142 — confirm against
# the DataFrame.sample documentation.
option_rows = transactionsDf.sample(0.15, False, 3142)
option_rows.count()

166

In [23]:
# Quiz option `sample(0.15)`: about 15% of the rows, no seed, so the count
# changes from run to run. No withReplacement flag is passed.
option_rows = transactionsDf.sample(0.15)
option_rows.count()

154

In [24]:
# Quiz option `sample(0.85, 8429)`: fraction 0.85 with seed 8429, which gives
# roughly 850 rows rather than the ~150 the question asks for.
option_rows = transactionsDf.sample(0.85, 8429)
option_rows.count()

859

In [25]:
# Quiz option `sample(True, 0.15, 8261)`: the leading True enables sampling
# with replacement, so a row may appear more than once, and the 0.15 fraction
# gives about 150 rows. This is the option that matches the question.
option_rows = transactionsDf.sample(True, 0.15, 8261)
option_rows.count()

150