# PrefixSpan Example

### Import required libraries and create Spark session

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.fpm import PrefixSpan

# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name "PrefixSpanExample".
spark = (
    SparkSession.builder
    .appName("PrefixSpanExample")
    .getOrCreate()
)

### Create synthetic data and display the DataFrame

In [8]:
# Build one Row per input sequence. Each sequence is written as a list of
# itemsets, and each itemset is a list of item names.
data = [
    Row(sequence=itemsets)
    for itemsets in (
        [["eggs", "bread"], ["apple"]],
        [["banana"], ["cheese", "chocolate"], ["eggs", "bread"]],
        [["eggs", "bread"], ["orange"]],
        [["milk"]],
    )
]

# Turn the rows into a single-column DataFrame and print it without
# truncating the nested arrays.
df = spark.createDataFrame(data)
df.show(truncate=False)

+----------------------------------------------+
|sequence                                      |
+----------------------------------------------+
|[[eggs, bread], [apple]]                      |
|[[banana], [cheese, chocolate], [eggs, bread]]|
|[[eggs, bread], [orange]]                     |
|[[milk]]                                      |
+----------------------------------------------+



### Set parameters for PrefixSpan
### Find frequent sequential patterns

In [9]:
# Configure the miner: minimum support 0.5, maximum pattern length 5,
# and a local projected database size limit of 32,000,000.
prefixSpan = PrefixSpan(
    minSupport=0.5,
    maxPatternLength=5,
    maxLocalProjDBSize=32000000,
)

# Mine the frequent sequential patterns from df and order them by the
# "sequence" column so the printed table is stable.
result = (
    prefixSpan
    .findFrequentSequentialPatterns(df)
    .sort("sequence")
)
result.show(truncate=False)

+---------------+----+
|sequence       |freq|
+---------------+----+
|[[bread]]      |3   |
|[[bread, eggs]]|3   |
|[[eggs]]       |3   |
+---------------+----+

