# Analyze data with Apache Spark

Reference tutorial: https://learn.microsoft.com/en-ca/azure/synapse-analytics/spark/apache-spark-data-visualization-tutorial

In [1]:
# Run findspark.init() before importing pyspark so that the Spark installation
# is located and pyspark is put on sys.path for this kernel.
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [2]:
# Create the SparkSession against the cluster master at spark://spark:7077,
# or reuse the existing session if this kernel already has one.
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("jupyter-notebook-index2mongo")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

In [4]:
# Echo the session object so the notebook renders its summary.
spark

In [3]:
# Turn off the vectorized Parquet reader for this session.
# NOTE(review): presumably a workaround for a read error on these files —
# confirm and record the actual reason here.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

# Spark SQL

In [5]:
# Load every file matching the glob into a single Spark DataFrame.
#
# The previous .option("overwriteSchema", "true") was removed: it is a Delta
# Lake *write* option and has no effect on a plain Parquet read. If the input
# files have differing schemas that need to be reconciled, the Parquet reader
# option for that is "mergeSchema".
#
# NOTE(review): Hadoop-style globbing appears to treat `**` like `*` (one
# directory level, not recursive) — confirm the data layout is exactly one
# level deep, or switch to the "recursiveFileLookup" reader option.
sdf = spark.read.parquet('/home/jovyan/work/data/**/*.parquet.zst')

In [6]:
# Confirm that the reader returned a Spark SQL DataFrame.
type(sdf)

pyspark.sql.dataframe.DataFrame

In [16]:
# Last 10 rows, brought back to the driver as a list of Row objects.
sdf.tail(10)

[Row(language='FR', country='FR', job_id='379974/15/185782', productCategoryAlt=None, productAltEAN=None, productPriceBase=None, productBrand='PTIT BASILE', productCategory="['Tout pour bébé', 'Puériculture', 'Chambre bébé']", productDate='2022-12-29T00:25:33Z', crawlID='417ca1d7-3878-437d-a6a9-13d82f4ced78', currentURL='https://www.auchan.fr/bebe/puericulture/chambre-bebe/ca-7135359?page=71', productLinkDetail='https://www.auchan.fr/ptit-basile-parure-housse-de-couette-bio-brodee-taie-d-oreiller-nuages/pr-C901112', productEAN='3760240961222', productID='108146', productAltId=None, productImage='https://media.auchan.fr/148fd2a8-da43-4456-ac8b-047fe407c853_460x460/B2CD/', productIsAvailable=1, productIsBio=0, productIsFresh=0, productIsFreezer=0, productIsNew=0, promotion=0, isSponsored=None, productLotPriceBase=None, productLotPrice=None, productName="Parure housse de couette Bio brodée + Taie d'oreiller Nuages", productNutriscore=None, other='{"is_heard_on_radio": false}', productPack

# Pandas API on Spark

In [8]:
import pyspark.pandas as ps

In [13]:
# Read the same Parquet files through the pandas API on Spark, using the
# productID column as the DataFrame index.
psdf = ps.read_parquet(
    '/home/jovyan/work/data/**/*.parquet.zst',
    index_col="productID",
)

In [14]:
# Confirm that this is a pandas-on-Spark DataFrame, not a Spark SQL one.
type(psdf)

pyspark.pandas.frame.DataFrame

In [15]:
# Last 10 rows, rendered as a table indexed by productID.
psdf.tail(10)

Unnamed: 0_level_0,language,country,job_id,productCategoryAlt,productAltEAN,productPriceBase,productBrand,productCategory,productDate,crawlID,currentURL,productLinkDetail,productEAN,productAltId,productImage,productIsAvailable,productIsBio,productIsFresh,productIsFreezer,productIsNew,promotion,isSponsored,productLotPriceBase,productLotPrice,productName,productNutriscore,other,productPackaging,typeCrawler,productPosition,productPricePrevious,productPrice,productPromotionText,productQuantityRating,productRating,enseigneID,enseigneName,productService,shopID,productStock,productUnit,productValueUnit,variant_master,variant_position,zip_code
productID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1
108146,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:33Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-parure-houss...,3760240961222.0,,https://media.auchan.fr/148fd2a8-da43-4456-ac8...,1,0,0,0,0,0,,,,Parure housse de couette Bio brodée + Taie d'o...,,"{""is_heard_on_radio"": false}",,WEB,41,,46.599998,,,,1,Auchan Drive,,131774,,,,,,
CA1306558,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:34Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-drap-plat-be...,,,https://media.auchan.fr/1dc5de90-6051-4f0c-afb...,1,0,0,0,0,0,,,,Drap plat bébé coton Bio 118x180 cm imprimé,,"{""is_heard_on_radio"": false}",2 coloris,WEB,42,,26.0,,,,1,Auchan Drive,,131774,,,,,,
108703,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:42Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-gigoteuse-et...,3760240962687.0,,https://media.auchan.fr/469fa5b2-ba27-47c8-b66...,1,0,0,0,0,0,,,,Gigoteuse été Bio 6-24 mois Little sweet dreams,,"{""is_heard_on_radio"": false}",,WEB,48,,36.900002,,,,1,Auchan Drive,,131774,,,,,,
269862,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:42Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-gigoteuse-et...,3760240962199.0,,https://media.auchan.fr/8768feae-8033-45a2-acc...,1,0,0,0,0,0,,,,Gigoteuse été en coton bio 6-36 mois Pluie d'é...,,"{""is_heard_on_radio"": false}",,WEB,49,,38.900002,,,,1,Auchan Drive,,131774,,,,,,
727591,FR,FR,379974/15/185782,,,,Bébé Provence,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:46Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/bebe-provence-matelas-be...,,,https://media.auchan.fr/0ae5f99d-8843-4ad5-875...,1,0,0,0,0,1,,,,Matelas bébé mousse HD 60 x 120 cm BELLE NUIT,,"{""is_heard_on_radio"": false}",,WEB,51,,109.0,25% cagnotté,,,1,Auchan Drive,,131774,,,,,,
221894,FR,FR,379974/15/185782,,,,Ptit Albatros,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:47Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-albatros-p-tit-alba...,3454980150250.0,,https://media.auchan.fr/433deab3-860e-4e78-85f...,1,0,0,0,0,0,,,,P'tit Albatros - Plan Incliné Bébé 60x35 cm - ...,,"{""is_heard_on_radio"": false}",,WEB,50,,19.9,,,,1,Auchan Drive,,131774,,,,,,
CA1306562,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:48Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-drap-bebe-co...,,,https://media.auchan.fr/64b2d50e-bb2f-4a5a-ab2...,1,0,0,0,0,0,,,,Drap bébé coton Bio 118x180 cm,,"{""is_heard_on_radio"": false}",17 coloris,WEB,52,,25.9,,,,1,Auchan Drive,,131774,,,,,,
CA985272,FR,FR,379974/15/185782,,,,BLANREVE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:49Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/blanreve-couette-bebe-co...,,,https://media.auchan.fr/5a29a493-5aac-46f5-8ea...,1,0,0,0,0,1,,,,Couette bébé coton tempérée anti-acariens BAMBIN,,"{""is_heard_on_radio"": false}",1 coloris,WEB,53,34.990002,24.49,-30 %,,,1,Auchan Drive,,131774,,,,,,
CA1129202,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:51Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-gigoteuse-et...,,,https://media.auchan.fr/4bbea478-5a65-44d3-b75...,1,0,0,0,0,0,,,,Gigoteuse été jersey bio imprimé mini stars,,"{""is_heard_on_radio"": false}",1 coloris,WEB,54,,29.9,,,,1,Auchan Drive,,131774,,,,,,
CA1306628,FR,FR,379974/15/185782,,,,PTIT BASILE,"['Tout pour bébé', 'Puériculture', 'Chambre bé...",2022-12-29T00:25:55Z,417ca1d7-3878-437d-a6a9-13d82f4ced78,https://www.auchan.fr/bebe/puericulture/chambr...,https://www.auchan.fr/ptit-basile-lot-x3-draps...,,,https://media.auchan.fr/25a25b98-be4e-4974-98c...,1,0,0,0,0,0,,,,Lot x3 draps housse jersey lit bébé 100% coton...,,"{""is_heard_on_radio"": false}",4 coloris,WEB,55,,29.6,,,,1,Auchan Drive,,131774,,,,,,
