In [1]:
# findspark.init() is called before the pyspark imports below so that they can
# be resolved; keep this ordering.
import findspark
findspark.init()

from pyspark import SparkConf
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports pull many names into the notebook
# namespace; pyspark.sql.functions exports names such as `sum`, `min`, `max`
# and `round` that shadow the Python builtins. Prefer explicit imports.
from pyspark.sql.functions import *
from pyspark.sql.types import *

import os
# Exported before any Spark session is created.
# NOTE(review): presumably silences PyArrow's timezone warning — nothing visible
# in this notebook uses pandas/Arrow, so confirm it is still needed.
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [2]:
# Spark session & context
#
# The MongoDB input and output URI settings both point at the same
# database.collection, so the value is defined once and reused.
MONGO_URI = "mongodb://mongodb:27017/test.myCollection"

session_builder = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("jupyter-notebook-index2mongo")
    .config("spark.driver.memory", "512m")
    .config("spark.mongodb.input.uri", MONGO_URI)
    .config("spark.mongodb.output.uri", MONGO_URI)
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.2")
)
spark = session_builder.getOrCreate()

# Inactive alternative kept for reference (connector 10.0.5 settings):
#   spark.mongodb.write.connection.uri / spark.mongodb.read.connection.uri
#   = "mongodb://mongodb:27017/test.myCollection"
#   spark.jars.packages = org.mongodb.spark:mongo-spark-connector:10.0.5

# Display the session summary as the cell output.
spark

# Write data to MongoDB

In [3]:
# Turn off Spark's vectorized Parquet reader for this session.
# NOTE(review): presumably a workaround for column types that differ between
# the Parquet files loaded below — confirm it is still required.
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

In [4]:
# Load the zstd-compressed Parquet files found in the sub-folders of the data
# directory into a single DataFrame.
#
# The former `.option("overwriteSchema", "true")` was dropped: it is a Delta Lake
# *write* option, not a Parquet reader option, so it had no effect on this read.
#
# NOTE(review): Hadoop path globbing has no recursive `**`; this pattern
# presumably matches exactly one directory level (e.g. data/21_12_2022/) — confirm.
#
# To debug against a single file, point the path at e.g.
# '/home/jovyan/work/data/21_12_2022/2022-12-21.1.131762.parquet.zst'.
df = spark.read.parquet('/home/jovyan/work/data/**/*.parquet.zst')

# Show the column names and types as the cell output.
df

DataFrame[language: string, country: string, job_id: string, productCategoryAlt: double, productAltEAN: double, productPriceBase: float, productBrand: string, productCategory: string, productDate: string, crawlID: string, currentURL: string, productLinkDetail: string, productEAN: string, productID: string, productAltId: string, productImage: string, productIsAvailable: bigint, productIsBio: bigint, productIsFresh: bigint, productIsFreezer: bigint, productIsNew: bigint, promotion: bigint, isSponsored: double, productLotPriceBase: float, productLotPrice: float, productName: string, productNutriscore: double, other: string, productPackaging: string, typeCrawler: string, productPosition: bigint, productPricePrevious: float, productPrice: float, productPromotionText: string, productQuantityRating: bigint, productRating: string, enseigneID: string, enseigneName: string, productService: double, shopID: string, productStock: bigint, productUnit: string, productValueUnit: float, variant_master:

In [5]:
from pyspark.sql.types import StringType

# Replace zip_code with a string-typed version of itself; every other column
# is carried over unchanged.
zip_code_as_text = df["zip_code"].cast(StringType())
d2 = df.withColumn("zip_code", zip_code_as_text)

In [6]:
# Print the schema of the frame produced by the zip_code cast to check the
# resulting column types.
d2.printSchema()

root
 |-- language: string (nullable = true)
 |-- country: string (nullable = true)
 |-- job_id: string (nullable = true)
 |-- productCategoryAlt: double (nullable = true)
 |-- productAltEAN: double (nullable = true)
 |-- productPriceBase: float (nullable = true)
 |-- productBrand: string (nullable = true)
 |-- productCategory: string (nullable = true)
 |-- productDate: string (nullable = true)
 |-- crawlID: string (nullable = true)
 |-- currentURL: string (nullable = true)
 |-- productLinkDetail: string (nullable = true)
 |-- productEAN: string (nullable = true)
 |-- productID: string (nullable = true)
 |-- productAltId: string (nullable = true)
 |-- productImage: string (nullable = true)
 |-- productIsAvailable: long (nullable = true)
 |-- productIsBio: long (nullable = true)
 |-- productIsFresh: long (nullable = true)
 |-- productIsFreezer: long (nullable = true)
 |-- productIsNew: long (nullable = true)
 |-- promotion: long (nullable = true)
 |-- isSponsored: double (nullable = tru

In [7]:
# Persist the frame through the "mongo" data source. No URI is passed here, so
# the target is the collection configured on the session via
# `spark.mongodb.output.uri`. mode("overwrite") replaces any data already in
# that collection.
#
# The former `.option("overwriteSchema", "true")` was dropped: it is a Delta Lake
# option that the MongoDB connector does not recognise, so it had no effect.
d2.write.format("mongo").mode("overwrite").save()

In [8]:
# Print the schema again after the MongoDB write. d2 has not been reassigned
# since the previous printSchema() call, so the output is expected to match.
d2.printSchema()

root
 |-- language: string (nullable = true)
 |-- country: string (nullable = true)
 |-- job_id: string (nullable = true)
 |-- productCategoryAlt: double (nullable = true)
 |-- productAltEAN: double (nullable = true)
 |-- productPriceBase: float (nullable = true)
 |-- productBrand: string (nullable = true)
 |-- productCategory: string (nullable = true)
 |-- productDate: string (nullable = true)
 |-- crawlID: string (nullable = true)
 |-- currentURL: string (nullable = true)
 |-- productLinkDetail: string (nullable = true)
 |-- productEAN: string (nullable = true)
 |-- productID: string (nullable = true)
 |-- productAltId: string (nullable = true)
 |-- productImage: string (nullable = true)
 |-- productIsAvailable: long (nullable = true)
 |-- productIsBio: long (nullable = true)
 |-- productIsFresh: long (nullable = true)
 |-- productIsFreezer: long (nullable = true)
 |-- productIsNew: long (nullable = true)
 |-- promotion: long (nullable = true)
 |-- isSponsored: double (nullable = tru

# Read data from MongoDB

In [27]:
# Read the collection back through the "mongo" data source (no URI is passed
# here, so the session-level MongoDB settings apply) and inspect the schema
# Spark derives for it.
# NOTE: this rebinds `df`, which previously held the Parquet data.
mongo_reader = spark.read.format("mongo")
df = mongo_reader.load()
df.printSchema()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- country: string (nullable = true)
 |-- crawlID: string (nullable = true)
 |-- currentURL: string (nullable = true)
 |-- enseigneID: string (nullable = true)
 |-- enseigneName: string (nullable = true)
 |-- job_id: string (nullable = true)
 |-- language: string (nullable = true)
 |-- other: string (nullable = true)
 |-- productBrand: string (nullable = true)
 |-- productCategory: string (nullable = true)
 |-- productDate: string (nullable = true)
 |-- productEAN: string (nullable = true)
 |-- productID: string (nullable = true)
 |-- productImage: string (nullable = true)
 |-- productIsAvailable: long (nullable = true)
 |-- productIsBio: long (nullable = true)
 |-- productIsFreezer: long (nullable = true)
 |-- productIsFresh: long (nullable = true)
 |-- productIsNew: long (nullable = true)
 |-- productLinkDetail: string (nullable = true)
 |-- productName: string (nullable = true)
 |-- productPackaging: s

In [29]:
# Preview five values of the productName column from the frame read back
# from MongoDB.
product_names = df.select("productName")
product_names.show(n=5)

+--------------------+
|         productName|
+--------------------+
|Chauffeuse 1 plac...|
|Pull de noël "Sno...|
|Guirlande intérie...|
|Scotch whisky Spe...|
|Scotch whisky éco...|
+--------------------+
only showing top 5 rows

