In [1]:
# findspark.init() is called before any pyspark import below — presumably so
# that the Spark installation's pyspark package is importable from this kernel
# (TODO confirm for this environment).
import findspark
findspark.init()

from pyspark import SparkConf
from pyspark.sql import SparkSession
# NOTE(review): the two wildcard imports below hide where names come from, and
# pyspark.sql.functions exports names such as sum/min/max/abs/round that shadow
# the Python builtins. Consider `from pyspark.sql import functions as F` once
# it is confirmed that no cell relies on the bare names.
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [2]:
# Build (or reuse) the SparkSession shared by every cell below.
#
# The MongoDB settings match the 3.x connector pulled in via spark.jars.packages:
# reads and writes that do not pass their own options use the input/output URIs
# configured here (database "test", collection "myCollection").
spark = (
    SparkSession.builder
    .master("spark://spark:7077")
    .appName("jupyter-notebook-analytics")
    .config("spark.driver.memory", "512m")
    .config("spark.mongodb.input.uri", "mongodb://mongodb:27017/test.myCollection")
    .config("spark.mongodb.output.uri", "mongodb://mongodb:27017/test.myCollection")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.2")
    .getOrCreate()
)

# Reference only — equivalent settings for the 10.x connector, which the
# commented-out calls elsewhere in this notebook address as format "mongodb":
#     .config("spark.mongodb.write.connection.uri", "mongodb://mongodb:27017/test.myCollection")
#     .config("spark.mongodb.read.connection.uri", "mongodb://mongodb:27017/test.myCollection")
#     .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector:10.0.5")

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Grab the SparkContext behind the session; it is used below to build an RDD.
sc = spark.sparkContext
# Bare expression so the notebook renders the context summary.
sc

In [4]:
# Sum of the whole numbers 0 through 1000 inclusive (range(1000+1) stops at 1000).
# Closed form: 1000 * 1001 / 2 = 500500.
# NOTE(review): the RDD import below is not referenced anywhere in this cell.
from pyspark.rdd import RDD

rdd = sc.parallelize(range(1000+1))
rdd.sum()

500500

# Mongo Spark Connector
Example


In [5]:
# Sample data: one (name, age) tuple per row.
people_rows = [
    ("Bilbo Baggins", 50),
    ("Gandalf", 1000),
    ("Thorin", 195),
    ("Balin", 178),
    ("Kili", 77),
    ("Dwalin", 169),
    ("Oin", 167),
    ("Gloin", 158),
    ("Fili", 82),
    ("Bombur", 50),
]
people = spark.createDataFrame(people_rows, schema="name string, age int")

# Write the rows through the "mongo" data source. No database/collection option
# is given here, so the target comes from the session configuration.
# NOTE(review): the save mode is "append", so re-running this cell presumably
# inserts the same ten rows again — verify before relying on row counts.
mongo_writer = people.write.format("mongo").mode("append")
mongo_writer.save()
# Same write with the "mongodb" format name (10.x connector):
# people.write.format("mongodb").mode("append").save()


+-------------+----+
|         name| age|
+-------------+----+
|Bilbo Baggins|  50|
|      Gandalf|1000|
|       Thorin| 195|
|        Balin| 178|
|         Kili|  77|
|       Dwalin| 169|
|          Oin| 167|
|        Gloin| 158|
|         Fili|  82|
|       Bombur|  50|
+-------------+----+



In [None]:
# To write to a MongoDB collection other than the one configured on the
# session, chain .option() calls onto the DataFrame's .write builder.
# For example, to write to the "contacts" collection in the "people" database
# with the "mongo" format used throughout this notebook:
#   people.write.format("mongo").mode("append").option("database", "people").option("collection", "contacts").save()
# The same call with the "mongodb" format name (labelled OLD by the original author):
#   people.write.format("mongodb").mode("append").option("database","people").option("collection", "contacts").save()

# Display the rows of the in-memory `people` DataFrame.
people.show()

In [7]:
# Print the column names and types of the in-memory `people` DataFrame.
people.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



In [8]:
# Read the collection back through the "mongo" data source. No "uri" option is
# passed, so the source location comes from the session configuration.
df = (
    spark.read
    .format("mongo")
    .load()
)
df.printSchema()

# Alternatives kept for reference:
#   "mongodb" format name (10.x connector):
#     df = spark.read.format("mongodb").load()
#   explicit per-read URI pointing at another database/collection:
#     df = spark.read.format("mongo").option("uri", "mongodb://127.0.0.1/people.contacts").load()

root
 |-- _id: struct (nullable = true)
 |    |-- oid: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [9]:
# Aggregation pipeline handed to the connector through its "pipeline" option:
# a single $match stage that keeps only documents whose name is 'Gloin'.
pipeline = "{'$match': {'name': 'Gloin'}}"

# Note: this rebinds `df`, replacing the unfiltered DataFrame from the previous cell.
df = (
    spark.read
    .format("mongo")
    .option("pipeline", pipeline)
    .load()
)
df.show()



+--------------------+---+-----+
|                 _id|age| name|
+--------------------+---+-----+
|{63c6cc2814c2b47f...|158|Gloin|
+--------------------+---+-----+



In [10]:
# Stop the SparkSession created at the top of the notebook now that the example is done.
spark.stop()