In [7]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, col, desc
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# Create the Spark session. The MongoDB connector is requested through
# spark.jars.packages so it is available to the reader below.
spark = (
    SparkSession.builder
    .appName("Read MongoDB Data")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .master("local[*]")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

# Connection settings. The connection string carries a username and password,
# so it is read from the environment instead of being stored in the notebook
# (where it would be leaked to anyone who can read the file or its history).
mongo_uri = os.environ.get("MONGO_URI")
if not mongo_uri:
    raise RuntimeError(
        "MONGO_URI is not set; export the MongoDB connection string "
        "(e.g. mongodb+srv://<user>:<password>@<host>) before running this cell."
    )
database_name = "PrepMong"
collection_name = "finaldata"

# Explicit schema for the collection: every field is read as a string.
# NOTE(review): several of these columns (week, prices, num_orders) look
# numeric -- confirm whether they should be cast before numeric analysis.
column_names = [
    "id",
    "week",
    "center_id",
    "city_code",
    "region_code",
    "center_type",
    "op_area",
    "meal_id",
    "category",
    "cuisine",
    "checkout_price",
    "base_price",
    "emailer_for_promotion",
    "homepage_featured",
    "num_orders",
]
dataSchema = StructType([StructField(name, StringType()) for name in column_names])

# Load the collection into a DataFrame using the schema above.
df_mongo = (
    spark.read.format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", mongo_uri)
    .option("database", database_name)
    .option("collection", collection_name)
    .schema(dataSchema)
    .load()
)

# Preview the first rows of the loaded data.
df_mongo.show()

# Summary statistics for every column.
df_mongo.describe().show()

# Row count per week, largest first.
df_mongo.groupBy('week').count().sort(desc('count')).show()


                                                                                

+-------+----+---------+---------+-----------+-----------+-------+-------+---------+-------+--------------+----------+---------------------+-----------------+----------+
|     id|week|center_id|city_code|region_code|center_type|op_area|meal_id| category|cuisine|checkout_price|base_price|emailer_for_promotion|homepage_featured|num_orders|
+-------+----+---------+---------+-----------+-----------+-------+-------+---------+-------+--------------+----------+---------------------+-----------------+----------+
|1379560|   1|       55|      647|         56|     TYPE_C|    2.0|   1885|Beverages|   Thai|        136.83|    152.29|                    0|                0|       177|
|1018704|   2|       55|      647|         56|     TYPE_C|    2.0|   1885|Beverages|   Thai|        135.83|    152.29|                    0|                0|       323|
|1196273|   3|       55|      647|         56|     TYPE_C|    2.0|   1885|Beverages|   Thai|        132.92|    133.92|                    0|          

                                                                                

+-------+------------------+-----------------+-----------------+-----------------+------------------+-----------+-----------------+------------------+---------+-----------+------------------+------------------+---------------------+-------------------+-----------------+
|summary|                id|             week|        center_id|        city_code|       region_code|center_type|          op_area|           meal_id| category|    cuisine|    checkout_price|        base_price|emailer_for_promotion|  homepage_featured|       num_orders|
+-------+------------------+-----------------+-----------------+-----------------+------------------+-----------+-----------------+------------------+---------+-----------+------------------+------------------+---------------------+-------------------+-----------------+
|  count|            456548|           456548|           456548|           456548|            456548|     456548|           456548|            456548|   456548|     456548|            456

                                                                                

+----+-----+
|week|count|
+----+-----+
| 122| 3359|
| 105| 3348|
| 106| 3347|
| 140| 3332|
| 123| 3331|
| 134| 3330|
| 133| 3324|
| 113| 3312|
| 100| 3309|
| 143| 3305|
|  94| 3303|
| 144| 3302|
| 114| 3300|
| 109| 3299|
| 121| 3298|
| 110| 3293|
| 131| 3293|
| 129| 3291|
| 117| 3290|
|  91| 3289|
+----+-----+
only showing top 20 rows



In [13]:
# Destination directory for the CSV export (Spark writes part files inside it).
csv_path = "final_data/food_data"

# Collapse to a single partition so the export is one part file, then write
# it with a header row, replacing any previous export at the same path.
(
    df_mongo.coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(csv_path)
)

# Shut the Spark session down now that the export is finished.
spark.stop()

                                                                                