In [5]:
import os

from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one
# under this notebook's application name.
spark = (
    SparkSession.builder
    .appName('simple aggregation')
    .getOrCreate()
)

In [6]:
# Location of the Walmart CSV data. Defaults to the original local folder;
# set the WALMART_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
walmart_path = os.environ.get(
    "WALMART_DATA_PATH",
    r"C:\Users\TARUN\Desktop\Pyspark\Trendytech\week8\walmart",
)

# Read the CSV treating the first line as column names and letting Spark
# infer each column's type from the data.
walmart_df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(walmart_path)
)

In [7]:
# Print the first 5 rows as a quick look at the loaded data.
walmart_df.show(5)

+---------+---------+--------+-----------+---------+----------+-------+
|InvoiceNo|StockCode|Quantity|InvoiceDate|UnitPrice|CustomerId|Country|
+---------+---------+--------+-----------+---------+----------+-------+
|  INV0006|     A002|      15| 2023-01-02|    95.43|      1003| Canada|
|  INV0007|     A005|       4| 2023-11-17|    30.55|      1005| Canada|
|  INV0007|     A002|       5| 2023-07-11|    76.49|      1005| Canada|
|  INV0008|     A005|      11| 2023-09-08|     2.93|      1003|Germany|
|  INV0009|     A001|       8| 2023-10-18|    94.75|      1001| France|
+---------+---------+--------+-----------+---------+----------+-------+
only showing top 5 rows



In [8]:
# Print each column's name, type and nullability for the loaded DataFrame.
walmart_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: date (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Country: string (nullable = true)



In [10]:
# Register the DataFrame as the temporary view "walmart" so the spark.sql(...)
# cells can query it by name.
walmart_df.createOrReplaceTempView("walmart")

In [11]:
# Convert the Spark DataFrame to a pandas DataFrame for the pandas comparisons.
# toPandas() collects every row onto the driver; that is fine here because the
# data set is tiny (the count cells report 29 rows).
walmart_pdf = walmart_df.toPandas()

In [12]:
# Show the first rows of the pandas copy (head() displays 5 rows by default).
walmart_pdf.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerId,Country
0,INV0006,A002,15,2023-01-02,95.43,1003,Canada
1,INV0007,A005,4,2023-11-17,30.55,1005,Canada
2,INV0007,A002,5,2023-07-11,76.49,1005,Canada
3,INV0008,A005,11,2023-09-08,2.93,1003,Germany
4,INV0009,A001,8,2023-10-18,94.75,1001,France


###### Total row count, distinct customers, total quantity, and average unit price

In [15]:
from pyspark.sql.functions import *

In [16]:
# Row count via the DataFrame API: count("*") from pyspark.sql.functions,
# aliased to row_count.
walmart_df.select(count("*").alias("row_count")).show()

+---------+
|row_count|
+---------+
|       29|
+---------+



In [18]:
# Same row count, with the aggregate written as a SQL expression string
# wrapped in expr().
walmart_df.select(expr("count(*) as row_count")).show()

+---------+
|row_count|
+---------+
|       29|
+---------+



In [19]:
# Same row count via selectExpr(), which takes the SQL expression string
# directly without an expr() wrapper.
walmart_df.selectExpr("count(*) as row_count").show()

+---------+
|row_count|
+---------+
|       29|
+---------+



In [20]:
# Same row count as a SQL query against the "walmart" temp view. No alias is
# given, so the result column is displayed as count(1) rather than row_count.
spark.sql("select count(*) from walmart").show()

+--------+
|count(1)|
+--------+
|      29|
+--------+



In [21]:
# pandas: DataFrame.count() returns the number of non-null values for each
# column (a Series), not a single row count.
walmart_pdf.count()

InvoiceNo      29
StockCode      29
Quantity       29
InvoiceDate    29
UnitPrice      29
CustomerId     29
Country        29
dtype: int64

In [25]:
# All four summary metrics in a single one-row result, using the DataFrame API.
# count/sum/round/avg here are the pyspark.sql.functions versions brought in by
# the wildcard import, not the Python builtins.
walmart_df.select(
    count("*").alias("row_count"),
    # Named distinct_cid (not CustomerId) so the count is not mistaken for an
    # id column and matches the expr/selectExpr/SQL variants of this query.
    countDistinct("CustomerId").alias("distinct_cid"),
    sum("Quantity").alias("total_quantity"),
    round(avg("UnitPrice"), 2).alias("avg_unit_price"),
).show()

+---------+----------+--------------+--------------+
|row_count|CustomerId|total_quantity|avg_unit_price|
+---------+----------+--------------+--------------+
|       29|         5|           319|         48.53|
+---------+----------+--------------+--------------+



In [45]:
# Same four metrics, each written as a SQL expression string wrapped in expr().
# Column names are spelled exactly as in the schema (UnitPrice); a different
# case only resolves while spark.sql.caseSensitive is left at its default.
walmart_df.select(
    expr("count(*) as row_count"),
    expr("count(distinct(CustomerId)) as distinct_cid"),
    expr("sum(Quantity) as total_quantity"),
    expr("round(avg(UnitPrice),2) as avg_unit_price"),
).show()

+---------+------------+--------------+--------------+
|row_count|distinct_cid|total_quantity|avg_unit_price|
+---------+------------+--------------+--------------+
|       29|           5|           319|         48.53|
+---------+------------+--------------+--------------+



In [46]:
# Same four metrics via selectExpr(), which accepts the SQL expression strings
# directly. Column names are spelled exactly as in the schema (UnitPrice); a
# different case only resolves while spark.sql.caseSensitive is at its default.
walmart_df.selectExpr(
    "count(*) as row_count",
    "count(distinct(CustomerId)) as distinct_cid",
    "sum(Quantity) as total_quantity",
    "round(avg(UnitPrice),2) as avg_unit_price",
).show()

+---------+------------+--------------+--------------+
|row_count|distinct_cid|total_quantity|avg_unit_price|
+---------+------------+--------------+--------------+
|       29|           5|           319|         48.53|
+---------+------------+--------------+--------------+



In [44]:
# Same four metrics as a plain SQL query against the "walmart" temp view.
# A triple-quoted string is used instead of backslash continuations inside the
# literal, which break if any whitespace follows a backslash. UnitPrice is
# spelled as in the schema so the query does not rely on Spark's default
# case-insensitive column resolution.
spark.sql(
    """
    select count(*) as row_count,
           count(distinct(CustomerId)) as distinct_cid,
           sum(Quantity) as total_quantity,
           round(avg(UnitPrice),2) as avg_unit_price
    from walmart
    """
).show()

+---------+------------+--------------+--------------+
|row_count|distinct_cid|total_quantity|avg_unit_price|
+---------+------------+--------------+--------------+
|       29|           5|           319|         48.53|
+---------+------------+--------------+--------------+



In [47]:
# Peek at the first 2 rows of the pandas DataFrame before computing the pandas
# version of the summary.
walmart_pdf.head(2)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,UnitPrice,CustomerId,Country
0,INV0006,A002,15,2023-01-02,95.43,1003,Canada
1,INV0007,A005,4,2023-11-17,30.55,1005,Canada


In [49]:
# pandas equivalent of the Spark summary: the same four metrics collected into
# a dict keyed like the Spark result columns.
# The mean is left unrounded here, unlike the Spark cells that round to 2
# decimals. Note that the builtin round() is shadowed in this notebook by the
# pyspark.sql.functions wildcard import, so it cannot be used on plain floats.
results = dict(
    row_count=walmart_pdf.shape[0],
    distinct_cid=walmart_pdf["CustomerId"].nunique(),
    total_quantity=walmart_pdf["Quantity"].sum(),
    avg_unit_price=walmart_pdf["UnitPrice"].mean(),
)

print(results)

{'row_count': 29, 'distinct_cid': 5, 'total_quantity': 319, 'avg_unit_price': 48.52655172413793}


In [52]:
# Shut down the Spark session now that the notebook is finished; cells that
# use `spark` or `walmart_df` need a fresh session to run again.
spark.stop()