In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 44 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 64.4 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=d8db2d244e539f39dc613fcd0362709856a249a9daa562af5ec14a63d38c8808
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import types as t
from pyspark.sql import functions as f

In [5]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving invoices.csv to invoices.csv
User uploaded file "invoices.csv" with length 45547667 bytes


In [6]:
spark = SparkSession.builder.appName("Spark Aggregation").getOrCreate()

In [7]:
df = spark.read.format('CSV').option("header","true").option("inferSchema","true").load("invoices.csv")

In [9]:
df.printSchema()
df.show(5)

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536365|     null|WHITE HANGING HEA...|       6|01-12-2010 8.26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|01-12-2010 8.26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|01-12-2010 8.26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA

In [11]:
df1 = df.select(f.count("*").alias("count *"), f.sum("Quantity").alias("Total Quantity"),f.avg('UnitPrice').alias(" avg quantity"),f.countDistinct("InvoiceNo").alias(" CountDistinct"))

In [12]:
df1.show()

+-------+--------------+-----------------+--------------+
|count *|Total Quantity|     avg quantity| CountDistinct|
+-------+--------------+-----------------+--------------+
| 541909|       5176450|4.611113626088498|         25900|
+-------+--------------+-----------------+--------------+



In [16]:
df1_sql = df.selectExpr(" count(*) as `count*`","sum(Quantity) as Totl_Quantity"," avg(UnitPrice) as avg_quantity") 
df1_sql.show()

+------+-------------+----------------+
|count*|Totl_Quantity|    avg_quantity|
+------+-------------+----------------+
|541909|      5176450|4.61111362608295|
+------+-------------+----------------+



In [19]:
df2 = df.groupBy("Country","InvoiceNo").agg(f.sum("Quantity").alias("Total_Quantity"),f.sum(f.expr("Quantity * UnitPrice")).alias("InvoiceValue"))
df2.show()

+--------------+---------+--------------+-------------------+
|       Country|InvoiceNo|Total_Quantity|       InvoiceValue|
+--------------+---------+--------------+-------------------+
|United Kingdom|   536446|           329|             440.89|
|United Kingdom|   536508|           216|             155.52|
|United Kingdom|   537018|            -3|                0.0|
|United Kingdom|   537401|           -24|                0.0|
|United Kingdom|   537811|            74|             268.86|
|United Kingdom|  C537824|            -2|-14.899999999999999|
|United Kingdom|   538895|           370|             247.38|
|United Kingdom|   540453|           341| 302.44999999999993|
|United Kingdom|   541291|           217| 305.81000000000006|
|United Kingdom|   542551|            -1|                0.0|
|United Kingdom|   542576|            -1|                0.0|
|United Kingdom|   542628|             9|             132.35|
|United Kingdom|   542886|           199|  320.5099999999998|
|United 

In [51]:
df3 = df.withColumn("InvoiceDate",f.expr("to_date(InvoiceDate,'dd-MM-yyyy H.mm')")).withColumn("WeekNumber",f.weekofyear(f.col("InvoiceDate")))

In [65]:
InvoiceValue = f.expr(" round(sum(Quantity * UnitPrice),2) ").alias("InvoiceValue")

In [76]:
df4 = df3.where(f.year(f.col("InvoiceDate")) == '2010').groupBy("Country","WeekNumber").agg(f.sum("Quantity").alias("Total_Quantity"),f.countDistinct("InvoiceNo").alias("NumberofInvice"),InvoiceValue)

In [78]:
df4.sort("Country","WeekNumber").show()

+---------------+----------+--------------+--------------+------------+
|        Country|WeekNumber|Total_Quantity|NumberofInvice|InvoiceValue|
+---------------+----------+--------------+--------------+------------+
|      Australia|        48|           107|             1|      358.25|
|      Australia|        49|           214|             1|       258.9|
|      Australia|        50|           133|             2|      387.95|
|        Austria|        50|             3|             2|      257.04|
|        Bahrain|        51|            54|             1|      205.74|
|        Belgium|        48|           528|             1|       346.1|
|        Belgium|        50|           285|             2|      625.16|
|        Belgium|        51|           942|             2|      838.65|
|Channel Islands|        49|            80|             1|      363.53|
|         Cyprus|        50|           917|             1|     1590.82|
|        Denmark|        49|           454|             1|      

In [81]:
from pyspark.sql import Window 

In [83]:
# Window Function 
RunningTotalWindow = Window.partitionBy("Country").orderBy("WeekNumber").rowsBetween(Window.unboundedPreceding,Window.currentRow)


In [84]:
df5=df4.withColumn("Running_Total",f.sum("InvoiceValue").over(RunningTotalWindow))
df5.show()

+---------------+----------+--------------+--------------+------------+------------------+
|        Country|WeekNumber|Total_Quantity|NumberofInvice|InvoiceValue|     Running_Total|
+---------------+----------+--------------+--------------+------------+------------------+
|      Australia|        48|           107|             1|      358.25|            358.25|
|      Australia|        49|           214|             1|       258.9|            617.15|
|      Australia|        50|           133|             2|      387.95|1005.0999999999999|
|        Austria|        50|             3|             2|      257.04|            257.04|
|        Bahrain|        51|            54|             1|      205.74|            205.74|
|        Belgium|        48|           528|             1|       346.1|             346.1|
|        Belgium|        50|           285|             2|      625.16|            971.26|
|        Belgium|        51|           942|             2|      838.65|1809.9099999999999|

In [92]:
demo = Window.partitionBy("Country").orderBy("WeekNumber")
df6=df4.select(f.col("Country"),f.col("WeekNumber"),f.col("Total_Quantity"),f.col("NumberofInvice"),f.col("InvoiceValue"),f.sum("InvoiceValue").over(demo).alias("Running Invoice"))
df6.show()

+---------------+----------+--------------+--------------+------------+------------------+
|        Country|WeekNumber|Total_Quantity|NumberofInvice|InvoiceValue|   Running Invoice|
+---------------+----------+--------------+--------------+------------+------------------+
|      Australia|        48|           107|             1|      358.25|            358.25|
|      Australia|        49|           214|             1|       258.9|            617.15|
|      Australia|        50|           133|             2|      387.95|1005.0999999999999|
|        Austria|        50|             3|             2|      257.04|            257.04|
|        Bahrain|        51|            54|             1|      205.74|            205.74|
|        Belgium|        48|           528|             1|       346.1|             346.1|
|        Belgium|        50|           285|             2|      625.16|            971.26|
|        Belgium|        51|           942|             2|      838.65|1809.9099999999999|