In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w
from pyspark.sql.types import IntegerType, BooleanType, DateType

In [2]:
# Obtain the Spark session (an existing one is reused if present) under the app name 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [3]:
# Build a two-row DataFrame with columns Name and Age, then print it.
df = spark.createDataFrame(
    [("Tushar", 30), ("Vaishali", 29)],
    schema=["Name", "Age"],
)

df.show()

+--------+---+
|    Name|Age|
+--------+---+
|  Tushar| 30|
|Vaishali| 29|
+--------+---+



# Customer Data

In [5]:
# Customer table: one tuple per customer in the order (id, name, city, pin, email).
# Note the sample data contains an empty-string email and a missing (None) pin.
df_cust = spark.createDataFrame(
    [
        (1001, 'Vikas Bhosale', 'Mumbai', 426075, 'v@gmail.com'),
        (1432, 'Sahil Kharat', 'Nagpur', 45234, 'Sahil@ymail.com'),
        (1021, 'Ram Dev', 'Pune', 41108, ''),
        (3021, 'Akshay Abhale', 'Akole', 45234, 'Akshay@gmail.com'),
        (1801, 'Jitesh Varma', 'Mumbai', None, 'jitu@gmail.com'),
    ],
    schema=['id', 'name', 'city', 'pin', 'email'],
)

df_cust.show()

+----+-------------+------+------+----------------+
|  id|         name|  city|   pin|           email|
+----+-------------+------+------+----------------+
|1001|Vikas Bhosale|Mumbai|426075|     v@gmail.com|
|1432| Sahil Kharat|Nagpur| 45234| Sahil@ymail.com|
|1021|      Ram Dev|  Pune| 41108|                |
|3021|Akshay Abhale| Akole| 45234|Akshay@gmail.com|
|1801| Jitesh Varma|Mumbai|  NULL|  jitu@gmail.com|
+----+-------------+------+------+----------------+



In [6]:
# Customers whose email is the empty string, displayed without the email column.
# The filter runs BEFORE the drop so the predicate refers to a column that still
# exists, instead of relying on the analyzer to re-resolve an already dropped column.
(
    df_cust
    .where("email=''")
    .drop('email')
    .show()
)

# Number of customers per city, largest group first.
(
    df_cust
    .groupBy('city')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+----+-------+----+-----+
|  id|   name|city|  pin|
+----+-------+----+-----+
|1021|Ram Dev|Pune|41108|
+----+-------+----+-----+

+------+-----+
|  city|count|
+------+-----+
|Mumbai|    2|
|Nagpur|    1|
|  Pune|    1|
| Akole|    1|
+------+-----+



In [44]:
# Whole-table aggregates over `pin`: the sum (aliased 'pin total') and the maximum.
(
    df_cust
    .agg(
        f.sum(df_cust.pin).alias('pin total'),
        f.max('pin'),
    )
    .show()
)

+---------+--------+
|pin total|max(pin)|
+---------+--------+
|   557651|  426075|
+---------+--------+



# Adding one more column to existing table

In [7]:
# Add a 'gen' column holding the literal 'M' on every row.
df_cust = df_cust.withColumn(colName='gen', col=f.lit('M'))
df_cust.show()

+----+-------------+------+------+----------------+---+
|  id|         name|  city|   pin|           email|gen|
+----+-------------+------+------+----------------+---+
|1001|Vikas Bhosale|Mumbai|426075|     v@gmail.com|  M|
|1432| Sahil Kharat|Nagpur| 45234| Sahil@ymail.com|  M|
|1021|      Ram Dev|  Pune| 41108|                |  M|
|3021|Akshay Abhale| Akole| 45234|Akshay@gmail.com|  M|
|1801| Jitesh Varma|Mumbai|  NULL|  jitu@gmail.com|  M|
+----+-------------+------+------+----------------+---+



In [9]:
# Two extra customer rows; the 'gen' values are intentionally inconsistent
# ('Female' / 'Fema') and are normalised in a later cell.
values = [
    (3412, 'Neha Sharma', 'Nashik', 543412, 'neha@tmsil.com', 'Female'),
    (2534, 'Vaishali Shete', 'Akole', 45234, 'vaish@gmail.com', 'Fema'),
]
cols = ['id', 'name', 'city', 'pin', 'email', 'gen']
df_add = spark.createDataFrame(values, schema=cols)
df_add.show()

+----+--------------+------+------+---------------+------+
|  id|          name|  city|   pin|          email|   gen|
+----+--------------+------+------+---------------+------+
|3412|   Neha Sharma|Nashik|543412| neha@tmsil.com|Female|
|2534|Vaishali Shete| Akole| 45234|vaish@gmail.com|  Fema|
+----+--------------+------+------+---------------+------+



# Merging two tables 

In [12]:
# Append the additional customers to the customer table.
# unionByName matches columns by name rather than by position, so a difference
# in column order between the two frames cannot silently misalign the data.
# Rows are not de-duplicated: re-running this cell appends df_add again.
df_cust = df_cust.unionByName(df_add)

In [30]:
# Normalise 'gen': any value starting with 'F' becomes 'F'; every other value is kept as is.
df_cust = df_cust.withColumn(
    'gen',
    f.when(f.col('gen').like('F%'), 'F').otherwise(f.col('gen')),
)
df_cust.show()

+----+--------------+------+------+----------------+---+
|  id|          name|  city|   pin|           email|gen|
+----+--------------+------+------+----------------+---+
|1001| Vikas Bhosale|Mumbai|426075|     v@gmail.com|  M|
|1021|       Ram Dev|  Pune| 41108|                |  M|
|1432|  Sahil Kharat|Nagpur| 45234| Sahil@ymail.com|  M|
|1801|  Jitesh Varma|Mumbai|  NULL|  jitu@gmail.com|  M|
|2534|Vaishali Shete| Akole| 45234| vaish@gmail.com|  F|
|3021| Akshay Abhale| Akole| 45234|Akshay@gmail.com|  M|
|3412|   Neha Sharma|Nashik|543412|  neha@tmsil.com|  F|
+----+--------------+------+------+----------------+---+



In [31]:
# Remove duplicates, if required

In [29]:
# Keep a single row per (id, name) pair: number the rows inside each pair and retain row 1.
# The window is ordered by 'id', which is constant inside a partition, so the
# ordering does not decide which of several duplicates is the one kept.
wdwPart = w.partitionBy('id', 'name').orderBy('id')
df_cust = (
    df_cust
    .withColumn('row', f.row_number().over(wdwPart))
    .filter(f.col('row') == 1)
    .drop('row')
)
df_cust.show()

+----+--------------+------+------+----------------+---+
|  id|          name|  city|   pin|           email|gen|
+----+--------------+------+------+----------------+---+
|1001| Vikas Bhosale|Mumbai|426075|     v@gmail.com|  M|
|1021|       Ram Dev|  Pune| 41108|                |  M|
|1432|  Sahil Kharat|Nagpur| 45234| Sahil@ymail.com|  M|
|1801|  Jitesh Varma|Mumbai|  NULL|  jitu@gmail.com|  M|
|2534|Vaishali Shete| Akole| 45234| vaish@gmail.com|  F|
|3021| Akshay Abhale| Akole| 45234|Akshay@gmail.com|  M|
|3412|   Neha Sharma|Nashik|543412|  neha@tmsil.com|  F|
+----+--------------+------+------+----------------+---+



In [None]:
# Create the order table DataFrame

In [32]:
# Order table: (OrderId, OrderDate as a 'YYYY-MM-DD' string, Qty, Product).
# `Product` holds the product id that the later joins match against ProductId.
cols = ['OrderId', 'OrderDate', 'Qty', 'Product']
values = [
    (342, '2021-09-12', 23, 1),
    (23, '2020-02-21', 2, 4),
    (42, '2014-08-02', 23, 3),
    (2, '2022-06-23', 3, 1),
    (76, '2001-01-10', 56, 7),
    (567, '2021-08-18', 24, 6),
    (9, '2012-05-13', 7, 2),
    (25, '2023-06-22', 230, 2),
    (56, '2016-01-01', 13, 5),
]
df_order = spark.createDataFrame(values, schema=cols)
df_order.show()

+-------+----------+---+-------+
|OrderId| OrderDate|Qty|Product|
+-------+----------+---+-------+
|    342|2021-09-12| 23|      1|
|     23|2020-02-21|  2|      4|
|     42|2014-08-02| 23|      3|
|      2|2022-06-23|  3|      1|
|     76|2001-01-10| 56|      7|
|    567|2021-08-18| 24|      6|
|      9|2012-05-13|  7|      2|
|     25|2023-06-22|230|      2|
|     56|2016-01-01| 13|      5|
+-------+----------+---+-------+



In [None]:
# Product table df

In [33]:
# Product lookup table: (ProductId, ProductName).
cols = ['ProductId', 'ProductName']
values = [
    (1, 'USB'),
    (2, 'Mobile'),
    (3, 'Headphone'),
    (4, 'Mouse'),
    (5, 'Pendrive'),
    (6, 'Camera'),
    (7, 'Battery'),
    (8, 'Cover'),
    (9, 'Charger'),
    (10, 'Lense'),
]
df_prdt = spark.createDataFrame(values, schema=cols)
df_prdt.show()

+---------+-----------+
|ProductId|ProductName|
+---------+-----------+
|        1|        USB|
|        2|     Mobile|
|        3|  Headphone|
|        4|      Mouse|
|        5|   Pendrive|
|        6|     Camera|
|        7|    Battery|
|        8|      Cover|
|        9|    Charger|
|       10|      Lense|
+---------+-----------+



In [None]:
# Joining the product and order tables

In [34]:
# Resolve each order's product id to its name (inner join: orders without a
# matching product are excluded) and list order id, product name and quantity.
(
    df_order
    .join(df_prdt, on=df_order.Product == df_prdt.ProductId, how='inner')
    .select('OrderId', 'ProductName', 'Qty')
    .show()
)

+-------+-----------+---+
|OrderId|ProductName|Qty|
+-------+-----------+---+
|    342|        USB| 23|
|      2|        USB|  3|
|      9|     Mobile|  7|
|     25|     Mobile|230|
|     42|  Headphone| 23|
|     23|      Mouse|  2|
|     56|   Pendrive| 13|
|    567|     Camera| 24|
|     76|    Battery| 56|
+-------+-----------+---+



In [36]:
# Total ordered quantity per product name, over orders that match a product.
(
    df_order
    .join(df_prdt, on=df_order.Product == df_prdt.ProductId, how='inner')
    .select('ProductName', 'Qty')
    .groupBy('ProductName')
    .agg(f.sum('Qty').alias("Total Qty"))
    .show()
)

+-----------+---------+
|ProductName|Total Qty|
+-----------+---------+
|  Headphone|       23|
|   Pendrive|       13|
|      Mouse|        2|
|     Camera|       24|
|     Mobile|      237|
|        USB|       26|
|    Battery|       56|
+-----------+---------+



In [37]:
# Convert OrderDate from its string form to a date-typed column (same column name).
df_order = df_order.withColumn(
    'OrderDate',
    f.col('OrderDate').cast('date'),
)
df_order.show()

+-------+----------+---+-------+
|OrderId| OrderDate|Qty|Product|
+-------+----------+---+-------+
|    342|2021-09-12| 23|      1|
|     23|2020-02-21|  2|      4|
|     42|2014-08-02| 23|      3|
|      2|2022-06-23|  3|      1|
|     76|2001-01-10| 56|      7|
|    567|2021-08-18| 24|      6|
|      9|2012-05-13|  7|      2|
|     25|2023-06-22|230|      2|
|     56|2016-01-01| 13|      5|
+-------+----------+---+-------+



In [None]:
# Creating customer-order link table

In [38]:
# Link table pairing a customer id (c_id) with an order id (o_id).
values = [
    (3021, 23),
    (1001, 42),
    (3412, 56),
    (1801, 2),
    (2534, 25),
    (1021, 567),
]
cols = ['c_id', 'o_id']
df_cust_order = spark.createDataFrame(values, schema=cols)
df_cust_order.show()

+----+----+
|c_id|o_id|
+----+----+
|3021|  23|
|1001|  42|
|3412|  56|
|1801|   2|
|2534|  25|
|1021| 567|
+----+----+



In [43]:
# Attach the customer id to every order via the link table. The left outer join
# keeps orders that have no link row (their c_id is null); the redundant o_id
# key is dropped afterwards.
df_order = (
    df_order
    .join(
        df_cust_order,
        on=df_cust_order.o_id == df_order.OrderId,
        how='leftouter',
    )
    .drop('o_id')
)
df_order.show()

+-------+----------+---+-------+----+
|OrderId| OrderDate|Qty|Product|c_id|
+-------+----------+---+-------+----+
|    342|2021-09-12| 23|      1|NULL|
|     23|2020-02-21|  2|      4|3021|
|      2|2022-06-23|  3|      1|1801|
|     42|2014-08-02| 23|      3|1001|
|    567|2021-08-18| 24|      6|1021|
|     76|2001-01-10| 56|      7|NULL|
|     25|2023-06-22|230|      2|2534|
|      9|2012-05-13|  7|      2|NULL|
|     56|2016-01-01| 13|      5|3412|
+-------+----------+---+-------+----+



In [44]:
# Rename the link column c_id to cust_id, then print the resulting schema.
df_order = df_order.withColumnRenamed(existing='c_id', new='cust_id')
df_order.printSchema()

root
 |-- OrderId: long (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Qty: long (nullable = true)
 |-- Product: long (nullable = true)
 |-- cust_id: long (nullable = true)



In [None]:
# Creating table for product stock details

In [46]:
# Stock table: (stk_id, p_id, stock). A product id can appear on several rows
# (p_id 3 and 4 each occur twice), so stock is summed per product downstream.
value = [
    (1, 1, 34),
    (2, 4, 56),
    (3, 2, 21),
    (4, 3, 897),
    (5, 8, 27),
    (6, 3, 23),
    (7, 4, 21),
]
col = ['stk_id', 'p_id', 'stock']
df_stock = spark.createDataFrame(value, schema=col)
df_stock.show()

+------+----+-----+
|stk_id|p_id|stock|
+------+----+-----+
|     1|   1|   34|
|     2|   4|   56|
|     3|   2|   21|
|     4|   3|  897|
|     5|   8|   27|
|     6|   3|   23|
|     7|   4|   21|
+------+----+-----+



In [47]:
# Report of customer orders with the total stock available for the ordered product.
#   1. customers x orders (inner): only customers linked to an order remain.
#   2. x products (inner): resolves the order's product id to a product name.
#   3. x stock (left outer): keeps products that have no stock rows at all.
# Stock rows are summed per order line; a product without stock rows sums to
# null, which is replaced by 0. f.coalesce is used for the null default because
# it is available in every PySpark version, unlike the newer f.ifnull.
# The result is sorted by ordered quantity, largest first.
(
    df_cust
    .join(df_order, on=df_order.cust_id == df_cust.id, how='inner')
    .join(df_prdt, on=df_order.Product == df_prdt.ProductId, how='inner')
    .join(df_stock, on=df_prdt.ProductId == df_stock.p_id, how='leftouter')
    .groupBy('cust_id', 'name', 'OrderId', 'ProductName', 'Qty')
    .agg(f.coalesce(f.sum('stock'), f.lit(0)).alias('stock'))
    .orderBy('Qty', ascending=False)
    .show()
)

+-------+--------------+-------+-----------+---+-----+
|cust_id|          name|OrderId|ProductName|Qty|stock|
+-------+--------------+-------+-----------+---+-----+
|   2534|Vaishali Shete|     25|     Mobile|230|   21|
|   1021|       Ram Dev|    567|     Camera| 24|    0|
|   1001| Vikas Bhosale|     42|  Headphone| 23|  920|
|   3412|   Neha Sharma|     56|   Pendrive| 13|    0|
|   1801|  Jitesh Varma|      2|        USB|  3|   34|
|   3021| Akshay Abhale|     23|      Mouse|  2|   77|
+-------+--------------+-------+-----------+---+-----+

