In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

In [0]:
# Sample users, one row per user: [id, add, name].
# `add` is compared against a transaction's `ship_add` further down
# (presumably a home address / postal code -- kept as a string so the
# leading zero in '09397' is preserved).
users = [
    [4, '92541', 'A'],
    [1, '09397', 'B'],
]

# Sample transactions, one row per order: [id, user_id, ship_add].
txns = [
    [100, 4, '92541'],
    [101, 4, '124'],
    [102, 4, '92541'],
    [103, 1, '09397'],
    [104, 1, '890'],
]

# Column types are inferred by Spark from the Python values; only the
# column names are supplied here.
users_df = spark.createDataFrame(users, schema=['id', 'add', 'name'])
txn_df = spark.createDataFrame(txns, schema=['id', 'user_id', 'ship_add'])

In [0]:
# Attach each transaction to its user's `add` value so the two address
# columns can be compared row by row.
orders_with_home = (
    users_df
    .join(txn_df, users_df['id'] == txn_df['user_id'], how='inner')
    .select('user_id', 'add', 'ship_add')
)

# Row-level predicates: was the order shipped to the user's own address?
shipped_home = col('add') == col('ship_add')
shipped_elsewhere = col('add') != col('ship_add')

# Per-user order counts. `sum` is pyspark.sql.functions.sum (brought in by
# the star import at the top of the notebook), not the Python builtin.
per_user_counts = orders_with_home.groupBy('user_id').agg(
    count("*").alias("total_orders"),
    sum(when(shipped_home, 1).otherwise(0)).alias("orders_in_home"),
    sum(when(shipped_elsewhere, 1).otherwise(0)).alias("orders_not_in_home"),
)

# Keep only users with strictly more home-shipped orders than other orders.
result = per_user_counts.filter(
    col("orders_in_home") > col("orders_not_in_home")
)

# Report the home-shipped share of each remaining user's orders as a percentage.
(
    result
    .withColumn(
        "home_percentage",
        (col("orders_in_home") / col("total_orders")) * 100,
    )
    .select("user_id", "home_percentage")
    .show()
)

+-------+-----------------+
|user_id|  home_percentage|
+-------+-----------------+
|      4|66.66666666666666|
+-------+-----------------+



In [0]:
# Register the users DataFrame as the temp view `user_tbl` so the %sql cells can query it.
users_df.createOrReplaceTempView(name="user_tbl")

In [0]:
# Register the transactions DataFrame as the temp view `txn_tbl` so the %sql cells can query it.
txn_df.createOrReplaceTempView(name="txn_tbl")

In [0]:
%sql
-- Users who ship to their own address more often than anywhere else, with the
-- percentage of their orders that were shipped home.
--
-- Both counts are computed with conditional aggregation in a single pass.
-- Counting home / non-home orders in separate filtered CTEs and INNER JOINing
-- them would drop any user that has zero rows in one of the buckets -- in
-- particular a user whose orders ALL ship home (orders_not_in_home = 0), who
-- clearly satisfies the filter below and must be reported with 100%.
with user_ship as (
  -- one row per transaction, paired with the owning user's `add` value
  select t.user_id, u.add, t.ship_add
  from user_tbl u
  join txn_tbl t
    on u.id = t.user_id
),
order_counts as (
  select
    user_id,
    count(*)                                            as total_orders,
    sum(case when add =  ship_add then 1 else 0 end)    as orders_in_home,
    sum(case when add <> ship_add then 1 else 0 end)    as orders_not_in_home
  from user_ship
  group by user_id
)
select
  user_id,
  ((orders_in_home / total_orders) * 100) as home_percentage
from order_counts
-- strict majority: ties (equal home / non-home counts) are excluded
where orders_in_home > orders_not_in_home;

user_id,home_percentage
4,66.66666666666666


In [0]:
%sql
-- Percentage of each user's orders that were shipped to the user's own address.
--
-- The LEFT JOIN matches a transaction to its user row only when the shipping
-- address equals the user's `add`; for every other transaction U.add is NULL.
-- count(U.add) therefore counts home-shipped orders, count(*) counts all orders.
-- No majority filter is applied here: every user present in txn_tbl is returned.
select
  T.user_id,
  (count(U.add) / count(*)) * 100 as Home_order_pect
from txn_tbl as T
left join user_tbl as U
  on  U.id  = T.user_id
  and U.add = T.ship_add
group by
  T.user_id

user_id,Home_order_pect
4,66.66666666666666
1,50.0
