# New and Repeat Customers
## Find the total count of new and repeat customers for each order date

In [0]:
-- Work inside the "workspace" catalog for the rest of the notebook
use catalog workspace;

-- Make sure the practice schema exists, then make it the current schema
create schema if not exists sql_pyspark_practice;
use sql_pyspark_practice;

In [0]:
-- Sample data for the exercise: one row per order.
create or replace table customer_orders (
    order_id integer,
    customer_id integer,
    order_date date,
    order_amount integer
);

-- The explicit column list keeps the insert correct even if the table's
-- column order changes later; values are no longer bound by position alone.
insert into customer_orders (order_id, customer_id, order_date, order_amount)
values
    (1, 100, cast('2022-01-01' as date), 2000),
    (2, 200, cast('2022-01-01' as date), 2500),
    (3, 300, cast('2022-01-01' as date), 2100),
    (4, 100, cast('2022-01-02' as date), 2000),
    (5, 400, cast('2022-01-02' as date), 2200),
    (6, 500, cast('2022-01-02' as date), 2700),
    (7, 100, cast('2022-01-03' as date), 3000),
    (8, 400, cast('2022-01-03' as date), 1000),
    (9, 600, cast('2022-01-03' as date), 3000);

-- Inspect the loaded rows in a deterministic order.
select
    order_id,
    customer_id,
    order_date,
    order_amount
from customer_orders
order by order_id;

In [0]:
-- New vs. repeat customers per order date.
-- A customer is "new" on the date of their earliest order and "repeat" on any
-- later date on which they order again.
with first_visit as (
    -- earliest order date per customer
    select
        customer_id,
        min(order_date) as first_visit_date
    from customer_orders
    group by customer_id
)

select
    co.order_date,
    -- Count distinct customers rather than order rows, so a customer who places
    -- several orders on the same date is counted once for that date.
    -- The *_flag aliases are kept for compatibility; each holds a per-date count.
    count(distinct case when co.order_date = fv.first_visit_date then co.customer_id end) as first_visit_flag,
    count(distinct case when co.order_date <> fv.first_visit_date then co.customer_id end) as repeat_visit_flag
from customer_orders as co
inner join first_visit as fv
    on co.customer_id = fv.customer_id
group by co.order_date
order by co.order_date;

### Learnings
- Using `min` to find each customer's first occurrence: the earliest order date for a customer id is the date of that customer's first visit
- Usage of CTEs
- Comparing each order date with the customer's first visit date to classify the order as a first or a repeat visit

In [0]:
%python
from pyspark.sql.functions import min
from pyspark.sql import functions as F
# PySpark version of the new vs. repeat customers query.
df = spark.table("customer_orders")

# Earliest order date per customer. F.min is used explicitly so this line does
# not depend on a bare `min` name that shadows Python's builtin.
first_visit = df.groupBy("customer_id").agg(F.min("order_date").alias("first_visit"))
display(first_visit)

# Joining on the column name keeps a single customer_id column in the result;
# joining on an equality expression would keep one copy from each side.
joined_df = df.join(first_visit, on="customer_id", how="inner")
display(joined_df)

# Per-order flags: 1 when the order falls on / after the customer's first visit date.
joined_df = joined_df.withColumn(
    "first_visit_flag",
    F.when(F.col("order_date") == F.col("first_visit"), 1).otherwise(0),
).withColumn(
    "repeat_visit_flag",
    F.when(F.col("order_date") != F.col("first_visit"), 1).otherwise(0),
)

# Count distinct customers rather than summing the flags, so a customer with
# several orders on the same date is counted once for that date.
result = (
    joined_df.groupBy("order_date")
    .agg(
        F.countDistinct(
            F.when(F.col("first_visit_flag") == 1, F.col("customer_id"))
        ).alias("new_customers"),
        F.countDistinct(
            F.when(F.col("repeat_visit_flag") == 1, F.col("customer_id"))
        ).alias("repeat_customers"),
    )
    .orderBy("order_date")
)

display(result)
