In [1]:
import pandas as pd

In [2]:
df= pd.read_csv('/content/sample_data/2019-Dec.csv')

In [3]:
df['event_type'].unique()

array(['remove_from_cart', 'view', 'cart', 'purchase'], dtype=object)

In [4]:
df['event_time'] = pd.to_datetime(df['event_time'])

### **데이터 전처리**

---



1. 중복값 처리

In [5]:
df = df.drop_duplicates()

2. 결측값 처리

In [6]:
df.isna().sum()

Unnamed: 0,0
event_time,0
event_type,0
product_id,0
category_id,0
category_code,3292700
brand,1428211
price,0
user_id,0
user_session,714


In [7]:
#분석에 의미없는 카테고리 코드 제거
df = df.drop('category_code', axis=1)

#총 2541810 데이터 중 1072879(brand 결측치) 제거해도 1,468,931개의 데이터를 확보할 수 있기 때문에 단순 제거
df = df.dropna(subset = ['brand','price','user_id','user_session'])

df.isna().sum()

Unnamed: 0,0
event_time,0
event_type,0
product_id,0
category_id,0
brand,0
price,0
user_id,0
user_session,0


3. 이상치(Outliers) 처리

In [8]:
df['event_time'].describe()

Unnamed: 0,event_time
count,1920758
mean,2019-12-14 20:50:43.754482432+00:00
min,2019-12-01 00:00:00+00:00
25%,2019-12-07 19:10:03.249999872+00:00
50%,2019-12-14 11:05:43+00:00
75%,2019-12-21 16:29:01.249999872+00:00
max,2019-12-31 23:59:46+00:00


- 2019-12-01부터 2019-12-31 한달 간의 데이터
- 날짜에 대한 이상치 없음

### **퍼널별 전환률 분석**

---



In [12]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('ecommerce.db')
df.to_sql('ecommerce_table', conn, if_exists='replace', index=False)

query = "SELECT * FROM ecommerce_table"
result_df = pd.read_sql(query, conn)

print(result_df.head())

                  event_time        event_type  product_id  \
0  2019-12-01 00:00:00+00:00  remove_from_cart     5712790   
1  2019-12-01 00:00:00+00:00              view     5764655   
2  2019-12-01 00:00:02+00:00              cart        4958   
3  2019-12-01 00:00:05+00:00              view     5848413   
4  2019-12-01 00:00:09+00:00              view     5773361   

           category_id      brand  price    user_id  \
0  1487580005268456287      f.o.x   6.27  576802932   
1  1487580005411062629        cnd  29.05  412120092   
2  1487580009471148064     runail   1.19  494077766   
3  1487580007675986893  freedecor   0.79  348405118   
4  1487580005134238553     runail   2.62  560109803   

                           user_session  
0  51d85cb0-897f-48d2-918b-ad63965c12dc  
1  8adff31e-2051-4894-9758-224bfa8aec18  
2  c99a50e8-2fac-4c4d-89ec-41c05f114554  
3  722ffea5-73c0-4924-8e8f-371ff8031af4  
4  38cf4ba1-4a0a-4c9e-b870-46685d105f95  


1. view - cart - purchase 퍼널

In [25]:
query_funnel1 = pd.read_sql_query("""

WITH session_purchase AS (
    SELECT user_session,
           (CASE WHEN event_type = 'view' THEN 1 ELSE 0 END) AS has_view,
           (CASE WHEN event_type = 'cart' THEN 1 ELSE 0 END) AS has_cart,
           (CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS has_purchase,
    FROM ecommerce_table
    GROUP BY user_session
)
SELECT
    ROUND(CAST(SUM(has_view) AS FLOAT) / COUNT(*) * 100, 2) || '%' as Entire_View_conversion_rate,
    ROUND(CAST(SUM(has_cart) AS FLOAT) / CAST(SUM(has_view) AS FLOAT) * 100, 2) || '%' as View_Cart_conversion_rate,
    ROUND(CAST(SUM(has_purchase) AS FLOAT) / CAST(SUM(has_cart) AS FLOAT) * 100, 2) || '%' as Cart_Purchase_conversion_rate
FROM session_purchase;

"""
,conn)

# 결과 확인
print(query_funnel1.head())

  Entire_View_conversion_rate View_Cart_conversion_rate  \
0                      86.08%                    11.06%   

  Cart_Purchase_conversion_rate  
0                         7.11%  


2. view - cart - remove_from_cart 퍼널

In [24]:
query_funnel2 = pd.read_sql_query("""

WITH T1 AS (
    SELECT user_session,
           (CASE WHEN event_type = 'view' THEN 1 ELSE 0 END) AS has_view,
           (CASE WHEN event_type = 'cart' THEN 1 ELSE 0 END) AS has_cart,
           (CASE WHEN event_type = 'remove_from_cart' THEN 1 ELSE 0 END) AS has_remove_from_cart
    FROM ecommerce_table
    GROUP BY user_session
)
SELECT
    ROUND(CAST(SUM(has_view) AS FLOAT) / COUNT(*) * 100, 2) || '%' as Entire_View_conversion_rate,
    ROUND(CAST(SUM(has_cart) AS FLOAT) / CAST(SUM(has_view) AS FLOAT) * 100, 2) || '%' as View_Cart_conversion_rate,
    ROUND(CAST(SUM(has_remove_from_cart) AS FLOAT) / CAST(SUM(has_cart) AS FLOAT) * 100, 2) || '%' as Cart_Remove_conversion_rate
FROM T1;

"""
,conn)

# 결과 확인
print(query_funnel2.head())

  Entire_View_conversion_rate View_Cart_conversion_rate  \
0                      86.08%                    11.06%   

  Cart_Remove_conversion_rate  
0                      39.09%  


3. view -purchase 퍼널

In [29]:
query_funnel3 = pd.read_sql_query("""

WITH T1 AS (
    SELECT user_session,
           (CASE WHEN event_type = 'view' THEN 1 ELSE 0 END) AS has_view,
           (CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END) AS has_purchase

    FROM ecommerce_table
    GROUP BY user_session
)
SELECT
    ROUND(CAST(SUM(has_purchase) AS FLOAT) / CAST(SUM(has_view) AS FLOAT) * 100, 2) || '%' as View_Purchase_conversion_rate
FROM T1;

"""
,conn)

# 결과 확인
print(query_funnel3.head())

  View_Purchase_conversion_rate
0                         0.79%
