In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('click_stream.csv')
df['event_time'] = pd.to_datetime(df['event_time'])

In [3]:
df.shape

(12833602, 6)

In [4]:
df['event_time'].min()

Timestamp('2016-06-30 22:59:36.254358+0000', tz='UTC')

In [5]:
df['event_time'].max()

Timestamp('2022-08-01 00:49:59.408424+0000', tz='UTC')

### Filter the data so we only use events which occured during the past year

In [6]:
df = df[(df['event_time'] >= '2021-08-01') & (df['event_time'] <= '2022-07-31')]

In [7]:
df.shape

(4793008, 6)

### Double check that everything is correct and is working.

In [8]:
df['event_time'].min()

Timestamp('2021-08-01 00:00:09.598036+0000', tz='UTC')

In [11]:
df['event_time'].max()

Timestamp('2022-07-30 23:59:56.710139+0000', tz='UTC')

In [12]:
df.dtypes

session_id                     object
event_name                     object
event_time        datetime64[ns, UTC]
event_id                       object
traffic_source                 object
event_metadata                 object
dtype: object

In [13]:
df.isnull().sum()

session_id              0
event_name              0
event_time              0
event_id                0
traffic_source          0
event_metadata    3274293
dtype: int64

### Split the column metadata into 3 separate columns - ProductID, Quantity and Item Price.

In [14]:
import ast

def extract_metadata(meta, key):
    if pd.isna(meta):
        return None
    if isinstance(meta, str):
        meta = ast.literal_eval(meta)
    return meta.get(key, None)

df['product_id'] = df['event_metadata'].apply(lambda x: extract_metadata(x, 'product_id'))
df['quantity'] = df['event_metadata'].apply(lambda x: extract_metadata(x, 'quantity'))
df['item_price'] = df['event_metadata'].apply(lambda x: extract_metadata(x, 'item_price'))
df['payment_status'] = df['event_metadata'].apply(lambda x: extract_metadata(x, 'payment_status'))

In [15]:
df = df.drop(columns=['event_metadata'])

In [17]:
df['event_name'].value_counts()

event_name
CLICK          968370
HOMEPAGE       930962
SCROLL         655255
ADD_TO_CART    637452
ITEM_DETAIL    500132
SEARCH         464663
BOOKING        314839
PROMO_PAGE     219574
ADD_PROMO      101761
Name: count, dtype: int64

In [18]:
df.head()

Unnamed: 0,session_id,event_name,event_time,event_id,traffic_source,product_id,quantity,item_price,payment_status
1734,9586a822-62e6-43e3-80a8-604e817daad4,HOMEPAGE,2021-08-04 07:31:35.425431+00:00,a78f83d7-489c-4123-808b-9fca01c51847,MOBILE,,,,
1735,9586a822-62e6-43e3-80a8-604e817daad4,ADD_TO_CART,2021-08-04 15:25:09.425431+00:00,f79f8b9f-c044-41da-add5-ed2e511585ce,MOBILE,17642.0,1.0,163920.0,
1736,9586a822-62e6-43e3-80a8-604e817daad4,BOOKING,2021-08-06 06:57:08.425431+00:00,544bfa6a-9bf3-410a-b7c5-ad430ab756ae,MOBILE,,,,Success
1737,9586a822-62e6-43e3-80a8-604e817daad4,SCROLL,2021-08-04 15:24:06.425431+00:00,2553f0a6-80cf-4df5-b0bc-3cc99332c0c2,MOBILE,,,,
1738,9586a822-62e6-43e3-80a8-604e817daad4,SEARCH,2021-08-04 23:15:21.425431+00:00,8669ac67-c9e3-4b4c-add7-3d55ca7386b1,MOBILE,,,,


### Drop the columns since I will not be needing them for my analysis and the majority of data is missing.

In [19]:
df = df.drop(columns= ['product_id', 'quantity','item_price'])

In [20]:
df.head()

Unnamed: 0,session_id,event_name,event_time,event_id,traffic_source,payment_status
1734,9586a822-62e6-43e3-80a8-604e817daad4,HOMEPAGE,2021-08-04 07:31:35.425431+00:00,a78f83d7-489c-4123-808b-9fca01c51847,MOBILE,
1735,9586a822-62e6-43e3-80a8-604e817daad4,ADD_TO_CART,2021-08-04 15:25:09.425431+00:00,f79f8b9f-c044-41da-add5-ed2e511585ce,MOBILE,
1736,9586a822-62e6-43e3-80a8-604e817daad4,BOOKING,2021-08-06 06:57:08.425431+00:00,544bfa6a-9bf3-410a-b7c5-ad430ab756ae,MOBILE,Success
1737,9586a822-62e6-43e3-80a8-604e817daad4,SCROLL,2021-08-04 15:24:06.425431+00:00,2553f0a6-80cf-4df5-b0bc-3cc99332c0c2,MOBILE,
1738,9586a822-62e6-43e3-80a8-604e817daad4,SEARCH,2021-08-04 23:15:21.425431+00:00,8669ac67-c9e3-4b4c-add7-3d55ca7386b1,MOBILE,


In [21]:
df.to_csv('click_stream_cleaned.csv', index=False)