In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw view log from disk and preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer='user_views.csv')
dataset.head(n=5)

Unnamed: 0,user_id,sku_id,timestamp,session_id,device_type,referrer
0,U3089,P1223,2024-11-17 20:03:29,S71948,Mobile,Paid Search
1,U2658,P0448,2025-01-21 16:01:04,S71280,Mobile,Campaign
2,U3831,P1124,2024-03-23 09:41:11,S40100,App,Social Media
3,U2823,P0261,2023-10-17 13:32:16,S10259,App,Campaign
4,U4688,P0354,2023-07-10 04:38:42,S70757,Mobile,Organic


In [None]:
# Work on an independent (deep) copy so the loaded `dataset` frame is not modified.
df_user_views = dataset.copy(deep=True)
df_user_views.head(n=5)

Unnamed: 0,user_id,sku_id,timestamp,session_id,device_type,referrer
0,U3089,P1223,2024-11-17 20:03:29,S71948,Mobile,Paid Search
1,U2658,P0448,2025-01-21 16:01:04,S71280,Mobile,Campaign
2,U3831,P1124,2024-03-23 09:41:11,S40100,App,Social Media
3,U2823,P0261,2023-10-17 13:32:16,S10259,App,Campaign
4,U4688,P0354,2023-07-10 04:38:42,S70757,Mobile,Organic


In [None]:
# Summarise column dtypes and non-null counts of the working frame.
df_user_views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   user_id      200000 non-null  object
 1   sku_id       200000 non-null  object
 2   timestamp    200000 non-null  object
 3   session_id   200000 non-null  object
 4   device_type  194067 non-null  object
 5   referrer     195966 non-null  object
dtypes: object(6)
memory usage: 9.2+ MB


In [None]:
# Print a heading followed by the number of missing entries in each column.
print("\nMissing Values:", df_user_views.isna().sum(), sep="\n")


Missing Values:
user_id           0
sku_id            0
timestamp         0
session_id        0
device_type    5933
referrer       4034
dtype: int64


In [None]:
# Convert the string timestamps into pandas datetime values so the .dt accessor can be used.
df_user_views['timestamp'] = df_user_views['timestamp'].pipe(pd.to_datetime)

In [None]:
# Confirm the dtype of the timestamp column after conversion.
df_user_views.dtypes['timestamp']

dtype('<M8[ns]')

In [None]:
# Replace missing categorical values with explicit placeholder labels,
# one placeholder per column; all other columns are left as they are.
df_user_views = df_user_views.fillna(
    {
        'device_type': 'Unknown',
        'referrer': 'direct',
    }
)


In [None]:
# Re-count missing entries per column to verify the placeholders were applied.
print("\nMissing After Fix:", df_user_views.isna().sum(), sep="\n")


Missing After Fix:
user_id        0
sku_id         0
timestamp      0
session_id     0
device_type    0
referrer       0
dtype: int64


In [None]:
# Collect rows that repeat an earlier row across every column
# (the first occurrence of each row is not counted as a duplicate).
duplicate_rows = df_user_views.loc[df_user_views.duplicated(keep='first')]
print(f"\nTotal Duplicate Rows: {duplicate_rows.shape[0]}")


Total Duplicate Rows: 0


In [None]:
# Canonicalise device_type labels: trim surrounding whitespace, then lower-case.
df_user_views = df_user_views.assign(
    device_type=lambda frame: frame['device_type'].str.strip().str.lower()
)


In [None]:
# Frequency of each device_type label, with NaN (if any) counted as its own bucket.
device_type_counts = df_user_views['device_type'].value_counts(dropna=False)
print(device_type_counts)

device_type
app        64771
desktop    64707
mobile     64589
unknown     5933
Name: count, dtype: int64


In [None]:
# Canonicalise referrer labels: trim surrounding whitespace, then lower-case.
referrer_clean = df_user_views['referrer'].str.strip().str.lower()
df_user_views['referrer'] = referrer_clean

# Show the ten most frequent referrer labels.
referrer_counts = df_user_views['referrer'].value_counts()
print(referrer_counts.head(10))

referrer
campaign        39414
paid search     39254
email           39236
social media    39210
organic         38852
direct           4034
Name: count, dtype: int64


In [None]:
# Fill any remaining missing device_type values with the most frequent value (mode).
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: the inplace form operates on a temporary
# object, emits a pandas FutureWarning, and stops having any effect in pandas 3.0.
#
# NOTE(review): an earlier cell already replaces missing device_type values with
# 'Unknown', so in a top-to-bottom run no NaN is left for this cell to fill.
# If the intent is to impute the mode instead of keeping an 'unknown' bucket,
# that earlier fill needs to change -- confirm which behaviour is wanted.
most_common_device = df_user_views['device_type'].mode()[0]
df_user_views['device_type'] = df_user_views['device_type'].fillna(most_common_device)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_user_views['device_type'].fillna(df_user_views['device_type'].mode()[0], inplace=True)


In [None]:
# Re-inspect dtypes and non-null counts after the cleaning steps above.
df_user_views.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   user_id      200000 non-null  object        
 1   sku_id       200000 non-null  object        
 2   timestamp    200000 non-null  datetime64[ns]
 3   session_id   200000 non-null  object        
 4   device_type  200000 non-null  object        
 5   referrer     200000 non-null  object        
dtypes: datetime64[ns](1), object(5)
memory usage: 9.2+ MB


# Feature Engineering for EDA


In [None]:
# Time-based context derived from the view timestamp:
#   view_hour      -> hour of day
#   view_dayofweek -> day of week, 0=Mon
#   is_weekend     -> 1 when the day of week is 5 or 6, otherwise 0
df_user_views = df_user_views.assign(
    view_hour=lambda frame: frame['timestamp'].dt.hour,
    view_dayofweek=lambda frame: frame['timestamp'].dt.weekday,
    is_weekend=lambda frame: frame['view_dayofweek'].isin([5, 6]).astype(int),
)

In [None]:
# SESSION-LEVEL INTENT
# Number of rows (views) recorded for each session_id.
session_view_counts = (
    df_user_views
      .groupby('session_id')
      .size()
      .rename('session_view_count')
)

# Attach the per-session count to every row by looking up its session_id.
# A plain column assignment is used instead of DataFrame.merge so the cell is
# idempotent: re-running it overwrites `session_view_count` rather than
# producing suffixed duplicates (`session_view_count_x` / `session_view_count_y`).
df_user_views['session_view_count'] = df_user_views['session_id'].map(session_view_counts)


In [None]:
# USER-LEVEL ENGAGEMENT
# Number of rows (views) recorded for each user_id.
user_view_counts = (
    df_user_views
      .groupby('user_id')
      .size()
      .rename('user_view_count')
)

# Attach the per-user count to every row by looking up its user_id.
# A plain column assignment is used instead of DataFrame.merge so the cell is
# idempotent: re-running it overwrites `user_view_count` rather than
# producing suffixed duplicates (`user_view_count_x` / `user_view_count_y`).
df_user_views['user_view_count'] = df_user_views['user_id'].map(user_view_counts)


In [None]:
# PRODUCT-LEVEL INTEREST
# Number of rows (views) recorded for each sku_id.
sku_view_counts = (
    df_user_views
      .groupby('sku_id')
      .size()
      .rename('sku_total_views')
)

# Attach the per-SKU count to every row by looking up its sku_id.
# A plain column assignment is used instead of DataFrame.merge so the cell is
# idempotent: re-running it overwrites `sku_total_views` rather than
# producing suffixed duplicates (`sku_total_views_x` / `sku_total_views_y`).
df_user_views['sku_total_views'] = df_user_views['sku_id'].map(sku_view_counts)

| **Feature**                  | What It Captures                     | Why It Powers Dynamic Pricing & Demand Estimation                                                                             |
| ------------------------ | ------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------- |
| **`view_hour`**          | Micro‑temporal context (hour of day) | Buying intent and traffic mix change by hour; RL can learn time‑sensitive price patterns.                                     |
| **`view_dayofweek`**     | Day‑of‑week context (0=Mon)          | Weekends often show higher leisure browsing; weekday patterns may imply urgency (e.g., lunchtime shopping).                   |
| **`is_weekend`**         | Binary weekend flag                  | Simple signal for weekend promos and higher discretionary spending.                                                           |
| **`session_view_count`** | Depth of a single browsing session   | A session with many views indicates strong intent → model may allow smaller discounts to close sale.                          |
| **`user_view_count`**    | Overall engagement of a user         | High‑engagement (loyal) users often have a higher lifetime value; RL can price less aggressively or offer loyalty perks. |
| **`sku_total_views`**    | Aggregate popularity of a product    | Proxy for demand; highly viewed SKUs may sustain higher prices, while low‑view SKUs may need markdowns.                       |


In [None]:
# Preview the first five rows of the engineered feature columns only.
engineered_columns = [
    'view_hour',
    'view_dayofweek',
    'is_weekend',
    'session_view_count',
    'user_view_count',
    'sku_total_views',
]
df_user_views.loc[:, engineered_columns].head()

Unnamed: 0,view_hour,view_dayofweek,is_weekend,session_view_count,user_view_count,sku_total_views
0,20,6,1,5,33,115
1,16,1,0,2,47,143
2,9,5,1,2,42,157
3,13,1,0,2,41,136
4,4,0,0,7,48,139


In [None]:
# Write the cleaned, feature-enriched frame to CSV without the row index.
df_user_views.to_csv(path_or_buf="clean_user_views.csv", index=False)
