In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import unicodedata

In [2]:
user_info = pd.read_csv("/content/drive/MyDrive/AI Project/user_info_final.csv")

In [2]:
user_log = pd.read_csv("/content/drive/MyDrive/AI Project/user_log.csv")

In [6]:
user_log.action.unique()

array(['click', 'purchase', 'favourite', 'add-to-cart'], dtype=object)

In [3]:
# item, cat, brand, merchant count
user_log_1 = (
    user_log.groupby('user_id', as_index=False)
    .agg(
        item_count=('item_id', 'nunique'),
        cat_count=('cat_id', 'nunique'),
        brand_count=('brand_id', 'nunique'),
        merchant_count=('merchant_id', 'nunique')
    )
)

In [6]:
user_log_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   user_id         424170 non-null  int64
 1   item_count      424170 non-null  int64
 2   cat_count       424170 non-null  int64
 3   brand_count     424170 non-null  int64
 4   merchant_count  424170 non-null  int64
dtypes: int64(5)
memory usage: 16.2 MB


In [4]:
# Action count for each user_id
agg_action_count = (user_log
       .groupby(['user_id', 'action'])
       .size()
       .reset_index(name='count'))

In [9]:
agg_action_count.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1112504 entries, 0 to 1112503
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1112504 non-null  int64 
 1   action   1112504 non-null  object
 2   count    1112504 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 25.5+ MB


In [10]:
agg_action_count.head()

Unnamed: 0,user_id,action,count
0,1,click,27
1,1,purchase,6
2,2,click,47
3,2,favourite,2
4,2,purchase,14


In [5]:
# Pivot table
features_action_count = agg_action_count.pivot_table(
    index=['user_id'],
    columns=['action'],
    values='count',
    fill_value=0
)

# Flatten column names
features_action_count.columns = [
    f"{act}_count"
    for act in features_action_count.columns
]

features_action_count = features_action_count.reset_index()

In [14]:
features_action_count.head()

Unnamed: 0,user_id,add-to-cart_count,click_count,favourite_count,purchase_count
0,1,0.0,27.0,0.0,6.0
1,2,0.0,47.0,2.0,14.0
2,3,0.0,63.0,1.0,4.0
3,4,0.0,49.0,0.0,1.0
4,5,0.0,150.0,10.0,13.0


In [6]:
final_1 = pd.merge(user_log_1, features_action_count, on='user_id', how='inner')
final_1.head()

Unnamed: 0,user_id,item_count,cat_count,brand_count,merchant_count,add-to-cart_count,click_count,favourite_count,purchase_count
0,1,12,6,9,9,0.0,27.0,0.0,6.0
1,2,43,14,15,14,0.0,47.0,2.0,14.0
2,3,45,19,22,23,0.0,63.0,1.0,4.0
3,4,28,13,12,12,0.0,49.0,0.0,1.0
4,5,87,40,59,56,0.0,150.0,10.0,13.0


In [3]:
user_log_2 = user_log[['user_id', 'action', 'datetime']]
user_log_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   int64 
 1   action    object
 2   datetime  object
dtypes: int64(1), object(2)
memory usage: 1.2+ GB


In [4]:
# Change datetime data type
user_log_2['datetime'] = pd.to_datetime(user_log_2['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_log_2['datetime'] = pd.to_datetime(user_log_2['datetime'])


In [8]:
## Adding is_sale_day
# Define Shopee double-day mega sale days for time range above
sale_days = [
    "2024-05-15", "2024-05-25",
    "2024-06-06", "2024-06-15", "2024-06-25",
    "2024-07-07", "2024-07-15", "2024-07-25",
    "2024-08-08", "2024-08-15", "2024-08-25",
    "2024-09-02", "2024-09-09", "2024-09-15", "2024-09-25",
    "2024-10-10", "2024-10-15", "2024-10-20", "2024-10-25",
    "2024-11-11", "2024-11-15", "2024-11-25",
]

sale_days = pd.to_datetime(sale_days)

# Flag whether each row is a sale-day interaction
user_log_2['is_sale_day'] = user_log_2['datetime'].dt.normalize().isin(sale_days)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_log_2['datetime'] = pd.to_datetime(user_log_2['datetime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_log_2['is_sale_day'] = user_log_2['datetime'].dt.normalize().isin(sale_days)


In [10]:
user_log_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 4 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   action       object        
 2   datetime     datetime64[ns]
 3   is_sale_day  bool          
dtypes: bool(1), datetime64[ns](1), int64(1), object(1)
memory usage: 1.3+ GB


In [11]:
# Action count for sale and nosale days

agg_action_count_sale_nosale = (user_log_2
       .groupby(['user_id', 'is_sale_day', 'action'])
       .size()
       .reset_index(name='count'))

# Pivot table: sale_day vs non-sale & action counts
features_action_count_sale_nosale = agg_action_count_sale_nosale.pivot_table(
    index=['user_id'],
    columns=['is_sale_day', 'action'],
    values='count',
    fill_value=0
)

# Flatten column names
features_action_count_sale_nosale.columns = [
    f"{'sale' if sd else 'nosale'}_{act}"
    for sd, act in features_action_count_sale_nosale.columns
]

features_action_count_sale_nosale = features_action_count_sale_nosale.reset_index()

In [12]:
features_action_count_sale_nosale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   user_id             424170 non-null  int64  
 1   nosale_add-to-cart  424170 non-null  float64
 2   nosale_click        424170 non-null  float64
 3   nosale_favourite    424170 non-null  float64
 4   nosale_purchase     424170 non-null  float64
 5   sale_add-to-cart    424170 non-null  float64
 6   sale_click          424170 non-null  float64
 7   sale_favourite      424170 non-null  float64
 8   sale_purchase       424170 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 29.1 MB


In [23]:
features_action_count_sale_nosale.head()

Unnamed: 0,user_id,nosale_add-to-cart,nosale_click,nosale_favourite,nosale_purchase,sale_add-to-cart,sale_click,sale_favourite,sale_purchase
0,1,0.0,14.0,0.0,2.0,0.0,13.0,0.0,4.0
1,2,0.0,45.0,1.0,5.0,0.0,2.0,1.0,9.0
2,3,0.0,55.0,1.0,3.0,0.0,8.0,0.0,1.0
3,4,0.0,38.0,0.0,0.0,0.0,11.0,0.0,1.0
4,5,0.0,116.0,8.0,10.0,0.0,34.0,2.0,3.0


In [13]:
features_action_count_sale_nosale['sale_click_purchase'] = features_action_count_sale_nosale['sale_click'] / (features_action_count_sale_nosale['sale_purchase'] + 1)
features_action_count_sale_nosale['sale_fav_purchase'] = features_action_count_sale_nosale['sale_favourite'] / (features_action_count_sale_nosale['sale_purchase'] + 1)
features_action_count_sale_nosale['sale_add_purchase'] = features_action_count_sale_nosale['sale_add-to-cart'] / (features_action_count_sale_nosale['sale_purchase'] + 1)
features_action_count_sale_nosale['nosale_click_purchase'] = features_action_count_sale_nosale['nosale_click'] / (features_action_count_sale_nosale['nosale_purchase'] + 1)
features_action_count_sale_nosale['nosale_fav_purchase'] = features_action_count_sale_nosale['nosale_favourite'] / (features_action_count_sale_nosale['nosale_purchase'] + 1)
features_action_count_sale_nosale['nosale_add_purchase'] = features_action_count_sale_nosale['nosale_add-to-cart'] / (features_action_count_sale_nosale['nosale_purchase'] + 1)

In [25]:
features_action_count_sale_nosale.head()

Unnamed: 0,user_id,nosale_add-to-cart,nosale_click,nosale_favourite,nosale_purchase,sale_add-to-cart,sale_click,sale_favourite,sale_purchase,sale_click_purchase,sale_fav_purchase,sale_add_purchase,nosale_click_purchase,nosale_fav_purchase,nosale_add_purchase
0,1,0.0,14.0,0.0,2.0,0.0,13.0,0.0,4.0,2.6,0.0,0.0,4.666667,0.0,0.0
1,2,0.0,45.0,1.0,5.0,0.0,2.0,1.0,9.0,0.2,0.1,0.0,7.5,0.166667,0.0
2,3,0.0,55.0,1.0,3.0,0.0,8.0,0.0,1.0,4.0,0.0,0.0,13.75,0.25,0.0
3,4,0.0,38.0,0.0,0.0,0.0,11.0,0.0,1.0,5.5,0.0,0.0,38.0,0.0,0.0
4,5,0.0,116.0,8.0,10.0,0.0,34.0,2.0,3.0,8.5,0.5,0.0,10.545455,0.727273,0.0


In [27]:
features_action_count_sale_nosale.columns

Index(['user_id', 'nosale_add-to-cart', 'nosale_click', 'nosale_favourite',
       'nosale_purchase', 'sale_add-to-cart', 'sale_click', 'sale_favourite',
       'sale_purchase', 'sale_click_purchase', 'sale_fav_purchase',
       'sale_add_purchase', 'nosale_click_purchase', 'nosale_fav_purchase',
       'nosale_add_purchase'],
      dtype='object')

In [14]:
final_2 = pd.merge(final_1,features_action_count_sale_nosale[['user_id', 'sale_click_purchase', 'sale_fav_purchase', 'sale_add_purchase',
                                                             'nosale_click_purchase', 'nosale_fav_purchase', 'nosale_add_purchase']],
                   on='user_id', how='inner')
final_2.head()

Unnamed: 0,user_id,item_count,cat_count,brand_count,merchant_count,add-to-cart_count,click_count,favourite_count,purchase_count,sale_click_purchase,sale_fav_purchase,sale_add_purchase,nosale_click_purchase,nosale_fav_purchase,nosale_add_purchase
0,1,12,6,9,9,0.0,27.0,0.0,6.0,2.6,0.0,0.0,4.666667,0.0,0.0
1,2,43,14,15,14,0.0,47.0,2.0,14.0,0.2,0.1,0.0,7.5,0.166667,0.0
2,3,45,19,22,23,0.0,63.0,1.0,4.0,4.0,0.0,0.0,13.75,0.25,0.0
3,4,28,13,12,12,0.0,49.0,0.0,1.0,5.5,0.0,0.0,38.0,0.0,0.0
4,5,87,40,59,56,0.0,150.0,10.0,13.0,8.5,0.5,0.0,10.545455,0.727273,0.0


In [15]:
final_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                424170 non-null  int64  
 1   item_count             424170 non-null  int64  
 2   cat_count              424170 non-null  int64  
 3   brand_count            424170 non-null  int64  
 4   merchant_count         424170 non-null  int64  
 5   add-to-cart_count      424170 non-null  float64
 6   click_count            424170 non-null  float64
 7   favourite_count        424170 non-null  float64
 8   purchase_count         424170 non-null  float64
 9   sale_click_purchase    424170 non-null  float64
 10  sale_fav_purchase      424170 non-null  float64
 11  sale_add_purchase      424170 non-null  float64
 12  nosale_click_purchase  424170 non-null  float64
 13  nosale_fav_purchase    424170 non-null  float64
 14  nosale_add_purchase    424170 non-nu

In [16]:
# Purchase count in no_sale and sale days

df_purchase = user_log_2[user_log['action'] == 'purchase'].copy().reset_index(drop=True)
df_purchase['month'] = df_purchase['datetime'].dt.to_period('M')
df_purchase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3292144 entries, 0 to 3292143
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   action       object        
 2   datetime     datetime64[ns]
 3   is_sale_day  bool          
 4   month        period[M]     
dtypes: bool(1), datetime64[ns](1), int64(1), object(1), period[M](1)
memory usage: 103.6+ MB


In [18]:
monthly_purchase = df_purchase.groupby(['user_id', 'month', 'is_sale_day']).size().reset_index(name='count')
monthly_purchase.head()

Unnamed: 0,user_id,month,is_sale_day,count
0,1,2024-10,False,2
1,1,2024-11,True,4
2,2,2024-06,False,3
3,2,2024-07,False,1
4,2,2024-08,False,1


In [19]:
# Pivot table: sale_day vs non-sale & action counts
features2 = monthly_purchase.pivot_table(
    index=['user_id'],
    columns=['is_sale_day', 'month'],
    values='count',
    fill_value=0
)

# Flatten column names
features2.columns = [
    f"{'sale' if sd else 'nosale'}_{act}"
    for sd, act in features2.columns
]

features2 = features2.reset_index()
features2.head()

Unnamed: 0,user_id,nosale_2024-05,nosale_2024-06,nosale_2024-07,nosale_2024-08,nosale_2024-09,nosale_2024-10,nosale_2024-11,sale_2024-05,sale_2024-06,sale_2024-07,sale_2024-08,sale_2024-09,sale_2024-10,sale_2024-11
0,1,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1,2,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,7.0
2,3,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,2.0,1.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0


In [20]:
features2['purchase_ratio_06'] = features2['sale_2024-06'] / (features2['nosale_2024-06'] + 1)
features2['purchase_ratio_07'] = features2['sale_2024-07'] / (features2['nosale_2024-07'] + 1)
features2['purchase_ratio_08'] = features2['sale_2024-08'] / (features2['nosale_2024-08'] + 1)
features2['purchase_ratio_09'] = features2['sale_2024-09'] / (features2['nosale_2024-09'] + 1)
features2['purchase_ratio_10'] = features2['sale_2024-10'] / (features2['nosale_2024-10'] + 1)
features2['purchase_ratio_11'] = features2['sale_2024-11'] / (features2['nosale_2024-11'] + 1)
features2['purchase_ratio'] = (features2['purchase_ratio_06']
                               + features2['purchase_ratio_07']
                               + features2['purchase_ratio_08']
                               + features2['purchase_ratio_09']
                               + features2['purchase_ratio_10']
                               + features2['purchase_ratio_11']) / 6

In [36]:
features2.head(10)

Unnamed: 0,user_id,nosale_2024-05,nosale_2024-06,nosale_2024-07,nosale_2024-08,nosale_2024-09,nosale_2024-10,nosale_2024-11,sale_2024-05,sale_2024-06,...,sale_2024-09,sale_2024-10,sale_2024-11,purchase_ratio_06,purchase_ratio_07,purchase_ratio_08,purchase_ratio_09,purchase_ratio_10,purchase_ratio_11,purchase_ratio
0,1,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.666667
1,2,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,7.0,0.0,0.0,0.0,0.0,2.0,7.0,1.5
2,3,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.166667
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.166667
4,5,2.0,1.0,0.0,1.0,2.0,4.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.5
5,6,0.0,3.0,4.0,0.0,5.0,1.0,1.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.166667,0.0,1.0,0.194444
6,7,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,3.5,0.583333
7,8,1.0,2.0,4.0,4.0,0.0,1.0,0.0,0.0,0.0,...,3.0,2.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,1.0
8,9,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.333333
9,10,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.333333


In [21]:
final_3 = pd.merge(final_2, features2[['user_id', 'purchase_ratio']],
                                        on='user_id', how='left')
final_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                424170 non-null  int64  
 1   item_count             424170 non-null  int64  
 2   cat_count              424170 non-null  int64  
 3   brand_count            424170 non-null  int64  
 4   merchant_count         424170 non-null  int64  
 5   add-to-cart_count      424170 non-null  float64
 6   click_count            424170 non-null  float64
 7   favourite_count        424170 non-null  float64
 8   purchase_count         424170 non-null  float64
 9   sale_click_purchase    424170 non-null  float64
 10  sale_fav_purchase      424170 non-null  float64
 11  sale_add_purchase      424170 non-null  float64
 12  nosale_click_purchase  424170 non-null  float64
 13  nosale_fav_purchase    424170 non-null  float64
 14  nosale_add_purchase    424170 non-nu

In [25]:
final_3.to_csv("/content/drive/MyDrive/AI Project/final_3.csv", index=False)

In [22]:
final_3.head()

Unnamed: 0,user_id,item_count,cat_count,brand_count,merchant_count,add-to-cart_count,click_count,favourite_count,purchase_count,sale_click_purchase,sale_fav_purchase,sale_add_purchase,nosale_click_purchase,nosale_fav_purchase,nosale_add_purchase,purchase_ratio
0,1,12,6,9,9,0.0,27.0,0.0,6.0,2.6,0.0,0.0,4.666667,0.0,0.0,0.666667
1,2,43,14,15,14,0.0,47.0,2.0,14.0,0.2,0.1,0.0,7.5,0.166667,0.0,1.5
2,3,45,19,22,23,0.0,63.0,1.0,4.0,4.0,0.0,0.0,13.75,0.25,0.0,0.166667
3,4,28,13,12,12,0.0,49.0,0.0,1.0,5.5,0.0,0.0,38.0,0.0,0.0,0.166667
4,5,87,40,59,56,0.0,150.0,10.0,13.0,8.5,0.5,0.0,10.545455,0.727273,0.0,0.5


In [5]:
user_log_3 = user_log_2.copy()
user_log_3['date'] = user_log_3['datetime'].dt.date
user_log_3['month'] = user_log_3['datetime'].dt.to_period('M')
user_log_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 5 columns):
 #   Column    Dtype         
---  ------    -----         
 0   user_id   int64         
 1   action    object        
 2   datetime  datetime64[ns]
 3   date      object        
 4   month     period[M]     
dtypes: datetime64[ns](1), int64(1), object(2), period[M](1)
memory usage: 2.0+ GB


In [6]:
monthly_stats = user_log_3.groupby(['user_id', 'month']).agg(
    total_actions=('action', 'count'),
    active_days=('date', lambda x: x.nunique())
).reset_index()

monthly_stats['actions_days'] = monthly_stats['total_actions'] / monthly_stats['active_days']
monthly_stats.head()

Unnamed: 0,user_id,month,total_actions,active_days,actions_days
0,1,2024-10,16,4,4.0
1,1,2024-11,17,1,17.0
2,2,2024-05,2,1,2.0
3,2,2024-06,26,1,26.0
4,2,2024-07,1,1,1.0


In [9]:
monthly_stats_avg = (
    monthly_stats.groupby('user_id', as_index=False)
      .agg(
          avg_action_day_ratio=('actions_days', 'mean'),
          avg_active_day=('active_days', 'mean')
      )
)


In [10]:
monthly_stats_avg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               424170 non-null  int64  
 1   avg_action_day_ratio  424170 non-null  float64
 2   avg_active_day        424170 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 9.7 MB


In [11]:
monthly_stats_avg.head()

Unnamed: 0,user_id,avg_action_day_ratio,avg_active_day
0,1,10.5,2.5
1,2,9.194444,1.5
2,3,4.095238,1.857143
3,4,5.566667,2.0
4,5,8.778139,4.285714


In [12]:
final_3 = pd.read_csv("/content/drive/MyDrive/AI Project/final_3.csv")
final_4 = final_3.merge(monthly_stats_avg, on='user_id', how='inner')
final_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                424170 non-null  int64  
 1   item_count             424170 non-null  int64  
 2   cat_count              424170 non-null  int64  
 3   brand_count            424170 non-null  int64  
 4   merchant_count         424170 non-null  int64  
 5   add-to-cart_count      424170 non-null  float64
 6   click_count            424170 non-null  float64
 7   favourite_count        424170 non-null  float64
 8   purchase_count         424170 non-null  float64
 9   sale_click_purchase    424170 non-null  float64
 10  sale_fav_purchase      424170 non-null  float64
 11  sale_add_purchase      424170 non-null  float64
 12  nosale_click_purchase  424170 non-null  float64
 13  nosale_fav_purchase    424170 non-null  float64
 14  nosale_add_purchase    424170 non-nu

In [13]:
final_4.head()

Unnamed: 0,user_id,item_count,cat_count,brand_count,merchant_count,add-to-cart_count,click_count,favourite_count,purchase_count,sale_click_purchase,sale_fav_purchase,sale_add_purchase,nosale_click_purchase,nosale_fav_purchase,nosale_add_purchase,purchase_ratio,avg_action_day_ratio,avg_active_day
0,1,12,6,9,9,0.0,27.0,0.0,6.0,2.6,0.0,0.0,4.666667,0.0,0.0,0.666667,10.5,2.5
1,2,43,14,15,14,0.0,47.0,2.0,14.0,0.2,0.1,0.0,7.5,0.166667,0.0,1.5,9.194444,1.5
2,3,45,19,22,23,0.0,63.0,1.0,4.0,4.0,0.0,0.0,13.75,0.25,0.0,0.166667,4.095238,1.857143
3,4,28,13,12,12,0.0,49.0,0.0,1.0,5.5,0.0,0.0,38.0,0.0,0.0,0.166667,5.566667,2.0
4,5,87,40,59,56,0.0,150.0,10.0,13.0,8.5,0.5,0.0,10.545455,0.727273,0.0,0.5,8.778139,4.285714


In [14]:
final_4.to_csv("/content/drive/MyDrive/AI Project/final_4.csv", index=False)

In [5]:
user_log_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 3 columns):
 #   Column    Dtype         
---  ------    -----         
 0   user_id   int64         
 1   action    object        
 2   datetime  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 1.2+ GB


In [7]:
user_log_4 = user_log_2.copy()
user_log_4['date'] = user_log_4['datetime'].dt.normalize()
user_log_4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 4 columns):
 #   Column    Dtype         
---  ------    -----         
 0   user_id   int64         
 1   action    object        
 2   datetime  datetime64[ns]
 3   date      datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 1.6+ GB


In [8]:
# Sort theo user_id, merchant_id và datetime
user_log_4 = user_log_4.sort_values(['user_id', 'datetime'])

# Tính chênh lệch ngày giữa các hoạt động
user_log_4['activity_gap'] = user_log_4.groupby(['user_id'])['date'].diff().dt.days

global_max_date = user_log_4['date'].max()


In [11]:
user_log_4.head(20)

Unnamed: 0,user_id,action,datetime,date,activity_gap
23288890,1,click,2024-10-09,2024-10-09,
23288891,1,click,2024-10-09,2024-10-09,0.0
23288892,1,click,2024-10-09,2024-10-09,0.0
23288893,1,click,2024-10-09,2024-10-09,0.0
23288894,1,click,2024-10-09,2024-10-09,0.0
23288886,1,click,2024-10-11,2024-10-11,2.0
23288887,1,click,2024-10-11,2024-10-11,0.0
23288888,1,purchase,2024-10-11,2024-10-11,0.0
23288889,1,click,2024-10-11,2024-10-11,0.0
23288863,1,click,2024-10-18,2024-10-18,7.0


In [10]:
user_log_4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54925330 entries, 23288890 to 13710715
Data columns (total 5 columns):
 #   Column        Dtype         
---  ------        -----         
 0   user_id       int64         
 1   action        object        
 2   datetime      datetime64[ns]
 3   date          datetime64[ns]
 4   activity_gap  float64       
dtypes: datetime64[ns](2), float64(1), int64(1), object(1)
memory usage: 2.5+ GB


In [13]:
# 4. Aggregate: lấy max internal gap và last_date cho mỗi cặp
agg = (
    user_log_4.groupby(['user_id'])
       .agg(
           max_internal_gap = ('activity_gap', 'max'),  # có thể NaN nếu chỉ 1 log
           last_date = ('date', 'max')
       )
       .reset_index()
)

# 5. Chuyển NaN max_internal_gap -> 0 (để dễ so sánh)
agg['max_internal_gap'] = agg['max_internal_gap'].fillna(0)

# 6. Tính last_gap (khoảng từ last_date tới global_max_date)
agg['last_gap'] = (global_max_date - agg['last_date']).dt.days

# 7. Lấy max giữa max_internal_gap và last_gap
agg['longest_inactivity_days'] = agg[['max_internal_gap','last_gap']].max(axis=1).astype(int)

# 8. (Tuỳ chọn) giữ các cột cần thiết
longest_gap = agg[['user_id','longest_inactivity_days']]


In [14]:
longest_gap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   user_id                  424170 non-null  int64
 1   longest_inactivity_days  424170 non-null  int64
dtypes: int64(2)
memory usage: 6.5 MB


In [15]:
longest_gap.head()

Unnamed: 0,user_id,longest_inactivity_days
0,1,21
1,2,46
2,3,42
3,4,53
4,5,32


In [16]:
final_4 = pd.read_csv("/content/drive/MyDrive/AI Project/final_4.csv")
final_5 = final_4.merge(longest_gap, on='user_id', how='inner')
final_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  424170 non-null  int64  
 1   item_count               424170 non-null  int64  
 2   cat_count                424170 non-null  int64  
 3   brand_count              424170 non-null  int64  
 4   merchant_count           424170 non-null  int64  
 5   add-to-cart_count        424170 non-null  float64
 6   click_count              424170 non-null  float64
 7   favourite_count          424170 non-null  float64
 8   purchase_count           424170 non-null  float64
 9   sale_click_purchase      424170 non-null  float64
 10  sale_fav_purchase        424170 non-null  float64
 11  sale_add_purchase        424170 non-null  float64
 12  nosale_click_purchase    424170 non-null  float64
 13  nosale_fav_purchase      424170 non-null  float64
 14  nosa

In [17]:
final_5.head()

Unnamed: 0,user_id,item_count,cat_count,brand_count,merchant_count,add-to-cart_count,click_count,favourite_count,purchase_count,sale_click_purchase,sale_fav_purchase,sale_add_purchase,nosale_click_purchase,nosale_fav_purchase,nosale_add_purchase,purchase_ratio,avg_action_day_ratio,avg_active_day,longest_inactivity_days
0,1,12,6,9,9,0.0,27.0,0.0,6.0,2.6,0.0,0.0,4.666667,0.0,0.0,0.666667,10.5,2.5,21
1,2,43,14,15,14,0.0,47.0,2.0,14.0,0.2,0.1,0.0,7.5,0.166667,0.0,1.5,9.194444,1.5,46
2,3,45,19,22,23,0.0,63.0,1.0,4.0,4.0,0.0,0.0,13.75,0.25,0.0,0.166667,4.095238,1.857143,42
3,4,28,13,12,12,0.0,49.0,0.0,1.0,5.5,0.0,0.0,38.0,0.0,0.0,0.166667,5.566667,2.0,53
4,5,87,40,59,56,0.0,150.0,10.0,13.0,8.5,0.5,0.0,10.545455,0.727273,0.0,0.5,8.778139,4.285714,32


In [18]:
# Lọc hành động mua
df_purchase_new = user_log_4[user_log_4['action'] == 'purchase'].copy()

# Sort theo user_id, merchant_id và datetime
df_purchase_new = df_purchase_new.sort_values(['user_id', 'datetime'])

# Tính chênh lệch ngày giữa các hoạt động
df_purchase_new['purchase_gap'] = df_purchase_new.groupby(['user_id'])['date'].diff().dt.days

global_max_date = user_log_4['date'].max()

# 4. Aggregate: lấy max internal gap và last_date cho mỗi cặp
agg = (
    df_purchase_new.groupby(['user_id'])
       .agg(
           max_internal_gap = ('purchase_gap', 'max'),  # có thể NaN nếu chỉ 1 log
           last_date = ('date', 'max')
       )
       .reset_index()
)

# 5. Chuyển NaN max_internal_gap -> 0 (để dễ so sánh)
agg['max_internal_gap'] = agg['max_internal_gap'].fillna(0)

# 6. Tính last_gap (khoảng từ last_date tới global_max_date)
agg['last_gap'] = (global_max_date - agg['last_date']).dt.days

# 7. Lấy max giữa max_internal_gap và last_gap
agg['longest_purchase_gap'] = agg[['max_internal_gap','last_gap']].max(axis=1).astype(int)

# 8. (Tuỳ chọn) giữ các cột cần thiết
longest_purchase_gap = agg[['user_id', 'longest_purchase_gap']]


In [19]:
longest_purchase_gap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 2 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   user_id               424170 non-null  int64
 1   longest_purchase_gap  424170 non-null  int64
dtypes: int64(2)
memory usage: 6.5 MB


In [20]:
longest_purchase_gap.head()

Unnamed: 0,user_id,longest_purchase_gap
0,1,21
1,2,54
2,3,95
3,4,1
4,5,55


In [21]:
final_6 = final_5.merge(longest_purchase_gap, on='user_id', how='inner')
final_6.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  424170 non-null  int64  
 1   item_count               424170 non-null  int64  
 2   cat_count                424170 non-null  int64  
 3   brand_count              424170 non-null  int64  
 4   merchant_count           424170 non-null  int64  
 5   add-to-cart_count        424170 non-null  float64
 6   click_count              424170 non-null  float64
 7   favourite_count          424170 non-null  float64
 8   purchase_count           424170 non-null  float64
 9   sale_click_purchase      424170 non-null  float64
 10  sale_fav_purchase        424170 non-null  float64
 11  sale_add_purchase        424170 non-null  float64
 12  nosale_click_purchase    424170 non-null  float64
 13  nosale_fav_purchase      424170 non-null  float64
 14  nosa

In [22]:
user_info = pd.read_csv("/content/drive/MyDrive/AI Project/user_info_final.csv")

In [23]:
user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422741 entries, 0 to 422740
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         422741 non-null  int64  
 1   age             422741 non-null  float64
 2   sex             422741 non-null  object 
 3   marital_status  422741 non-null  object 
 4   job_industry    422741 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 16.1+ MB


In [24]:
data_for_clustering = final_6.merge(user_info, on='user_id', how='left')

In [25]:
data_for_clustering.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 24 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   user_id                  424170 non-null  int64  
 1   item_count               424170 non-null  int64  
 2   cat_count                424170 non-null  int64  
 3   brand_count              424170 non-null  int64  
 4   merchant_count           424170 non-null  int64  
 5   add-to-cart_count        424170 non-null  float64
 6   click_count              424170 non-null  float64
 7   favourite_count          424170 non-null  float64
 8   purchase_count           424170 non-null  float64
 9   sale_click_purchase      424170 non-null  float64
 10  sale_fav_purchase        424170 non-null  float64
 11  sale_add_purchase        424170 non-null  float64
 12  nosale_click_purchase    424170 non-null  float64
 13  nosale_fav_purchase      424170 non-null  float64
 14  nosa

In [26]:
data_for_clustering.to_csv("/content/drive/MyDrive/AI Project/data_for_clustering.csv", index=False)