In [1]:
import pandas as pd

df = pd.read_csv('../data/csv/data.csv')

print(df.shape)
df.head()

(1254243, 6)


Unnamed: 0,CLNT_RMT_IP,ROI_CLICK_EVENT_TS,BRWSR_NAME,ams_pblshr_id,buyer_id,click_id
0,77.111.247.168,2020-08-02 18:37:01,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,5574672411,valen_949,209245484389035777
1,66.249.73.101,2020-08-21 03:36:38,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,5574672411,morbanisaf,209453736892211586
2,77.111.247.129,2020-08-21 03:36:38,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,5574672411,morbanisaf,209453736892211586
3,66.249.73.101,2020-08-21 03:36:33,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,5574672411,morbanisaf,209453736892211586
4,77.111.247.129,2020-08-21 03:36:33,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,5574672411,morbanisaf,209453736892211586


## Time Differences for Buyers and IPs

In [2]:
df['ROI_CLICK_EVENT_TS'] = pd.to_datetime(df['ROI_CLICK_EVENT_TS'])
# Sort by timestamp to maintain sequence
df = df.sort_values(['buyer_id', 'ROI_CLICK_EVENT_TS'])

# Time gap for each buyer
df['buyer_time_diff'] = df.groupby('buyer_id')['ROI_CLICK_EVENT_TS'].diff().dt.total_seconds().fillna(0)

# Time gap for each IP
df = df.sort_values(['CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS'])
df['ip_time_diff'] = df.groupby('CLNT_RMT_IP')['ROI_CLICK_EVENT_TS'].diff().dt.total_seconds().fillna(0)



In [3]:
df.columns

Index(['CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id',
       'buyer_id', 'click_id', 'buyer_time_diff', 'ip_time_diff'],
      dtype='object')

## Click counts per minute

In [4]:
# Count clicks in 1-minute windows
df['minute'] = df['ROI_CLICK_EVENT_TS'].dt.floor('T')
buyer_clicks_per_minute = df.groupby(['buyer_id', 'minute']).size().reset_index(name='buyer_clicks_per_minute')
# Calculate clicks per minute for each IP
ip_clicks_per_minute = df.groupby(['CLNT_RMT_IP', 'minute']).size().reset_index(name='ip_clicks_per_minute')
# Merge buyer click counts into the main dataframe
df = df.merge(buyer_clicks_per_minute, on=['buyer_id', 'minute'], how='left')
df = df.merge(ip_clicks_per_minute, on=['CLNT_RMT_IP', 'minute'], how='left')

df.head() # Check the new columns 'buyer_clicks_per_minute' and 'ip_clicks_per_minute'


  df['minute'] = df['ROI_CLICK_EVENT_TS'].dt.floor('T')


Unnamed: 0,CLNT_RMT_IP,ROI_CLICK_EVENT_TS,BRWSR_NAME,ams_pblshr_id,buyer_id,click_id,buyer_time_diff,ip_time_diff,minute,buyer_clicks_per_minute,ip_clicks_per_minute
0,101.44.82.14,2023-12-29 19:31:54,Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...,5574735181,cl7645,223334034911676801,0.0,0.0,2023-12-29 19:31:00,2,1
1,101.44.83.169,2024-02-20 06:55:49,Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...,5574735181,cl7645,223925649672767360,4533835.0,0.0,2024-02-20 06:55:00,1,1
2,101.44.83.169,2024-02-20 09:45:50,Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...,5574735181,cl7645,223925649672767360,10201.0,10201.0,2024-02-20 09:45:00,1,1
3,101.44.83.234,2024-05-15 18:59:02,ebayUserAgent/eBayIOS;6.159.0;iOS;17.4.1;Apple...,5575280720,smir5900,224888936095825920,0.0,0.0,2024-05-15 18:59:00,1,1
4,101.44.83.234,2024-05-15 20:01:59,ebayUserAgent/eBayIOS;6.159.0;iOS;17.4.1;Apple...,5575280720,smir5900,224888936095825920,3777.0,3777.0,2024-05-15 20:01:00,1,1


## Aggregating Features

In [5]:
buyer_features = df.groupby('buyer_id').agg({
    'buyer_time_diff': ['mean', 'median', 'std'],
    'buyer_clicks_per_minute': ['mean', 'max', 'std'],
    'ip_time_diff': ['mean', 'median', 'std'],
    'ip_clicks_per_minute': ['mean', 'max', 'std']
}).reset_index()

# Flatten column names and fill NA values
buyer_features.columns = ['_'.join(col).strip() for col in buyer_features.columns]
buyer_features.rename(columns={'buyer_id_': 'buyer_id'}, inplace=True)
buyer_features.fillna(0, inplace=True)

# Step 2: Merge the aggregated buyer features into the main dataframe
df = df.merge(buyer_features, on='buyer_id', how='left')




## IsolationForest

In [6]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# from sklearn.svm import OneClassSVM

# Assuming the data is loaded into a pandas DataFrame called df
# Example: df = pd.read_csv("your_data.csv")

# Step 1: Frequency Encoding (Add new columns for frequency encoding)
def frequency_encode(df, columns):
    for col in columns:
        freq = df[col].value_counts()
        new_col_name = f"{col}_encoded"
        df[new_col_name] = df[col].map(freq)
    return df

# Encoding the categorical columns
categorical_columns = ['CLNT_RMT_IP', 'ams_pblshr_id', 'buyer_id', 'click_id']
df = frequency_encode(df, categorical_columns)
# print(df.columns)
# Step 2: PCA on the specified numeric columns
numeric_columns = ['buyer_clicks_per_minute', 'ip_clicks_per_minute', 'buyer_time_diff_mean',
                   'buyer_time_diff_median', 'buyer_time_diff_std', 'buyer_clicks_per_minute_mean',
                   'buyer_clicks_per_minute_max', 'buyer_clicks_per_minute_std', 'ip_time_diff_mean',
                   'ip_time_diff_median', 'ip_time_diff_std', 'ip_clicks_per_minute_mean',
                   'ip_clicks_per_minute_max', 'ip_clicks_per_minute_std', 'CLNT_RMT_IP_encoded', 'ams_pblshr_id_encoded', 'buyer_id_encoded', 'click_id_encoded']

# Scaling the numeric features before PCA
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df[numeric_columns])

# Apply PCA to reduce dimensionality
pca = PCA(n_components=5)  # Choose the number of components as needed
df_pca = pca.fit_transform(df_scaled)

# Add the PCA components back to the dataframe (optional)
df_pca_df = pd.DataFrame(df_pca, columns=[f'PC{i+1}' for i in range(df_pca.shape[1])])
df = pd.concat([df, df_pca_df], axis=1)

# Step 3: Train Isolation Forest for anomaly detection
# We'll use the PCA components for anomaly detection
X = df_pca_df.values

model = IsolationForest(contamination=0.035)  # contamination is the expected proportion of outliers
model.fit(X)
predictions = model.predict(X)  # 1 for normal, -1 for anomaly

# Step 4: Results
# Add anomaly predictions to the DataFrame
df['Anomaly'] = predictions

# Display detected anomalies
anomalies = df[df['Anomaly'] == -1]
print(f"Detected anomalies:\n{anomalies}")


Detected anomalies:
            CLNT_RMT_IP  ROI_CLICK_EVENT_TS  \
20       102.129.143.11 2023-08-25 02:14:24   
25       102.129.143.11 2024-07-30 15:08:23   
30        102.129.143.4 2022-09-02 14:37:01   
35        102.129.143.4 2023-05-28 04:45:49   
37        102.129.143.4 2024-04-16 01:46:36   
...                 ...                 ...   
1253613   98.159.37.176 2024-02-02 17:58:09   
1253683    98.98.147.50 2023-10-08 13:06:54   
1253804   98.98.148.165 2024-07-17 10:34:33   
1253877    98.98.171.50 2024-10-23 00:41:50   
1254176   98.98.210.136 2024-07-11 02:44:32   

                                                BRWSR_NAME  ams_pblshr_id  \
20       Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...     5575612316   
25       Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...     5574858753   
30       ebayUserAgent/eBayIOS;6.75.0;iOS;15.6.1;Apple;...     5575400987   
35       ebayUserAgent/eBayIOS;6.111.0;iOS;16.4.1;Apple...     5574635388   
37       Mozilla/5.0 (Windows 

## Train the Hidden Markov Model

In [None]:
# from hmmlearn import hmm
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans


# # # Prepare features for HMM
# # X = buyer_features[[  # Ensure this has the correct dimensions
# #     'buyer_time_diff_mean', 'buyer_time_diff_median', 'buyer_time_diff_std',
# #     'buyer_clicks_per_minute_mean', 'buyer_clicks_per_minute_max', 'buyer_clicks_per_minute_std',
# #     'ip_time_diff_mean', 'ip_time_diff_median', 'ip_time_diff_std',
# #     'ip_clicks_per_minute_mean', 'ip_clicks_per_minute_max', 'ip_clicks_per_minute_std'
# # ]].values

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
# pca = PCA(n_components=5)  # Keep 5 principal components
# X_pca = pca.fit_transform(X_scaled)

# kmeans = KMeans(n_clusters=2, random_state=42).fit(X_pca)

# # Train a Gaussian HMM with 2 states
# hmm_model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=200, tol=1e-3, random_state=42)
# hmm_model.means_ = kmeans.cluster_centers_
# hmm_model.fit(X_pca)

# df['hmm_state'] = hmm_model.predict(X_pca)
# df['log_likelihood'] = hmm_model.score_samples(X_pca)  # Log-likelihood of each sample

# # Anomaly detection based on log-likelihood
# threshold = np.percentile(df['log_likelihood'], 8)  # Use lowest 8% log-likelihood values as anomalies
# df['anomaly'] = df['log_likelihood'] < threshold

# # Alternatively, assume state 1 as anomalous
# df['anomaly'] = df['anomaly'] | (df['hmm_state'] == 1)

# # Step 6: Final output (merge with original data and save results)
# final_df = df[[
#     'CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id', 
#     'buyer_id', 'click_id', 'hmm_state', 'log_likelihood', 'anomaly'
# ]]


# # Save the results to a CSV file
# final_df.to_csv('../data/csv/click_fraud_detection_results_time_feeatures.csv', index=False)

# # Display a sample of the final output
# print(final_df.head())


## Anamoly Detection

In [None]:
# from hmmlearn import hmm
# import numpy as np

# # Preparing time gap data for HMM
# time_gaps = df.groupby('buyer_id')['time_diff'].apply(list).values
# flat_gaps = np.concatenate(time_gaps).reshape(-1, 1)

# # Train a 2-component Gaussian HMM
# hmm_model = hmm.GaussianHMM(n_components=2, covariance_type="diag", n_iter=100, random_state=42)
# hmm_model.fit(flat_gaps)

# # Predicting the hidden states
# hidden_states = hmm_model.predict(flat_gaps)
# df['hmm_state'] = np.concatenate([np.repeat(state, len(seq)) for state, seq in zip(hidden_states, time_gaps)])
# print(df.head())
# Predict states and classify anomalies
# buyer_features['hmm_state'] = hmm_model.predict(X_pca)
# buyer_features['log_likelihood'] = hmm_model.score_samples(X_pca)  # Log-likelihood of each sample

# # Apply anomaly detection based on log-likelihood threshold
# threshold = np.percentile(buyer_features['log_likelihood'], 8)  # Use the lowest 8% log-likelihood values as anomalies
# buyer_features['anomaly'] = buyer_features['log_likelihood'] < threshold

# # Alternatively, assume state 1 as anomalous
# buyer_features['anomaly'] = buyer_features['anomaly'] | (buyer_features['hmm_state'] == 1)

# # Merge anomaly predictions back to the main dataframe, ensuring alignment
# df = df.merge(buyer_features[['buyer_id', 'hmm_state', 'log_likelihood', 'anomaly']], on='buyer_id', how='left')

# # Select and reorder columns for the final output
# final_df = df[[
#     'CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id', 
#     'buyer_id', 'click_id', 'hmm_state', 'log_likelihood', 'anomaly'
# ]]

# # Save the results to a CSV file
# final_df.to_csv('../data/csv/click_fraud_detection_results_time_feeatures.csv', index=False)

# # Display a sample of the final output
# print(final_df.head())


In [None]:
# # Predict log likelihood for sequences
# log_likelihood = hmm_model.score_samples(flat_gaps)
# df['log_likelihood'] = log_likelihood

# # Mark anomalies where log likelihood is below a threshold
# threshold = np.percentile(log_likelihood, 10)  # 10% lowest log likelihood as anomalies
# df['anomaly'] = df['log_likelihood'] < threshold
# print(df[df['anomaly']])
# Initialize an empty list for storing log-likelihood values
log_likelihoods = []

# Loop over each buyer's click sequence
# for time_gap_sequence in df.groupby('buyer_id')['time_diff']:
#     log_likelihood = hmm_model.score(np.array(time_gap_sequence[1]).reshape(-1, 1))
#     log_likelihoods.extend([log_likelihood] * len(time_gap_sequence[1]))  # Assign to each row in sequence

# # Assign log-likelihoods back to the DataFrame
# df['log_likelihood'] = log_likelihoods


In [None]:
# Define anomaly threshold
# threshold = np.percentile(df['log_likelihood'], 8)  # Lowest 10% log-likelihood values are anomalies

# # Mark anomalies in the DataFrame
# df['anomaly'] = df['log_likelihood'] < threshold
# # print(df[df['anomaly']].head())
# df['hmm_state'] = hmm_model.predict(X_pca)
# df['log_likelihood'] = hmm_model.score_samples(X_pca)
# df['anomaly'] = df['hmm_state'] == 1  # Assume state 1 indicates anomalies


In [10]:
final_df = df[[
    'CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id', 
    'buyer_id', 'Anomaly'
]]
# Index(['CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id',
#        'buyer_id', 'click_id', 'buyer_time_diff', 'ip_time_diff', 'minute',
#        'buyer_clicks_per_minute', 'ip_clicks_per_minute',
#        'buyer_time_diff_mean', 'buyer_time_diff_median', 'buyer_time_diff_std',
#        'buyer_clicks_per_minute_mean', 'buyer_clicks_per_minute_max',
#        'buyer_clicks_per_minute_std', 'ip_time_diff_mean',
#        'ip_time_diff_median', 'ip_time_diff_std', 'ip_clicks_per_minute_mean',
#        'ip_clicks_per_minute_max', 'ip_clicks_per_minute_std', 'PC1', 'PC2',
#        'PC3', 'PC4', 'PC5', 'Anomaly'],
# Save to CSV
final_df.to_csv('../data/csv/anamoly_click_pattern/click_fraud_detection_results_time_feeatures.csv', index=False)
print(final_df.head())


     CLNT_RMT_IP  ROI_CLICK_EVENT_TS  \
0   101.44.82.14 2023-12-29 19:31:54   
1  101.44.83.169 2024-02-20 06:55:49   
2  101.44.83.169 2024-02-20 09:45:50   
3  101.44.83.234 2024-05-15 18:59:02   
4  101.44.83.234 2024-05-15 20:01:59   

                                          BRWSR_NAME  ams_pblshr_id  buyer_id  \
0  Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...     5574735181    cl7645   
1  Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...     5574735181    cl7645   
2  Mozilla/5.0 (Linux; Android 10; JEF-AN00 Build...     5574735181    cl7645   
3  ebayUserAgent/eBayIOS;6.159.0;iOS;17.4.1;Apple...     5575280720  smir5900   
4  ebayUserAgent/eBayIOS;6.159.0;iOS;17.4.1;Apple...     5575280720  smir5900   

   Anomaly  
0        1  
1        1  
2        1  
3        1  
4        1  


In [8]:


# import matplotlib.pyplot as plt

# # Plotting time gaps and anomalies
# plt.figure(figsize=(12, 6))
# plt.plot(df['ROI_CLICK_EVENT_TS'], df['time_diff'], label='Time Gaps')
# plt.scatter(df[df['anomaly']]['ROI_CLICK_EVENT_TS'], df[df['anomaly']]['time_diff'], color='red', label='Anomalies', zorder=5)
# plt.legend()
# plt.show()
buyers_with_anomaly = df[df['Anomaly'] == -1]

# Print the 'buyer_id' column for those rows
buyer_counts = buyers_with_anomaly['buyer_id'].value_counts()

# Print the buyer IDs along with their occurrences
buyers_gt_100 = buyer_counts[buyer_counts > 100].index

# Create a new DataFrame with those buyer_ids
df_gt_100 = buyers_with_anomaly[buyers_with_anomaly['buyer_id'].isin(buyers_gt_100)]

# Print the new DataFrame
print(df_gt_100)


            CLNT_RMT_IP  ROI_CLICK_EVENT_TS  \
9410     103.231.89.173 2021-06-14 07:11:35   
9411     103.231.89.173 2021-06-14 08:08:18   
9412     103.231.89.173 2021-06-14 21:09:34   
9413     103.231.89.173 2021-06-14 21:33:14   
9581     103.231.89.230 2021-06-17 19:22:03   
...                 ...                 ...   
1153151   77.234.43.170 2022-01-16 12:48:13   
1153152   77.234.43.170 2022-01-16 12:48:16   
1153153   77.234.43.170 2022-01-16 12:48:19   
1153827   77.234.43.171 2020-08-03 12:43:46   
1153952   77.234.43.171 2020-10-16 11:32:21   

                                                BRWSR_NAME  ams_pblshr_id  \
9410     Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...     5574735181   
9411     Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...     5574735181   
9412     Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like M...     5574735181   
9413     Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like M...     5574735181   
9581     Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 l

In [14]:
# df_gt_100.shape
df_gt_100 = df_gt_100[[
    'CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id', 
    'buyer_id', 'click_id', 'buyer_time_diff', 'ip_time_diff', 'Anomaly', 
    'buyer_clicks_per_minute', 'ip_clicks_per_minute'
]]
df_gt_100.to_csv('../data/csv/anamoly_click_pattern/df_gt_100_click_fraud_detection_results_time_feeatures.csv', index=False)
# print(final_df.head())


In [13]:
# anomalous_rows = df[df['anomaly']]

# # Show the first few anomalous rows
# print(anomalous_rows.head())
df_gt_100.columns

Index(['CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS', 'BRWSR_NAME', 'ams_pblshr_id',
       'buyer_id', 'click_id', 'buyer_time_diff', 'ip_time_diff', 'minute',
       'buyer_clicks_per_minute', 'ip_clicks_per_minute',
       'buyer_time_diff_mean', 'buyer_time_diff_median', 'buyer_time_diff_std',
       'buyer_clicks_per_minute_mean', 'buyer_clicks_per_minute_max',
       'buyer_clicks_per_minute_std', 'ip_time_diff_mean',
       'ip_time_diff_median', 'ip_time_diff_std', 'ip_clicks_per_minute_mean',
       'ip_clicks_per_minute_max', 'ip_clicks_per_minute_std',
       'CLNT_RMT_IP_encoded', 'ams_pblshr_id_encoded', 'buyer_id_encoded',
       'click_id_encoded', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'Anomaly'],
      dtype='object')

In [None]:
# anomalous_rows.shape

(36505, 11)

In [None]:
# non_anomalous_rows = df[~df['anomaly']]

# # Show the first few anomalous rows
# print(non_anomalous_rows.head())


            CLNT_RMT_IP  ROI_CLICK_EVENT_TS  \
481168  172.225.184.187 2022-09-28 09:00:05   
957065     172.225.6.64 2024-09-19 10:20:24   
586981  195.181.174.227 2022-05-22 15:57:24   
586982  195.181.174.227 2022-05-22 16:02:30   
586983  195.181.174.227 2022-05-22 16:26:13   

                                               BRWSR_NAME  ams_pblshr_id  \
481168  Mozilla/5.0 (iPhone; CPU iPhone OS 16_0_2 like...     5575403800   
957065  Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like M...     5574635388   
586981  ebayUserAgent/eBayIOS;6.60.0;iOS;15.4.1;Apple;...     5575532731   
586982  ebayUserAgent/eBayIOS;6.60.0;iOS;15.4.1;Apple;...     5575532731   
586983  ebayUserAgent/eBayIOS;6.60.0;iOS;15.4.1;Apple;...     5575532731   

             buyer_id            click_id  time_diff              minute  \
481168     !!!eule!!!  218153671260927361        0.0 2022-09-28 09:00:00   
957065  !!!scheela!!!  226330661662892416        0.0 2024-09-19 10:20:00   
586981        !!mat!!  216694068

In [None]:
# non_anomalous_rows.shape

(1217738, 11)

In [None]:
# non_anomalous_rows['buyer_id'].value_counts()

buyer_id
alphaseeker    3224
yangbangban    2805
social_lite    2794
kennyb-0       2657
lostandlost    2360
               ... 
zzr600d3          1
zzr611            1
zzrd411           1
zzrwolfi          1
zzseller          1
Name: count, Length: 465394, dtype: int64

In [24]:
anomalous_rows['buyer_id'].value_counts()

buyer_id
ndeals80          8220
timhu_11          7102
ndeals60          6489
supenko2          5622
ndeals90          4889
                  ... 
keepitcrazy         94
stit6509            94
headbodyhead        94
wentwelltheday      94
harham_8189         93
Name: count, Length: 416, dtype: int64

In [37]:
df.to_csv('../data/csv/anamoly_click_pattern/anamoly_data.csv')

In [27]:
df.shape

(1254243, 11)

## Aggregate Features

In [None]:
# df['ROI_CLICK_EVENT_TS'] = pd.to_datetime(df['ROI_CLICK_EVENT_TS'])
# df = df.sort_values(['CLNT_RMT_IP', 'ROI_CLICK_EVENT_TS'])
# df['time_diff_ip'] = df.groupby('CLNT_RMT_IP')['ROI_CLICK_EVENT_TS'].diff().dt.total_seconds().fillna(0)
# Aggregated features for each buyer
# Aggregated features for each buyer
# buyer_features = df.groupby('buyer_id').agg({
#     'buyer_time_diff': ['mean', 'median', 'std'],
#     'buyer_clicks_per_minute': ['mean', 'max', 'std'],
#     'ip_time_diff': ['mean', 'median', 'std'],
#     'ip_clicks_per_minute': ['mean', 'max', 'std']
# }).reset_index()

# # Flatten multi-level columns
# buyer_features.columns = ['_'.join(col).strip() for col in buyer_features.columns]
# buyer_features = buyer_features.rename(columns={'buyer_id_': 'buyer_id'})

# print(np.any(np.isnan(X_scaled)), np.any(np.isinf(X_scaled)))

False False


In [None]:
# print(df.describe(percentiles=[0.01, 0.99]))  # Check for anomalies in data distribution



                  ROI_CLICK_EVENT_TS  ams_pblshr_id      click_id  \
count                        1254243   1.254243e+06  1.254243e+06   
mean   2023-04-29 04:45:47.277674240   5.575137e+09  2.205623e+17   
min              2020-07-02 10:34:33   5.574628e+09  2.088909e+17   
1%        2020-09-02 14:29:29.260000   5.574631e+09  2.095939e+17   
50%              2023-09-10 07:22:23   5.575134e+09  2.220814e+17   
99%       2024-11-04 10:59:55.480000   5.575751e+09  2.268510e+17   
max              2024-11-11 23:59:57   5.575799e+09  2.269373e+17   
std                              NaN   3.820568e+05  4.971967e+15   

       buyer_time_diff  ip_time_diff                         minute  \
count     1.254243e+06  1.254243e+06                        1254243   
mean      2.353903e+06  3.723367e+05  2023-04-29 04:45:17.700445696   
min       0.000000e+00  0.000000e+00            2020-07-02 10:34:00   
1%        0.000000e+00  2.000000e+00     2020-09-02 14:29:15.600000   
50%       1.740000e+02 