# Fresh Idea
## separate one/zero activity of domains
- replace zeros by minus one
- calculate the class activity in 3-hour bins for each domain
- calculate the user activity with a Gaussian window around the center of each 3-hour bin
- calculate the likelihood of the person being a 1/-1 in that time bin
- add user general metrics including domain cls activity and usage patterns

### questions
- how to take into account times when the person used a website while others didn't?
- how to assign a likelihood when the person didn't show any nearby activity?
- what if they used a similar but more niche website at the same time?
- how to average the bins, weighted by the significance of each bin?
- how to weight by the number of users entering, i.e. the probability of 1 together with a confidence?
- what about sparse websites?
- how to keep times with no activity from taking a lot of weight?
### enhancements
- create graph embedding of urls
- for each bin, calculate the metric per url
- instead of only looking at the specific website, take into account websites with similar usage,
  for example by looking at the same domain_cls usage in a Gaussian window around the bin, or by looking at domain embeddings and weighting the activity in similar embeddings by their distance in the embedding space

In [1]:
import os
from getpass import getpass

# SECURITY: never hard-code the Neptune API token in the notebook. The token
# that used to be embedded here was stored in plain text and must be treated
# as compromised -- revoke it and issue a new one.
# (A `!export ...` shell magic runs in a separate subshell, so it would not
# have changed this kernel's environment anyway.)
# Supply the token through the environment before starting the kernel; when it
# is missing, ask for it interactively so it never ends up in the file.
if not os.environ.get("NEPTUNE_API_TOKEN"):
    os.environ["NEPTUNE_API_TOKEN"] = getpass("Neptune API token: ")

In [2]:
# !export MODIN_CPUS=2
import neptune
import os

# Start the Neptune run used for all logging in this notebook. The API token
# is read from the NEPTUNE_API_TOKEN environment variable, and the flags below
# ask Neptune to capture stdout, stderr and hardware metrics.
run = neptune.init_run(
    project="tom.touati/web-segmentation",  # replace with your project
    api_token=os.environ["NEPTUNE_API_TOKEN"],
    # name="Activity-Based Features",
    capture_stdout=True,
    capture_stderr=True,
    capture_hardware_metrics=True,
    tags=["time-based-models", "activity-based-features"],
    description="User activity patterns analysis"
)
import neptune

# Initialize Neptune run

# Log parameters
# Settings shared by the two activity time-series configurations; each entry
# below only overrides the number of days used on each side of a bin.
_timeseries_defaults = {
    "bin_hours": 6,
    "gaussian_filter": True,
    "n_days_each_side": 1,
    "std": 1.5,
    "drop_na": True,
    "drop_zeros": False,
}

# Experiment configuration, grouped by pipeline stage.
params = {
    "training_data": {
        "min_domain_devices": 10,
        "n_devices_hist": True,
        "test_size": 0.2,
        "random_state": 42,
    },
    "user_activity_timeseries": {**_timeseries_defaults, "n_days_each_side": 1},
    "domain_activity_timeseries": {**_timeseries_defaults, "n_days_each_side": 7},
    "general_user_time_bin": {"should_run": True, "bin_hours": 3},
    "feature_selection": {"n_features": 1000, "step": 20000},
    "model": {
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "learning_rate": 0.1,
        "n_estimators": 150,
        "max_depth": 6,
    },
}

# Store the whole configuration dict on the run under the "parameters" field.
run["parameters"] = params

# Log notebook
# The file is referenced by a relative path, so this presumably requires the
# kernel's working directory to contain time_based_model.ipynb -- TODO confirm.
run["notebook"].upload("time_based_model.ipynb")

# Log metrics


[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/tom.touati/web-segmentation/e/WEB-2




In [3]:
import os
import sqlite3
# %matplotlib widget
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
# os.environ["MODIN_CPUS"] = "2"
# 5 * 1024**3 bytes = 5 GiB. Presumably read by Ray as its object-store memory
# limit when the local instance starts -- TODO confirm against the Ray docs.
os.environ["RAY_OBJECT_STORE_MEMORY"] = str(5 * (1024 ** 3))
from modin.config import NPartitions
# Set Modin's NPartitions configuration value to 12 before any data is loaded.
NPartitions.put(12)
from modin.db_conn import ModinDatabaseConnection

import ray


# Set a higher number of partitions to reduce memory per partition
# ctx = ray.init()#object_store_memory=24000000000)#,include_dashboard=True)  # Object store memory (~25GB)
#                 redis_max_memory=10000000000)  # Redis memory (~10GB)


import modin.pandas as mpd
# print(ctx.dashboard_url)
# IPython magics: reload edited modules automatically before running cells.
%load_ext autoreload
%autoreload 2
# NOTE(review): MODIN_ENGINE is set only after modin.config / modin.pandas were
# imported above; confirm Modin still picks this value up at that point.
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# ray.init()
# NPartitions.put(16)
def load_data_from_db(con, n_devices=1000):
    """Load rows of the ``data`` table into a Modin DataFrame.

    Rows whose ``Domain_Name`` equals 1732927 are always excluded.
    NOTE(review): the meaning of that id is not documented here -- confirm.

    Args:
        con: Connection object accepted by ``mpd.read_sql``.
        n_devices: Maximum number of distinct ``Device_ID`` values to load.
            The subset is whatever ``SELECT DISTINCT ... LIMIT`` yields, so it
            is NOT a random sample. Pass ``None`` to load every device.

    Returns:
        DataFrame indexed by the parsed ``Datetime`` column, after a call to
        the frame's ``_repartition()``.

    Raises:
        ValueError: If ``n_devices`` is neither ``None`` nor a positive integer.
        Exception: Any error raised while reading is printed and re-raised.
    """
    if n_devices is None:
        query = """SELECT * from data
        WHERE Domain_Name != 1732927 """
    else:
        # Coerce to int before interpolating so only a number can reach the SQL.
        n_devices = int(n_devices)
        if n_devices <= 0:
            raise ValueError(f"n_devices must be a positive integer or None, got {n_devices}")
        query = f"""
        WITH selected_devices AS (
            SELECT DISTINCT Device_ID 
            FROM data 
            LIMIT {n_devices}
        )
        SELECT * 
        FROM data 
        WHERE Device_ID IN (SELECT Device_ID FROM selected_devices)
        AND Domain_Name != 1732927
        """
    try:
        # NOTE(review): _repartition() is a private Modin method; confirm it is
        # still available in the installed Modin version.
        df = mpd.read_sql(query, con, index_col='Datetime', parse_dates=['Datetime']
                          )._repartition()
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise


# Add this line

In [4]:
# load data from db
# freeze_support comes from multiprocessing (imported in an earlier cell).
freeze_support()
dbfile = '../train_data_for_competition/training_set.db'

# Describe the SQLite file as a SQLAlchemy URL for Modin's connection wrapper.
conn = ModinDatabaseConnection('sqlalchemy', f'sqlite:///{dbfile}')

# Can use get_connection to get underlying sqlalchemy engine
# NOTE(review): the return value is discarded, so this call looks
# unnecessary -- confirm it has no required side effect before removing it.
conn.get_connection()
db_df = load_data_from_db(conn)
print(db_df.head())
# Drop the notebook-level reference to the connection wrapper.
del conn  

2025-03-18 16:16:56,966	INFO worker.py:1841 -- Started a local Ray instance.
[36m(raylet)[0m Spilled 2149 MiB, 18 objects, write throughput 92 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[36m(raylet)[0m Spilled 8152 MiB, 51 objects, write throughput 235 MiB/s.
[36m(raylet)[0m Spilled 8271 MiB, 52 objects, write throughput 231 MiB/s.
[36m(raylet)[0m Spilled 20074 MiB, 144 objects, write throughput 348 MiB/s.
[36m(raylet)[0m Spilled 33939 MiB, 207 objects, write throughput 282 MiB/s.
[36m(raylet)[0m Spilled 66345 MiB, 451 objects, write throughput 266 MiB/s.


KeyboardInterrupt: 

In [16]:
# prepare training data
import matplotlib.pyplot as plt
#train test split
from sklearn.model_selection import train_test_split
def get_train_test_devices(device_target_df, test_size=0.2, random_state=42):
    """Split device ids into train and test subsets, stratified on ``Target``.

    Args:
        device_target_df: Frame with ``Device_ID`` and ``Target`` columns.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_device_ids, test_device_ids)``.
    """
    device_ids = device_target_df['Device_ID']
    labels = device_target_df['Target']
    # Stratifying on the labels keeps the class balance equal in both subsets.
    split = train_test_split(
        device_ids,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    train_device_ids, test_device_ids = split
    return train_device_ids, test_device_ids

def get_initial_train_data(db_df, test_size=0.2, random_state=42, min_domain_devices=10,n_devices_hist=False):
    """Split devices into train/test and keep only well-covered domains.

    Args:
        db_df: Frame with ``Device_ID``, ``Target`` and ``Domain_Name`` columns.
        test_size: Forwarded to ``get_train_test_devices``.
        random_state: Forwarded to ``get_train_test_devices``.
        min_domain_devices: A domain is kept only when strictly more than this
            many distinct training devices visited it.
        n_devices_hist: If true, plot a histogram of devices-per-domain for the
            kept domains, upload it to the global Neptune ``run`` and show it.

    Returns:
        Tuple ``(train_df, train_devices, test_device_ids, device_targets)``
        where ``train_df`` holds the training devices' rows restricted to the
        kept domains and ``device_targets`` has one ``Target`` per device.
    """
    # One label per device: the first Target value seen for it.
    device_targets = db_df.groupby("Device_ID")["Target"].first().reset_index()
    train_devices, test_device_ids = get_train_test_devices(device_targets,test_size=test_size,random_state=random_state)
    train_df = db_df[db_df["Device_ID"].isin(train_devices)]
    # Domain coverage is measured on training devices only.
    devices_per_domain = train_df.groupby("Domain_Name")["Device_ID"].nunique()

    domain_mask = devices_per_domain>min_domain_devices
    print(f"Percentage of domains with more than {min_domain_devices} devices: {domain_mask.mean()*100:.2f}%")
    devices_per_domain = devices_per_domain[domain_mask]
    if n_devices_hist:
        # devices_per_domain is a Series indexed by Domain_Name, so it is
        # plotted directly; indexing it with "Device_ID" would be a label
        # lookup that fails.
        hist = devices_per_domain.hist()
        run["plots/domain_devices_hist"].upload(neptune.types.File.as_image(hist.figure))
        plt.show()
    train_df = train_df[train_df["Domain_Name"].isin(devices_per_domain.index)]
    return train_df,train_devices,test_device_ids, device_targets


In [None]:
# Split devices into train/test and filter domains, passing the
# "training_data" section of params as keyword arguments.
train_df,train_devices,test_device_ids, device_targets = get_initial_train_data(db_df,**params["training_data"])


In [18]:
# preprocess timeseries
from functools import partial
def process_activity_timeseries(domain_df,bin_hours=6,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False):
    """Count distinct devices per time bin and optionally smooth the counts.

    Args:
        domain_df: Frame with a datetime index and a ``Device_ID`` column.
        bin_hours: Width of each resampling bin, in hours.
        gaussian_filter: If true, replace the counts with a centered,
            Gaussian-weighted rolling mean.
        n_days_each_side: Used to size the rolling window.
        std: Passed as ``std`` to the rolling mean.
        drop_na: If true, drop NaN entries.
        drop_zeros: If true, drop entries equal to zero (checked before the
            final rounding).

    Returns:
        Series named ``"Activity"``, rounded and cast to ``int``.
    """
    binned = domain_df["Device_ID"].resample(f'{str(bin_hours)}h').nunique()
    # Window length counted in bins: (24 / bin_hours) bins per day, times the
    # number of days per side, times two sides.
    window_bins = int(n_days_each_side*24/bin_hours*2)
    if gaussian_filter:
        smoother = binned.rolling(
            window=window_bins,
            win_type='gaussian',
            center=True,
            min_periods=1,
            closed="both",
        )
        binned = smoother.mean(std=std)
    if drop_na:
        binned = binned.dropna()
    if drop_zeros:
        binned = binned[binned != 0]
    return binned.rename("Activity").round().astype(int)

def get_domain_activity_timeseries(train_df,domain_ts_kwargs):
    """Build per-domain activity features with one column set per target value.

    Args:
        train_df: Frame that can be grouped by ``Domain_Name`` and ``Target``
            and whose groups are valid input for ``process_activity_timeseries``.
        domain_ts_kwargs: Keyword arguments bound onto
            ``process_activity_timeseries`` for every group.

    Returns:
        Frame indexed by (``Datetime``, ``Domain_Name``) whose columns are named
        ``<value>_<target>``; entries missing after the pivot are filled with 0.
    """
    # Bind the configured keyword arguments once; groupby.apply calls the
    # resulting callable for every (Domain_Name, Target) group.
    process_domain_timeseries = partial(process_activity_timeseries,**domain_ts_kwargs)
    # functools.partial objects have no __name__, so one is copied over
    # (presumably because groupby.apply / Modin looks it up -- TODO confirm).
    process_domain_timeseries.__name__ =process_activity_timeseries.__name__
    domain_timeseries = train_df.groupby(["Domain_Name","Target"]).apply(process_domain_timeseries)
    
    # NOTE(review): the code below adds a column and selects [["Activity"]], so
    # it assumes the apply result is a DataFrame with an "Activity" column
    # rather than a Series -- confirm what groupby.apply returns here.
    domain_fraction_ts = domain_timeseries
    # Only removes the old name; the object stays alive via domain_fraction_ts.
    del domain_timeseries
    # Each value divided by the sum of its (Domain_Name, Target) group.
    domain_fraction_ts["activity_fraction"] = domain_fraction_ts.groupby(["Domain_Name", "Target"]).transform(lambda x: x/x.sum())
    # Add the sum of activity as a new column
    domain_activity = domain_fraction_ts.groupby(["Domain_Name", "Target"])[["Activity"]].sum()

    domain_activity = domain_activity.rename(columns={"Activity": "target_domain_activity"})
    # Merge the results
    domain_fraction_ts = domain_fraction_ts.merge(domain_activity, left_index=True, right_index=True)
    # Reset index to get Target as a column, then pivot to get Target as columns
    pivot_fraction_ts = domain_fraction_ts.reset_index().pivot(
        index=['Datetime', 'Domain_Name'],
        columns='Target'
    ).fillna(0)
    # Flatten the two-level (value, target) column labels into "value_target".
    pivot_fraction_ts.columns = [f'{col[0]}_{col[1]}' for col in pivot_fraction_ts.columns]
    return pivot_fraction_ts
def get_user_activity_timeseries(db_df,user_ts_kwargs):
    """Compute an activity time series for every (device, domain) pair.

    Args:
        db_df: Frame that can be grouped by ``Device_ID`` and ``Domain_Name``
            and whose groups are valid input for ``process_activity_timeseries``.
        user_ts_kwargs: Keyword arguments applied to
            ``process_activity_timeseries`` for every group.

    Returns:
        The ``groupby.apply`` result with its first two index levels swapped,
        so ``Domain_Name`` comes before ``Device_ID``.
    """
    process_user_timeseries = partial(process_activity_timeseries,**user_ts_kwargs)
    # functools.partial objects have no __name__, so one is copied over
    # (presumably because groupby.apply / Modin looks it up -- TODO confirm).
    process_user_timeseries.__name__ =process_activity_timeseries.__name__
    # Apply the configured partial, not the bare function, so the settings in
    # user_ts_kwargs actually take effect.
    user_timeseries = db_df.groupby(["Device_ID","Domain_Name"]).apply(process_user_timeseries).swaplevel(0,1)
    return user_timeseries

In [19]:

# Both time series are built from train_df, each with its own section of params.
domain_activity_timeseries = get_domain_activity_timeseries(
    train_df, params["domain_activity_timeseries"])
user_activity_timeseries = get_user_activity_timeseries(
    train_df, params["user_activity_timeseries"])



In [None]:
# from scipy import stats
import numpy as np

def class_probability_score(active, p_active_given_a, p_active_given_b, prior_a=0.5, total_users=100):
    """Posterior probability of class A given an activity observation.

    Applies Bayes' rule element-wise:
    ``P(A | obs) = P(obs | A) * P(A) / (P(obs | A) * P(A) + P(obs | B) * P(B))``
    with ``P(B) = 1 - P(A)``.

    Args:
        active: Truthy where the user was active, falsy otherwise.
        p_active_given_a: Probability of activity given class A (0).
        p_active_given_b: Probability of activity given class B (1).
        prior_a: Prior probability of class A.
        total_users: Accepted but not used in the computation.

    Returns:
        Posterior probability of class A.

    NOTE(review): the denominator is not guarded against zero.
    """
    # Likelihood of the observation under each class: p where active, else 1 - p.
    obs_given_a = np.where(active, p_active_given_a, 1 - p_active_given_a)
    obs_given_b = np.where(active, p_active_given_b, 1 - p_active_given_b)

    prior_b = 1 - prior_a
    joint_a = obs_given_a * prior_a
    total_evidence = joint_a + obs_given_b * prior_b
    return joint_a / total_evidence

def get_user_domain_scores(domain_activity_timeseries,user_activity_timeseries):
    """Score every (device, domain) pair and return a device-by-domain matrix.

    Args:
        domain_activity_timeseries: Frame exposing ``Domain_Name`` and
            ``Datetime`` plus the columns ``Activity_0``, ``Activity_1``,
            ``activity_fraction_0``, ``activity_fraction_1``,
            ``target_domain_activity_0`` and ``target_domain_activity_1``.
        user_activity_timeseries: Per-user activity whose reset index provides
            ``Domain_Name``, ``Datetime``, ``Device_ID`` and ``Activity``.

    Returns:
        Frame indexed by ``Device_ID`` with one column per domain; missing
        pairs are filled with 0 and the whole matrix is then min-max scaled
        to [-1, 1].
    """
    user_rows = user_activity_timeseries.reset_index()
    # Left merge: every domain bin is kept, with user rows attached where they exist.
    merged = domain_activity_timeseries.merge(
        user_rows, how="left", on=["Domain_Name", "Datetime"]
    )
    merged = merged.set_index(["Datetime", "Domain_Name", "Device_ID"])

    # Activity of both targets in the bin, and over the whole domain.
    merged["bin_activity"] = merged["Activity_0"] + merged["Activity_1"]
    merged["total_activity"] = (
        merged["target_domain_activity_0"] + merged["target_domain_activity_1"]
    )
    # Share of the domain's total activity that belongs to target 0; used as the prior.
    merged["relative_0_activity"] = merged["target_domain_activity_0"] / merged["total_activity"]
    merged["score"] = class_probability_score(
        merged["Activity"],
        merged["activity_fraction_0"],
        merged["activity_fraction_1"],
        prior_a=merged["relative_0_activity"],
        total_users=merged["bin_activity"],
    )
    # Weight each bin's score by the activity observed in that bin.
    merged["weighted_score"] = merged["score"] * merged["bin_activity"]

    pair_scores = merged.groupby(["Device_ID","Domain_Name"])["weighted_score"].mean()
    score_matrix = pair_scores.to_frame().reset_index().pivot(index="Device_ID",columns="Domain_Name").fillna(0)

    # Global min-max scaling of the whole matrix to [-1, 1].
    lowest = score_matrix.values.min()
    highest = score_matrix.values.max()
    score_matrix = (score_matrix - lowest) / (highest - lowest) * 2 - 1
    return score_matrix

In [None]:
final_scores_pivot = get_user_domain_scores(domain_activity_timeseries,user_activity_timeseries)

In [21]:
# baseline features
def get_active_days_per_user(user_domain_ts):
    """
    Calculate the number of unique days each user had any activity,
    min-max scaled to [-1, 1] across users.

    Args:
        user_domain_ts: MultiIndex Series with levels [Domain_Name, Device_ID, Datetime]
            whose values end up in an ``Activity`` column after ``reset_index``.

    Returns:
        Series named "Active_Days" indexed by Device_ID. Only devices with at
        least one row where Activity > 0 appear. When every device has the
        same number of active days (including a single device), all values
        are 0.0 instead of NaN.
    """
    # Reset index to get Datetime as a column
    records = user_domain_ts.reset_index()

    # Convert Datetime to date (removing time component)
    records['Date'] = records['Datetime'].dt.date

    # Group by Device_ID and count unique dates where Activity > 0
    active_days = records[records['Activity'] > 0].groupby('Device_ID')['Date'].nunique()

    active_days = active_days.astype(int)
    active_days.name = "Active_Days"

    lowest = active_days.min()
    span = active_days.max() - lowest
    if span == 0:
        # Constant counts carry no ranking information; 0.0 is the midpoint of
        # the [-1, 1] range and avoids a 0/0 division.
        return active_days.astype(float) * 0.0
    return (active_days - lowest) / span * 2 - 1


def get_activity_per_time_bin(df, bin_hours=3):
    """Fraction of each device's rows that fall into each time-of-day bin.

    Args:
        df: Frame with a datetime index and a ``Device_ID`` column. It is not
            modified.
        bin_hours: Width of a time-of-day bin in hours; the bin id is
            ``hour // bin_hours``.

    Returns:
        Frame indexed by ``Device_ID`` with one ``time_<bin>`` column per
        observed bin. Fractions are min-max scaled to [-1, 1] over the whole
        table; bins a device never used are filled with 0 afterwards.
    """
    # Work on a copy of the one needed column, and take the hour from the
    # frame that was passed in rather than from a notebook global.
    events = df[["Device_ID"]].copy()
    events["time"] = events.index.hour // bin_hours

    # Rows per (device, bin), then each device's counts divided by its total.
    counts = events.groupby(["Device_ID", "time"]).size()
    fractions = counts / counts.groupby(level="Device_ID").transform("sum")

    wide = fractions.rename("activity_fraction").reset_index().pivot(
        index="Device_ID", columns="time", values="activity_fraction"
    )
    wide.columns = [f"time_{col}" for col in wide.columns]

    # Global min-max scaling; min()/max() skip the NaNs of unused bins.
    lowest = wide.min().min()
    highest = wide.max().max()
    wide = (wide - lowest) / (highest - lowest) * 2 - 1
    return wide.fillna(0)

In [None]:
active_days = get_active_days_per_user(user_activity_timeseries)
# The per-user time series is no longer needed; drop the reference.
del user_activity_timeseries
# Apply the logged "general_user_time_bin" settings instead of relying on the
# function default, so the recorded parameters describe what actually ran.
time_bin_cfg = params["general_user_time_bin"]
if time_bin_cfg["should_run"]:
    activity_per_time_range = get_activity_per_time_bin(db_df, bin_hours=time_bin_cfg["bin_hours"])
else:
    # The feature-assembly cell skips this block when it is None.
    activity_per_time_range = None


In [33]:
# Start from the per-device targets and attach the domain score matrix.
final_features = device_targets.set_index("Device_ID").join(final_scores_pivot)
# Optional feature blocks are joined only when they are not None.
for extra_features in (active_days, activity_per_time_range):
    if extra_features is not None:
        final_features = final_features.join(extra_features)
# Convert every column label to a plain string.
final_features.columns = list(map(str, final_features.columns))

In [None]:
import xgboost
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
def prepare_model_data(final_features,train_devices,test_device_ids):
    """Split the feature table into train/test matrices and label vectors.

    Args:
        final_features: Frame indexed by device id with a ``Target`` column
            plus feature columns.
        train_devices: Device ids that belong to the training set.
        test_device_ids: Device ids that belong to the test set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. Neither feature matrix
        contains the ``Target`` column.
    """
    train_rows = final_features[final_features.index.isin(train_devices)]
    test_rows = final_features[final_features.index.isin(test_device_ids)]

    X_train = train_rows.drop('Target', axis=1)
    y_train = train_rows['Target']

    # Drop the label from the test features as well, so both matrices have the
    # same columns and the label cannot leak into prediction inputs.
    X_test = test_rows.drop('Target', axis=1)
    y_test = test_rows['Target']
    return X_train,y_train,X_test,y_test

def train_model(X_train,y_train,X_test,y_test,params,n_features_to_select=1000,step=20000):
    """Fit an XGBoost model inside RFE and report the test ROC AUC.

    Args:
        X_train: Training feature frame.
        y_train: Training labels.
        X_test: Test feature frame; only the selected columns are used.
        y_test: Test labels.
        params: Mapping of ``XGBRegressor`` keyword arguments. Entries override
            the defaults below; ``None`` or an empty mapping keeps them all.
        n_features_to_select: Passed to ``RFE``.
        step: Passed to ``RFE``.

    Returns:
        Tuple ``(test_auc, selector, best_features)``: the AUC rounded to three
        decimals, the fitted ``RFE`` object and the selected column names.
    """
    # Defaults equal the previously hard-coded values, so passing the existing
    # "model" section of the notebook params yields the same estimator.
    model_kwargs = {
        "random_state": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "learning_rate": 0.1,
        "n_estimators": 150,
        "max_depth": 6,
        "objective": 'binary:logistic',
        "eval_metric": roc_auc_score,
    }
    model_kwargs.update(params or {})
    xgb_reg = xgboost.XGBRegressor(**model_kwargs)

    selector = RFE(xgb_reg, n_features_to_select=n_features_to_select, step=step)
    selector = selector.fit(X_train, y_train)
    best_features = list(X_train.columns[selector.support_])
    # selector.estimator_ was fitted on the selected columns only.
    test_prediction = selector.estimator_.predict(X_test[best_features])
    test_auc =round(roc_auc_score(y_test,test_prediction), 3)
    return test_auc,selector, best_features


In [None]:
# Build the train/test matrices from the assembled feature table.
X_train,y_train,X_test,y_test = prepare_model_data(final_features,train_devices,test_device_ids)
# Pass the "model" section of params to train_model.
score,selector,best_features = train_model(X_train,y_train,X_test,y_test,params["model"])
print(f'The auc for validation set: {score}')
# Record the score and the feature-selection results on the Neptune run.
# NOTE(review): best_features is a list and the selector attributes are
# presumably NumPy arrays; confirm Neptune accepts these values directly.
run["metrics/roc_auc"] = score
run["metrics/selected_features"] = best_features
run["metrics/feature_importances"] = selector.estimator_.feature_importances_
run["metrics/feature_ranking"] = selector.ranking_
run["metrics/feature_support"] = selector.support_
# Close the run
# NOTE(review): if any statement above raises, this line is never reached and
# the run stays open.
run.stop()

In [None]:
# manual feature filter
# Create function to filter features based on mean values
# def filter_low_mean_features(features_df, percentile=0.1):
#     # Calculate absolute mean values for each feature
#     abs_means = abs(features_df).mean()
    
#     # Calculate percentile threshold of absolute means
#     threshold = abs_means.quantile(percentile)
    
#     # Get features with absolute means above threshold
#     significant_features = abs_means[abs_means >= threshold].index
    
#     # Filter features

#     return significant_features

# Apply the filtering function
