# Fresh Idea
## separate one/zero activity of domains
- replace zeros by minus one
- calculate the class activity in 3-hour bins for each domain
- calculate the user activity with a Gaussian around the center of each 3-hour bin
- calculate the likelihood of the person being a 1/-1 at that time
- add general user metrics, including domain cls activity and usage patterns

### questions
- how to take into account times when the person used a website when others didn't?
- how to give a likelihood when the person didn't show any nearby activity?
- what if he used a similar but more niche website at the same time?
- how to average the bins, weighted by the significance of each bin?
- how to give weight to the magnitude of the number of users entering? (probability of 1 with confidence)
- what about sparse websites?
- how to keep times where there is no activity from taking a lot of weight?
### enhancements
- create graph embedding of urls
- for each bin, calculate the metric per url
- instead of only looking at the specific website, take into account websites with similar usage,
  for example by looking at the same domain_cls usage in a Gaussian around the bin, or by looking at domain embeddings and at the activity in similar embeddings, weighted by the distance in the embedding space

In [None]:
import os
import sqlite3
%matplotlib widget
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
from modin.db_conn import ModinDatabaseConnection
import modin.pandas as mpd
%load_ext autoreload
%autoreload 2
# Select Ray as Modin's execution engine and start a local Ray runtime.
# NOTE(review): Modin's documentation sets MODIN_ENGINE before `modin.pandas`
# is imported, whereas this cell imports `modin.pandas` a few lines earlier —
# confirm the setting still takes effect here.
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
# ignore_reinit_error=True: presumably so that re-running this cell does not
# fail when Ray has already been initialised in the kernel — see Ray docs.
ray.init(ignore_reinit_error=True)
def load_data_from_db(con):
    """Read every row of the ``data`` table through *con*.

    Args:
        con: Connection object handed unchanged to ``mpd.read_sql``.

    Returns:
        The frame produced by ``mpd.read_sql`` for ``SELECT * FROM data``.

    Raises:
        Exception: Whatever the read raises; the error is printed to stdout
            first and then re-raised unchanged.
    """
    query = "SELECT * FROM data"
    try:
        return mpd.read_sql(query, con)
    except Exception as err:
        # Report the failure for the notebook user, but do not swallow it.
        print(f"Error loading data: {err}")
        raise


# NOTE(review): per the multiprocessing docs, freeze_support() only matters
# for frozen Windows executables; presumably a leftover from a script version
# of this notebook — confirm it is still needed.
freeze_support()
# Path of the SQLite database file that is read below.
dbfile = '/workspace/data/mini_training_set.db'

# Connection description for Modin: backend library name plus the
# SQLAlchemy-style SQLite URL built from `dbfile`.
conn = ModinDatabaseConnection('sqlalchemy', f'sqlite:///{dbfile}')

# Can use get_connection to get underlying sqlalchemy engine
# NOTE(review): the returned object is discarded here.
conn.get_connection()
# Load the whole `data` table into `db_df` and preview its first rows.
db_df = load_data_from_db(conn)
print(db_df.head())

2025-03-14 13:41:10,522	INFO worker.py:1841 -- Started a local Ray instance.


   Device_ID                   Datetime      URL  Domain_Name  Domain_cls1  \
0        124  2023-04-23 03:04:30+03:00     6466      2368671          755   
1        124  2023-04-23 03:04:30+03:00  2245864      1792903            0   
2        124  2023-04-23 03:04:30+03:00  1839478       107342          332   
3        124  2023-04-23 03:14:50+03:00  1172090       107342          332   
4        124  2023-04-23 03:14:50+03:00  1839478       107342          332   

   Domain_cls2  Domain_cls3  Domain_cls4  Target  
0          799            0            0       0  
1            0            0            0       0  
2            0            0            0       0  
3            0            0            0       0  
4            0            0            0       0  


In [2]:
# "23-04 to 18-05"  (presumably the date range covered by the data — TODO confirm)
# Drop every row whose Domain_Name id equals 1732927.
# NOTE(review): the inline comment says "empty url" but the filter is applied
# to the Domain_Name column, not URL — confirm the intended column.
db_df = db_df[db_df["Domain_Name"]!=1732927] # remove empty url
# Convert the Datetime column to datetimes and make it the index, which the
# time-based resampling in later cells relies on.
db_df["Datetime"] = mpd.to_datetime(db_df["Datetime"])
db_df.set_index("Datetime",inplace=True)
# NOTE(review): this cell cannot be re-run as-is — after set_index the
# "Datetime" column no longer exists, so the to_datetime line raises KeyError.
# db_df.groupby("Device_ID").apply(lambda x: (x-x["Datetime"].min()).dt.days)

In [34]:
#train test split
from sklearn.model_selection import train_test_split
def get_train_test_devices(device_target_df, test_size=0.2, random_state=42):
    """Split device IDs into train and test subsets, stratified by target.

    Args:
        device_target_df: Frame providing a ``Device_ID`` and a ``Target``
            column (presumably one row per device — verify against caller).
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(train_device_ids, test_device_ids)`` tuple holding the two
        parts of the ``Device_ID`` column.
    """
    device_ids = device_target_df['Device_ID']
    labels = device_target_df['Target']
    # Stratify on the target so both subsets keep the same class balance.
    train_ids, test_ids = train_test_split(
        device_ids,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return train_ids, test_ids


In [36]:
# One row per device: the first value of every column within each Device_ID
# group. Used below as the device -> Target table for the stratified split.
# NOTE(review): assumes Target is constant for a given device — TODO confirm.
devices = db_df.groupby("Device_ID").first().reset_index()
# Split on device IDs rather than rows, so a device's rows are never divided
# between the train and test sides.
train_devices, test_device_ids = get_train_test_devices(devices)
# Keep only the rows that belong to training devices.
train_db_df = db_df[db_df["Device_ID"].isin(train_devices)]

In [40]:
# preprocess timeseries
def process_activity_timeseries(domain_df, gaussian_filter=True, n_days_each_side=3, std=1.5, drop_na=True, drop_zeros=False):
    """Build an integer series of unique-device counts per 3-hour bin.

    Args:
        domain_df: Frame with a datetime index and a ``Device_ID`` column.
        gaussian_filter: When true, smooth the counts with a centred,
            Gaussian-weighted rolling mean.
        n_days_each_side: Number of days the smoothing window extends on each
            side; the window covers ``n_days_each_side * 24 / 3 * 2`` bins.
        std: Standard deviation of the Gaussian window. It is forwarded to the
            rolling mean, so it is expressed in bins (3-hour steps), not days.
        drop_na: Drop bins whose smoothed value is NaN (the window edges).
        drop_zeros: Drop bins whose (unrounded) value equals zero.

    Returns:
        A series named ``"Activity"`` holding the rounded counts. The dtype is
        plain ``int`` when no value is missing. When NaN values remain
        (``drop_na=False`` together with smoothing) the nullable ``"Int64"``
        dtype is returned instead, because NaN cannot be cast to ``int``.
    """
    # "3h" is the non-deprecated spelling of the hourly alias ("3H" warns on
    # pandas >= 2.2) and is accepted by older pandas versions as well.
    activity = domain_df["Device_ID"].resample("3h").nunique()

    if gaussian_filter:
        # n_days_each_side * 24h / 3h-per-bin * 2 sides -> window size in bins.
        # NOTE(review): the window size is even, so a centred window cannot be
        # perfectly symmetric around a bin — confirm that is acceptable.
        window_bins = int(n_days_each_side * 24 / 3 * 2)
        activity = activity.rolling(
            window=window_bins, win_type="gaussian", center=True
        ).mean(std=std)

    if drop_na:
        activity = activity.dropna()
    if drop_zeros:
        # Compared before rounding, so only exact zeros are removed.
        activity = activity[activity != 0]

    activity = activity.rename("Activity").round()
    if activity.isna().any():
        # Only reachable with drop_na=False: keep the gaps as <NA> instead of
        # failing on the integer cast.
        return activity.astype("Int64")
    return activity.astype(int)


In [44]:
from functools import partial
# Bind the smoothing options explicitly; the values are the same as the
# defaults of process_activity_timeseries.
process_domain_timeseries = partial(process_activity_timeseries,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False)
# functools.partial objects carry no __name__, so copy it from the wrapped
# function — presumably because groupby(...).apply inspects the callable's
# name (TODO confirm).
process_domain_timeseries.__name__ =process_activity_timeseries.__name__
# One activity series per (Domain_Name, Target) pair, computed from the rows
# of training devices only.
domain_time_series = train_db_df.groupby(["Domain_Name","Target"]).apply(process_domain_timeseries)


In [None]:
# Plot the activity series grouped by domain (one plot call per Domain_Name
# group). The captured output below shows Modin defaulting to pandas here.
domain_time_series.groupby("Domain_Name").plot()

Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


In [45]:
# Activity series per (Device_ID, Domain_Name) pair, using the default
# options of process_activity_timeseries.
# Each group holds a single Device_ID, so the unique-device count of a 3-hour
# bin is 0 or 1 before smoothing.
# NOTE(review): this runs on db_df (all devices) rather than train_db_df —
# confirm that is intended.
user_domain_time_series = db_df.groupby(["Device_ID","Domain_Name"]).apply(process_activity_timeseries)