# Fresh Idea
## separate one/zero activity of domains
- replace zeros by minus one
- calculate the class activity in 3-hour bins for each domain
- calculate the user activity with a Gaussian around the center of each 3-hour bin
- calculate the likelihood of the person being a 1/-1 at that time
- add user general metrics including domain cls activity and usage patterns

### questions
- how to take into account times when the person used a website when others didn't?
- how to give likelihood when the person didn't show any nearby activity?
- what if he used a similar but more niche website at the same time?
- how to average the bins weighted by the significance of that bin?
- how to give weight to the magnitude of the number of users entering? (i.e., a probability of 1 together with a confidence)
- what about sparse websites?
- how to keep times where there is no activity from taking a lot of weight?
### enhancements
- create graph embedding of urls
- for each bin, calculate the metric per url
- instead of only looking at the specific website, take into account websites with similar usage,
  for example by looking at usage of the same domain_cls in a Gaussian around the bin, or by looking at domain embeddings and weighting the activity in similar embeddings by the distance in the embedding space

In [1]:
import os
import sqlite3
%matplotlib widget
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
from modin.db_conn import ModinDatabaseConnection
import modin.pandas as mpd
%load_ext autoreload
%autoreload 2
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
ray.init(ignore_reinit_error=True)
def load_data_from_db(con):
    """Read every row of the ``data`` table through ``mpd.read_sql``.

    Args:
        con: Connection object handed to ``mpd.read_sql``.

    Returns:
        The frame produced by ``mpd.read_sql`` for ``SELECT * FROM data``.

    Raises:
        Exception: Any error raised while reading is printed and then
            re-raised unchanged.
    """
    try:
        return mpd.read_sql("SELECT * FROM data", con)
    except Exception as err:
        # Report the failure for the notebook user, but do not swallow it.
        print(f"Error loading data: {err}")
        raise


# NOTE(review): freeze_support() is presumably kept for multiprocessing
# safety when the code is run as a frozen executable — confirm it is needed.
freeze_support()
# SQLite database file that is read below.
dbfile = '/workspace/data/mini_training_set.db'

# Describe the SQLite file to Modin through a SQLAlchemy URL.
conn = ModinDatabaseConnection('sqlalchemy', f'sqlite:///{dbfile}')

# Can use get_connection to get underlying sqlalchemy engine
# (the returned object is not stored here).
conn.get_connection()
# Load the whole `data` table and preview its first rows.
db_df = load_data_from_db(conn)
print(db_df.head())

2025-03-14 18:55:15,237	INFO worker.py:1841 -- Started a local Ray instance.


   Device_ID                   Datetime      URL  Domain_Name  Domain_cls1  \
0        124  2023-04-23 03:04:30+03:00     6466      2368671          755   
1        124  2023-04-23 03:04:30+03:00  2245864      1792903            0   
2        124  2023-04-23 03:04:30+03:00  1839478       107342          332   
3        124  2023-04-23 03:14:50+03:00  1172090       107342          332   
4        124  2023-04-23 03:14:50+03:00  1839478       107342          332   

   Domain_cls2  Domain_cls3  Domain_cls4  Target  
0          799            0            0       0  
1            0            0            0       0  
2            0            0            0       0  
3            0            0            0       0  
4            0            0            0       0  


In [2]:
# "23-04 to 18-05"  (NOTE(review): presumably the date range covered by the data — confirm)
# Drop the rows whose Domain_Name equals 1732927 (the author's marker for an empty URL).
db_df = db_df[db_df["Domain_Name"]!=1732927] # remove empty url
# Parse Datetime and make it the index; the resample-based binning in
# process_activity_timeseries operates on this index.
db_df["Datetime"] = mpd.to_datetime(db_df["Datetime"])
db_df.set_index("Datetime",inplace=True)
# db_df.groupby("Device_ID").apply(lambda x: (x-x["Datetime"].min()).dt.days)

In [3]:
#train test split
from sklearn.model_selection import train_test_split
def get_train_test_devices(device_target_df, test_size=0.2, random_state=42):
    """Split device IDs into train and test sets, stratified by ``Target``.

    Args:
        device_target_df: Table with a ``Device_ID`` and a ``Target`` column.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(train_device_ids, test_device_ids)`` as produced by
        ``train_test_split``.
    """
    device_ids = device_target_df['Device_ID']
    # Stratifying on the label keeps the class balance equal in both parts.
    target_labels = device_target_df['Target']
    train_device_ids, test_device_ids = train_test_split(
        device_ids,
        test_size=test_size,
        random_state=random_state,
        stratify=target_labels,
    )
    return train_device_ids, test_device_ids


In [4]:
# One row per device, taking the first entry of every column for that device.
# NOTE(review): this assumes Target is constant within a device — confirm.
devices = db_df.groupby("Device_ID").first().reset_index()
# Stratified split of the device IDs using the function defaults
# (test_size=0.2, random_state=42).
train_devices, test_device_ids = get_train_test_devices(devices)
# Keep only the events that belong to training devices.
train_db_df = db_df[db_df["Device_ID"].isin(train_devices)]

In [5]:
# Number of distinct training devices seen on each domain.
devices_per_domain = train_db_df.groupby("Domain_Name")["Device_ID"].nunique()
# Keep only domains visited by more than 10 distinct training devices.
domains = devices_per_domain[devices_per_domain>10].index
train_db_df = train_db_df[train_db_df["Domain_Name"].isin(domains)]

In [6]:
# preprocess timeseries
def process_activity_timeseries(
    domain_df,
    bin_hours=6,
    gaussian_filter=True,
    n_days_each_side=3,
    std=1.5,
    drop_na=True,
    drop_zeros=False,
):
    """Bin device activity over time and optionally smooth it.

    Counts the distinct ``Device_ID`` values in consecutive bins of
    ``bin_hours`` hours (resampling on the index of ``domain_df``),
    optionally applies a centered Gaussian rolling mean, and returns the
    rounded counts as integers.

    Args:
        domain_df: Frame with a ``Device_ID`` column and an index that
            supports ``resample``.
        bin_hours: Width of one time bin, in hours.
        gaussian_filter: When true, smooth the binned counts with a Gaussian
            rolling window.
        n_days_each_side: Days covered on each side of a bin by the window.
        std: Standard deviation passed to the Gaussian window (in bins).
        drop_na: When true, remove missing values before rounding.
        drop_zeros: When true, remove bins whose value is exactly zero
            (checked before rounding).

    Returns:
        Integer series named ``"Activity"`` indexed by the resampled bins.
    """
    # The rolling window is measured in bins, not hours:
    # days per side * bins per day * two sides.
    window_bins = int(n_days_each_side*24/bin_hours*2)

    activity = domain_df["Device_ID"].resample(f"{bin_hours}h").nunique()

    if gaussian_filter:
        smoother = activity.rolling(
            window=window_bins,
            win_type='gaussian',
            center=True,
            min_periods=1,
            closed="both",
        )
        activity = smoother.mean(std=std)
    if drop_na:
        activity = activity.dropna()
    if drop_zeros:
        activity = activity[activity != 0]

    return activity.rename("Activity").round().astype(int)


In [16]:
from functools import partial
# Bind the smoothing/cleaning options used for the class-level series
# (bin_hours is left at the function default).
process_domain_timeseries = partial(process_activity_timeseries,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False)
# functools.partial objects carry no __name__, so copy it from the wrapped function.
# NOTE(review): presumably required by the groupby.apply implementation — confirm.
process_domain_timeseries.__name__ =process_activity_timeseries.__name__
# Activity time series for every (Domain_Name, Target) group of the training data.
domain_time_series = train_db_df.groupby(["Domain_Name","Target"]).apply(process_domain_timeseries)
# domain_no_target_time_series = train_db_df.groupby(["Domain_Name"]).apply(process_domain_timeseries)


In [8]:
# Activity time series for every (Device_ID, Domain_Name) pair, computed with the
# function defaults on the full db_df (not restricted to training devices or
# to the filtered domain list).
user_domain_time_series = db_df.groupby(["Device_ID","Domain_Name"]).apply(process_activity_timeseries)

# Reset index to get all columns as regular columns

In [9]:
# Swap the first two index levels.
# NOTE(review): presumably so Domain_Name comes before Device_ID — confirm.
user_domain_time_series = user_domain_time_series.swaplevel(0,1)

In [19]:

# Calculate domain fractions and activity separately
# First, calculate activity fraction
domain_fraction_ts = domain_time_series.copy()
# Each bin's share of the summed activity of its (Domain_Name, Target) group.
domain_fraction_ts["activity_fraction"] = domain_fraction_ts.groupby(["Domain_Name", "Target"]).transform(lambda x: x/x.sum())
# Add the sum of activity as a new column
domain_activity = domain_fraction_ts.groupby(["Domain_Name", "Target"])[["Activity"]].sum()

domain_activity = domain_activity.rename(columns={"Activity": "target_domain_activity"})
# Merge the results
domain_fraction_ts = domain_fraction_ts.merge(domain_activity, left_index=True, right_index=True)
# Reset index to get Target as a column, then pivot to get Target as columns
# NOTE(review): fillna(0) is applied to every pivoted column, so for a bin that
# exists for only one Target value the other class's target_domain_activity_*
# is also set to 0 in that row; that value feeds the prior computed
# downstream — confirm this is intended.
pivot_fraction_ts = domain_fraction_ts.reset_index().pivot(
    index=['Datetime', 'Domain_Name'],
    columns='Target'
).fillna(0)
# Flatten the two-level column labels into "<column>_<Target>" names.
pivot_fraction_ts.columns = [f'{col[0]}_{col[1]}' for col in pivot_fraction_ts.columns]
# Rename columns for clarity
# pivot_fraction_ts.columns = ['activity_0', 'activity_1']



In [42]:
# Left join: keep every (Datetime, Domain_Name) row of the class-level table and
# attach the per-device activity for the same domain and time bin where one exists.
merged_df = pivot_fraction_ts.merge(user_domain_time_series.reset_index(),how="left",on=["Domain_Name","Datetime"],)
# Index the result by time bin, domain and device.
merged_df.set_index(["Datetime","Domain_Name","Device_ID"],inplace=True)

In [43]:
from scipy import stats


def class_probability_score(active, p_active_given_a, p_active_given_b, prior_a=0.5, total_users=100):
    """Score how strongly one observation points to class A versus class B.

    Applies Bayes' rule to an "active / not active" observation and maps the
    posterior probability of class A linearly onto ``[-1, 1]``: ``+1`` means
    certainly A, ``-1`` certainly B, ``0`` undecided.  All arguments may be
    scalars or array-likes that broadcast against each other.

    Args:
        active: Truthy where the user was active (any non-zero value counts
            as active).
        p_active_given_a: Probability of activity given class A (0).
        p_active_given_b: Probability of activity given class B (1).
        prior_a: Prior probability for class A.
        total_users: Currently unused; kept so existing callers keep working
            (the confidence weighting that used it is disabled).

    Returns:
        ``2 * P(A | observation) - 1``.  Where the observation has zero
        probability under both classes it carries no information, so the
        posterior falls back to ``prior_a`` instead of becoming NaN.
    """
    # Likelihood of what was observed: P(active | class) when active,
    # otherwise P(not active | class).
    likelihood_a = np.where(active, p_active_given_a, 1 - p_active_given_a)
    likelihood_b = np.where(active, p_active_given_b, 1 - p_active_given_b)

    joint_a = likelihood_a * prior_a
    evidence = joint_a + likelihood_b * (1 - prior_a)

    # Avoid division by zero: flag entries with zero evidence (1.0 there,
    # 0.0 elsewhere).  For those entries joint_a is 0 as well (valid
    # probabilities are non-negative), so dividing by 1 gives 0 and the prior
    # is added back; all other entries are left untouched.  Plain arithmetic
    # keeps the result type the inputs would produce (e.g. a pandas Series).
    no_evidence = (evidence == 0) * 1.0
    posterior_a = joint_a / (evidence + no_evidence) + no_evidence * prior_a

    # NOTE: a confidence factor based on the width of a Beta credible
    # interval (using total_users) was sketched here but is disabled; the
    # raw score is returned as is.
    raw_score = 2 * posterior_a - 1

    return raw_score


In [46]:
# Activity in this time bin summed over both Target classes.
merged_df["bin_activity"] = merged_df["Activity_0"]+merged_df["Activity_1"]
# Domain-level activity totals summed over both Target classes.
merged_df["total_activity"] = (merged_df["target_domain_activity_0"]+merged_df["target_domain_activity_1"])
# Share of that total that belongs to class 0; passed as prior_a to
# class_probability_score in the scoring cell.
merged_df["relative_0_activity"] = merged_df["target_domain_activity_0"]/merged_df["total_activity"]


In [47]:
import numpy as np
# Per-row score in favour of class 0: the device's Activity is used as the
# "active" flag, the per-class activity fractions as the conditional
# probabilities, and the class-0 share of domain activity as the prior.
merged_df["score"]=class_probability_score(merged_df["Activity"], merged_df["activity_fraction_0"], merged_df["activity_fraction_1"], prior_a=merged_df["relative_0_activity"], total_users=merged_df["bin_activity"])



In [49]:
# Scale each score by the square root of the bin's total activity.
merged_df["weighted_score"] = merged_df["score"]*np.sqrt(merged_df["bin_activity"])
# Average the weighted scores over all bins of each (Device_ID, Domain_Name) pair.
final_scores = merged_df.groupby(["Device_ID","Domain_Name"])["weighted_score"].mean()



In [51]:
# NOTE: this call raised the TypeError recorded below. final_scores comes from a
# groupby on Device_ID and Domain_Name, so after to_frame() those names are
# presumably index levels rather than columns; the later cell calls
# reset_index() before pivot.
final_scores_pivot = final_scores.to_frame().pivot(index="Device_ID",columns="Domain_Name")

TypeError: unhashable type: 'list'

In [69]:
# Turn the group keys into regular columns, pivot to one row per Device_ID with
# the remaining values spread over Domain_Name columns, and fill the
# device/domain pairs that have no score with 0.
final_scores_pivot = final_scores.to_frame().reset_index().pivot(index="Device_ID",columns="Domain_Name").fillna(0)