# Fresh Idea
## separate one/zero activity of domains
- replace zeros by minus one
- calculate the class activity in 3-hour bins for each domain
- calculate the user activity with a Gaussian window centred on each 3-hour bin
- calculate the likelihood of the person being a 1/-1 in that time bin
- add user general metrics including domain cls activity and usage patterns

### questions
- how to take into account times when the person used a website when others didn't?
- how to give a likelihood when the person didn't show any nearby activity?
- what if they used a similar but more niche website at the same time?
- how to average the bins, weighted by the significance of each bin?
- how to give weight to the number of users entering? e.g. probability of 1 together with a confidence
- what about sparse websites?
- how to keep times where there is no activity from taking a lot of weight?
### enhancements
- create graph embedding of urls
- for each bin, calculate the metric per url
- instead of only looking at the specific website, take into account websites with similar usage,
  for example by looking at usage of the same domain_cls in a Gaussian window around the bin, or by using domain embeddings and weighting the activity of similar domains by their distance in the embedding space

In [None]:
import os
import sqlite3
%matplotlib widget
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
from modin.db_conn import ModinDatabaseConnection
import modin.pandas as mpd
%load_ext autoreload
%autoreload 2
# Select Ray as Modin's execution engine.
# NOTE(review): this is set after `import modin.pandas` earlier in this cell; Modin's
# documentation shows the variable being set before the import — confirm it is still picked up.
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
import ray
# ignore_reinit_error=True lets this cell be re-run while Ray is already initialised.
ray.init(ignore_reinit_error=True)
def load_data_from_db(con):
    """Read every row of the ``data`` table through ``mpd.read_sql``.

    Args:
        con: connection object handed straight to ``mpd.read_sql``.

    Returns:
        The frame produced by ``mpd.read_sql``.

    Raises:
        Exception: any error from the read is reported on stdout and then
            re-raised unchanged.
    """
    try:
        frame = mpd.read_sql("SELECT * FROM data", con)
    except Exception as err:
        # Surface the failure in the notebook output, but do not swallow it.
        print(f"Error loading data: {err}")
        raise
    else:
        return frame


# Called at the top level of the cell (there is no `if __name__ == "__main__"` guard here).
freeze_support()
dbfile = '/workspace/data/mini_training_set.db'

# Describe the SQLite file as a SQLAlchemy URL so Modin can open the connection itself.
conn = ModinDatabaseConnection('sqlalchemy', f'sqlite:///{dbfile}')

# Can use get_connection to get underlying sqlalchemy engine
# (the returned object is not used here).
conn.get_connection()
# Load the whole `data` table and show its first rows as a sanity check.
db_df = load_data_from_db(conn)
print(db_df.head())

In [2]:
# "23-04 to 18-05"
# Drop the rows whose Domain_Name id is 1732927 (marked by the author as the empty URL).
db_df = db_df[db_df["Domain_Name"]!=1732927] # remove empty url
# Parse the timestamps and make them the index; process_activity_timeseries
# calls .resample() on this frame, which needs a datetime index.
db_df["Datetime"] = mpd.to_datetime(db_df["Datetime"])
db_df.set_index("Datetime",inplace=True)
# db_df.groupby("Device_ID").apply(lambda x: (x-x["Datetime"].min()).dt.days)

In [3]:
#train test split
from sklearn.model_selection import train_test_split
def get_train_test_devices(device_target_df, test_size=0.2, random_state=42):
    """Split device ids into train and test sets, stratified by ``Target``.

    Args:
        device_target_df: frame with a ``Device_ID`` and a ``Target`` column.
        test_size: passed to ``train_test_split`` as ``test_size``.
        random_state: passed to ``train_test_split`` as ``random_state``.

    Returns:
        ``(train_device_ids, test_device_ids)`` as returned by
        ``train_test_split``.
    """
    device_ids = device_target_df['Device_ID']
    labels = device_target_df['Target']
    # Stratifying on the label keeps the class balance similar in both parts.
    train_device_ids, test_device_ids = train_test_split(
        device_ids,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return train_device_ids, test_device_ids


In [4]:
# One row per device, built from the first record of each device; get_train_test_devices
# reads its Device_ID and Target columns.
# NOTE(review): this assumes Target is constant within a device — confirm.
devices = db_df.groupby("Device_ID").first().reset_index()
train_devices, test_device_ids = get_train_test_devices(devices)
# Keep only the rows that belong to training devices.
train_db_df = db_df[db_df["Device_ID"].isin(train_devices)]

In [51]:
# Number of distinct training devices seen on each domain.
devices_per_domain = train_db_df.groupby("Domain_Name")["Device_ID"].nunique()
# Keep the domains visited by more than 10 distinct training devices.
domains = devices_per_domain[devices_per_domain>10].index
# Restrict the training rows to those domains.
train_db_df = train_db_df[train_db_df["Domain_Name"].isin(domains)]

In [52]:
# preprocess timeseries
def process_activity_timeseries(domain_df,bin_hours=6,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False):
    """Count distinct active devices per time bin, optionally Gaussian-smoothed.

    Args:
        domain_df: frame with a datetime index and a ``Device_ID`` column.
        bin_hours: width of each resampling bin, in hours.
        gaussian_filter: when true, smooth the per-bin counts with a centred,
            Gaussian-weighted rolling mean.
        n_days_each_side: half-width of the smoothing window, in days.
        std: standard deviation of the Gaussian window, expressed in bins
            (rows of the resampled series), not in hours.
        drop_na: drop NaN bins before the integer cast.
        drop_zeros: drop bins whose *rounded* activity is zero.

    Returns:
        Integer series named ``"Activity"``, indexed by the resampled time bins.
    """
    activity = domain_df["Device_ID"].resample(f'{str(bin_hours)}h').nunique()
    if gaussian_filter:
        # The rolling window is counted in bins:
        # n_days_each_side * 24h / bin_hours per bin * 2 sides.
        window_bins = int(n_days_each_side*24/bin_hours*2)
        activity = activity.rolling(window=window_bins, win_type='gaussian',center=True,min_periods=1,closed="both").mean(std=std)
    if drop_na:
        activity = activity.dropna()
    # Round to whole devices *before* filtering zeros: after smoothing, almost every
    # bin holds a tiny non-zero float that rounds to 0, so filtering first would let
    # those bins through and still return them as zeros.
    activity = activity.round().astype(int).rename("Activity")
    if drop_zeros:
        activity = activity[activity!=0]
    return activity


In [53]:
from functools import partial
# Bind the smoothing options explicitly (the same values as the function's defaults).
process_domain_timeseries = partial(process_activity_timeseries,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False)
# partial objects carry no __name__, so copy it from the wrapped function
# (presumably groupby.apply needs one — TODO confirm).
process_domain_timeseries.__name__ =process_activity_timeseries.__name__
# One activity series per (Domain_Name, Target) group, computed on the training rows only.
domain_time_series = train_db_df.groupby(["Domain_Name","Target"]).apply(process_domain_timeseries)
# domain_no_target_time_series = train_db_df.groupby(["Domain_Name"]).apply(process_domain_timeseries)


In [69]:
# One activity series per (Device_ID, Domain_Name) pair, computed on all of db_df
# (not only the training subset) with process_activity_timeseries' default options.
user_domain_time_series = db_df.groupby(["Device_ID","Domain_Name"]).apply(process_activity_timeseries)

# Reset index to get all columns as regular columns

In [70]:
# Swap the first two index levels so Domain_Name comes first, which lets later code
# select one domain with user_domain_time_series.loc[domain].
user_domain_time_series = user_domain_time_series.swaplevel(0,1)

In [62]:

# Normalise each (Domain_Name, Target) series by its own total, so every bin holds
# its share of that group's activity.
# NOTE(review): a group whose total is 0 would divide by zero here — confirm that cannot occur.
domain_fraction_ts = domain_time_series.groupby(["Domain_Name","Target"],group_keys=False).apply(lambda x: x/x.sum()).rename("activity_fraction")

In [None]:
# Bare expression: a notebook only displays it when it is the last statement of the
# cell, which it is not here.
user_domain_time_series
# Wide table with one row per (Datetime, Domain_Name) and one column per Device_ID;
# combinations with no recorded value are filled with 0.
pivot_df = user_domain_time_series.reset_index().pivot(
    index=['Datetime', 'Domain_Name'],
    columns='Device_ID',
    values='Activity'
).fillna(0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Activity
Domain_Name,Device_ID,Datetime,Unnamed: 3_level_1
3930,124,2023-04-30 12:00:00+03:00,1
4136,124,2023-05-05 06:00:00+03:00,1
6450,124,2023-04-24 12:00:00+03:00,1
6450,124,2023-04-24 18:00:00+03:00,0
6450,124,2023-04-25 00:00:00+03:00,0
...,...,...,...
2408371,69967,2023-05-12 18:00:00+03:00,1
2408371,69967,2023-05-13 00:00:00+03:00,1
2408371,69967,2023-05-13 06:00:00+03:00,1
2408371,69967,2023-05-13 12:00:00+03:00,1


In [63]:
import numpy as np
from scipy import stats

def class_probability_score(active, p_active_given_a, p_active_given_b, prior_a=0.5, total_users=100):
    """Score in (-1, 1) for class A versus class B, shrunk by a confidence factor.

    The posterior of class A is computed with Bayes' theorem from one
    active/inactive observation, mapped linearly onto (-1, 1), and multiplied
    by ``1 - width`` of the 95% interval of a Beta distribution whose
    parameters are built from the posterior and ``total_users``.

    Args:
        active: truthy when the observation is "active"; otherwise the
            complementary likelihoods ``1 - p`` are used.
        p_active_given_a: probability of being active under class A.
        p_active_given_b: probability of being active under class B.
        prior_a: prior probability of class A; class B gets ``1 - prior_a``.
        total_users: pseudo-count scaling the Beta parameters; larger values
            give a narrower interval and therefore less shrinkage.

    Returns:
        The confidence-weighted score; positive values favour class A.
    """
    prior_b = 1 - prior_a
    if active:
        joint_a = p_active_given_a * prior_a
        joint_b = p_active_given_b * prior_b
    else:
        joint_a = (1 - p_active_given_a) * prior_a
        joint_b = (1 - p_active_given_b) * prior_b

    # Bayes' theorem: joint probability of A over the total evidence.
    posterior_a = joint_a / (joint_a + joint_b)

    # 95% interval of Beta(1 + p * n, 1 + (1 - p) * n); its width measures uncertainty.
    lower, upper = stats.beta.interval(
        0.95,
        1 + posterior_a * total_users,
        1 + (1 - posterior_a) * total_users,
    )
    shrinkage = 1 - (upper - lower)  # closer to 1 when the interval is narrow

    # Posterior rescaled from (0, 1) to (-1, 1), then damped by the confidence.
    return (2 * posterior_a - 1) * shrinkage


In [79]:
# Score every (device, time bin) of each domain against the two per-class activity profiles.
domains = db_df["Domain_Name"].unique()
# domain_fraction_ts is built from the filtered training rows only, so it does not
# contain every domain of db_df; looking up a missing domain would raise KeyError.
profiled_domains = set(domain_fraction_ts.index.get_level_values("Domain_Name"))
user_result_per_domain={}
for domain in domains:
    if domain not in profiled_domains:
        continue
    domain_ts = domain_fraction_ts.loc[domain]
    target_labels = set(domain_ts.index.get_level_values("Target"))
    if 0 not in target_labels or 1 not in target_labels:
        # Only one class was seen on this domain, so there is nothing to contrast.
        continue
    # Class A is Target 1 and class B is Target 0, so a positive score points to Target 1
    # (the notebook's "1/-1" convention). Passing the Target 0 profile for both classes,
    # as before, makes the posterior equal the prior and every score 0.
    # NOTE(review): assumes the Target labels are exactly 0 and 1 — confirm.
    p_active_target_1 = domain_ts.loc[1]
    p_active_target_0 = domain_ts.loc[0]
    # NOTE(review): these look like whole per-bin series rather than the value for the
    # group's own Datetime — confirm that scoring against the full profile is intended.
    domain_users_ts = user_domain_time_series.loc[domain]
    user_result_per_domain[domain] = domain_users_ts.groupby(["Device_ID","Datetime"]).apply(
        lambda x: class_probability_score(
            active=x["Activity"].iloc[0]>0,
            p_active_given_a=p_active_target_1,
            p_active_given_b=p_active_target_0
        )
    )
user_result_per_domain = mpd.concat(user_result_per_domain)
        
        
        

KeyboardInterrupt: 