# Fresh Idea
## separate one/zero activity of domains
- replace zeros with minus one
- calculate the class activity in 3-hour bins for each domain
- calculate the user activity with a Gaussian around the center of each 3-hour bin
- calculate the likelihood of the person being a 1/-1 at that time
- add general user metrics, including domain cls activity and usage patterns

### questions
- how to take into account times when the person used a website when others didn't?
- how to assign a likelihood when the person didn't show any nearby activity?
- what if they used a similar but more niche website at the same time?
- how to average the bins, weighted by the significance of each bin?
- how to weight by the magnitude of the number of users entering? i.e. probability of 1 together with a confidence
- what about sparse websites?
- how to keep times with no activity from taking a lot of weight?
### enhancements
- create graph embedding of urls
- for each bin, calculate the metric per url
- instead of only looking at the specific website, take into account websites with similar usage,
  for example by looking at the same domain_cls usage in a Gaussian around the bin, or by looking at domain embeddings and weighting the activity in similar embeddings by their distance in the embedding space

In [1]:
# !export MODIN_CPUS=2

In [None]:
import os
import sqlite3
# %matplotlib widget
import matplotlib.pyplot as plt
from multiprocessing import freeze_support
# os.environ["MODIN_CPUS"] = "2"
os.environ["RAY_OBJECT_STORE_MEMORY"] = str(5 * (1024 ** 3))
from modin.config import NPartitions
NPartitions.put(12)
from modin.db_conn import ModinDatabaseConnection

import ray


# Set a higher number of partitions to reduce memory per partition
# ctx = ray.init()#object_store_memory=24000000000)#,include_dashboard=True)  # Object store memory (~25GB)
#                 redis_max_memory=10000000000)  # Redis memory (~10GB)


import modin.pandas as mpd
# print(ctx.dashboard_url)
%load_ext autoreload
%autoreload 2
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Ray
# ray.init()
# NPartitions.put(16)
def load_data_from_db(con, n_devices=1000):
    """Load every row of the ``data`` table for a bounded sample of devices.

    The sample consists of the first ``n_devices`` distinct ``Device_ID``
    values the database returns. The query has no ``ORDER BY``, so the
    selection is arbitrary but NOT randomised.

    Args:
        con: Connection object accepted by ``mpd.read_sql`` (the notebook
            passes a ``ModinDatabaseConnection``).
        n_devices: Maximum number of distinct devices to load. Defaults to
            1000, the value that used to be hard-coded in the query.

    Returns:
        The frame produced by ``mpd.read_sql`` for the selected devices.

    Raises:
        ValueError: If ``n_devices`` is smaller than 1.
        Exception: Any error raised by the read is printed and re-raised.
    """
    # int() guarantees that only a number is ever interpolated into the SQL.
    n_devices = int(n_devices)
    if n_devices < 1:
        raise ValueError(f"n_devices must be a positive integer, got {n_devices}")

    sampled_devices_query = f"""
        WITH sampled_devices AS (
            SELECT DISTINCT Device_ID
            FROM data
            LIMIT {n_devices}
        )
        SELECT *
        FROM data
        WHERE Device_ID IN (SELECT Device_ID FROM sampled_devices)
        """
    try:
        return mpd.read_sql(sampled_devices_query, con)
    except Exception as e:
        # Surface the failure in the notebook output, then let it propagate.
        print(f"Error loading data: {e}")
        raise


# Load the training data from the SQLite file into a Modin frame.
# NOTE(review): freeze_support() is normally called under
# `if __name__ == "__main__":` — confirm it is needed in this notebook.
freeze_support()
# Path to the SQLite database, relative to the current working directory.
dbfile = '../train_data_for_competition/training_set.db'

# Connection wrapper handed to mpd.read_sql inside load_data_from_db.
conn = ModinDatabaseConnection('sqlalchemy', f'sqlite:///{dbfile}')

# Can use get_connection to get underlying sqlalchemy engine
# (the returned object is discarded here).
conn.get_connection()
db_df = load_data_from_db(conn)
print(db_df.head())
del conn  # drop the connection wrapper once the data has been loaded

In [3]:
# Rebalance the frame's partitions after loading.
# NOTE(review): `_repartition` is underscore-prefixed, i.e. not public Modin API — verify it exists in the installed version.
db_df = db_df._repartition()

In [4]:
# "23-04 to 18-05"  (presumably the date range covered by the data — TODO confirm)
# Drop rows whose Domain_Name equals 1732927, the ID the author treats as the empty URL.
db_df = db_df[db_df["Domain_Name"]!=1732927] # remove empty url
# Parse the Datetime column into datetime values so it can later serve as a time index.
db_df["Datetime"] = mpd.to_datetime(db_df["Datetime"])
# db_df.groupby("Device_ID").apply(lambda x: (x-x["Datetime"].min()).dt.days)

In [5]:
# Index rows by timestamp; the later `.resample(...)` and `db_df.index...dt.hour` steps rely on a datetime index.
db_df.set_index("Datetime",inplace=True)


In [6]:
#train test split
from sklearn.model_selection import train_test_split
def get_train_test_devices(device_target_df, test_size=0.2, random_state=42):
    """Split device IDs into train and test groups, stratified by ``Target``.

    Args:
        device_target_df: Frame with one row per device, holding at least the
            ``Device_ID`` and ``Target`` columns.
        test_size: Share of the devices assigned to the test group.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_device_ids, test_device_ids)``.
    """
    device_ids = device_target_df['Device_ID']
    # Stratifying on the label keeps the class balance equal in both groups.
    labels = device_target_df['Target']
    train_device_ids, test_device_ids = train_test_split(
        device_ids,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )
    return train_device_ids, test_device_ids


In [7]:
# One row per device (first value of every column), which gives each device its Target for stratification.
devices = db_df.groupby("Device_ID").first().reset_index()
# Split at device level so that all rows of a device land on the same side.
train_devices, test_device_ids = get_train_test_devices(devices)
# Keep only the rows that belong to training devices.
train_db_df = db_df[db_df["Device_ID"].isin(train_devices)]

In [8]:
# Number of distinct training devices that visited each domain.
devices_per_domain = train_db_df.groupby("Domain_Name")["Device_ID"].nunique()
# Keep only domains seen by more than 10 distinct training devices.
domains = devices_per_domain[devices_per_domain>10].index
train_db_df = train_db_df[train_db_df["Domain_Name"].isin(domains)]

In [9]:
# preprocess timeseries
def process_activity_timeseries(domain_df,bin_hours=6,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False):
    """Turn raw visit rows into a per-bin count of distinct devices.

    Args:
        domain_df: Frame with a datetime index and a ``Device_ID`` column.
        bin_hours: Width of each time bin, in hours.
        gaussian_filter: If true, smooth the counts with a centred,
            Gaussian-weighted rolling mean.
        n_days_each_side: Days covered by the smoothing window on each side
            of a bin.
        std: Standard deviation passed to the Gaussian window.
        drop_na: Drop bins whose value is NaN.
        drop_zeros: Drop bins whose value is exactly zero.

    Returns:
        Integer series named ``"Activity"``, indexed by bin start time.
    """
    # Distinct devices observed in each consecutive bin.
    counts = domain_df["Device_ID"].resample(f'{str(bin_hours)}h').nunique()
    # Window length expressed in bins: days per side * bins per day * two sides.
    window_bins = int(n_days_each_side*24/bin_hours*2)
    if gaussian_filter:
        roller = counts.rolling(
            window=window_bins,
            win_type='gaussian',
            center=True,
            min_periods=1,
            closed="both",
        )
        counts = roller.mean(std=std)
    if drop_na:
        counts = counts.dropna()
    if drop_zeros:
        counts = counts[counts!=0]
    counts = counts.rename("Activity")
    # Smoothed values are fractional; round them back to whole counts.
    return counts.round().astype(int)


In [None]:
from functools import partial
# Pre-bind the smoothing options used for the per-(domain, class) series; bin_hours keeps its default.
process_domain_timeseries = partial(process_activity_timeseries,gaussian_filter=True,n_days_each_side=3,std=1.5,drop_na=True,drop_zeros=False)
# functools.partial objects carry no __name__; copy it over — presumably because groupby.apply looks it up (TODO confirm).
process_domain_timeseries.__name__ =process_activity_timeseries.__name__
# Activity time series for every (Domain_Name, Target) pair, built from training devices only.
domain_time_series = train_db_df.groupby(["Domain_Name","Target"]).apply(process_domain_timeseries)


In [11]:
# Per-(device, domain) activity series with the function's default settings, computed on ALL devices in db_df
# (train and test); swaplevel puts Domain_Name before Device_ID in the index.
user_domain_time_series = db_df.groupby(["Device_ID","Domain_Name"]).apply(process_activity_timeseries).swaplevel(0,1)

# Reset index to get all columns as regular columns

In [12]:
def get_active_days_per_user(user_domain_ts):
    """
    Count, for every device, the distinct calendar days with positive activity.

    Args:
        user_domain_ts: MultiIndex Series with levels [Domain_Name, Device_ID, Datetime]
            whose values end up in an ``Activity`` column after ``reset_index()``

    Returns:
        Series indexed by Device_ID holding the number of unique active days
    """
    # Flatten the index levels into ordinary columns.
    records = user_domain_ts.reset_index()

    # Strip the time-of-day so that bins of the same day collapse together.
    records = records.assign(Date=records['Datetime'].dt.date)

    # Only bins with some activity count towards an active day.
    is_active = records['Activity'] > 0
    days_per_device = records.loc[is_active].groupby('Device_ID')['Date'].nunique()

    return days_per_device.astype(int)
# Number of distinct active days per device, over all domains.
daily_active_days = get_active_days_per_user(user_domain_time_series)
daily_active_days.name = "Active_Days"
# Min-max scale to the range [-1, 1].
# NOTE(review): if every device has the same count, max == min and this divides by zero.
daily_active_days = (daily_active_days-daily_active_days.min())/(daily_active_days.max()-daily_active_days.min())*2-1

In [None]:

# Calculate domain fractions and activity separately
# First, calculate activity fraction
domain_fraction_ts = domain_time_series.copy()
del domain_time_series
# Each bin's share of the total activity of its (Domain_Name, Target) group.
# NOTE(review): this column assignment assumes domain_fraction_ts is a DataFrame with an "Activity" column — confirm the shape returned by groupby.apply.
domain_fraction_ts["activity_fraction"] = domain_fraction_ts.groupby(["Domain_Name", "Target"]).transform(lambda x: x/x.sum())
# Add the sum of activity as a new column
domain_activity = domain_fraction_ts.groupby(["Domain_Name", "Target"])[["Activity"]].sum()

domain_activity = domain_activity.rename(columns={"Activity": "target_domain_activity"})
# Merge the results
domain_fraction_ts = domain_fraction_ts.merge(domain_activity, left_index=True, right_index=True)
# Reset index to get Target as a column, then pivot to get Target as columns
# (combinations missing for one class are filled with 0).
pivot_fraction_ts = domain_fraction_ts.reset_index().pivot(
    index=['Datetime', 'Domain_Name'],
    columns='Target'
).fillna(0)
# Flatten the two-level column labels into "<column>_<target>", e.g. "Activity_0".
pivot_fraction_ts.columns = [f'{col[0]}_{col[1]}' for col in pivot_fraction_ts.columns]
# Rename columns for clarity
# pivot_fraction_ts.columns = ['activity_0', 'activity_1']

In [14]:
# Attach each device's own activity to the class-level statistics of the same domain and time bin.
# how="left" keeps every (Datetime, Domain_Name) row of the class table, even when no device row matches.
merged_df = pivot_fraction_ts.merge(user_domain_time_series.reset_index(),how="left",on=["Domain_Name","Datetime"],)
merged_df.set_index(["Datetime","Domain_Name","Device_ID"],inplace=True)

In [15]:
# Drop the intermediate frames; only merged_df is used from here on.
del user_domain_time_series
del pivot_fraction_ts
del domain_fraction_ts

In [16]:
# from scipy import stats
import numpy as np

def class_probability_score(active, p_active_given_a, p_active_given_b, prior_a=0.5, total_users=100):
    """
    Posterior probability of class A given one activity observation (vectorized).

    Applies Bayes' rule element-wise:
    ``P(A | obs) = P(obs | A) * P(A) / (P(obs | A) * P(A) + P(obs | B) * P(B))``,
    where ``P(obs | class)`` is the activity probability when ``active`` is
    truthy and its complement otherwise.

    Where the evidence (the denominator) is exactly zero, the observation is
    impossible under both classes and carries no information, so the prior is
    returned there instead of the NaN that 0/0 would produce.

    Args:
        active: Scalar or array-like; truthy where the user was active
        p_active_given_a: Probability of activity given class A (0)
        p_active_given_b: Probability of activity given class B (1)
        prior_a: Prior probability for class A (scalar or array-like)
        total_users: Currently unused; kept so existing callers keep working
            (it belonged to the disabled confidence weighting)

    Returns:
        Posterior probability of class A, with the same container type the
        plain Bayes expression yields for the given inputs.
    """
    # Likelihood of the observation under each class.
    likelihood_a = np.where(active, p_active_given_a, 1 - p_active_given_a)
    likelihood_b = np.where(active, p_active_given_b, 1 - p_active_given_b)

    weighted_a = likelihood_a * prior_a
    evidence = weighted_a + likelihood_b * (1 - prior_a)

    # Avoid division by zero: divide by 1 at degenerate positions for now.
    degenerate = np.asarray(evidence == 0)
    safe_evidence = np.where(degenerate, 1.0, evidence)
    posterior_a = weighted_a / safe_evidence

    if degenerate.any():
        # Overwrite degenerate positions with the prior. The arithmetic blend
        # (mask is 0/1) keeps the result type, e.g. a Series stays a Series.
        posterior_a = posterior_a + degenerate * (prior_a - posterior_a)

    return posterior_a


In [17]:
# Activity of both classes together in this (time bin, domain).
merged_df["bin_activity"] = merged_df["Activity_0"]+merged_df["Activity_1"]
# Total activity of the domain over all bins, both classes together.
merged_df["total_activity"] = (merged_df["target_domain_activity_0"]+merged_df["target_domain_activity_1"])
# Share of the domain's total activity that comes from class 0; used below as the prior for class A.
merged_df["relative_0_activity"] = merged_df["target_domain_activity_0"]/merged_df["total_activity"]


In [None]:
import numpy as np
# Posterior probability of class 0 for every (time bin, domain, device) row.
# The per-class activity fractions serve as likelihoods and the domain's class-0 share as the prior.
# NOTE(review): total_users is passed, but class_probability_score does not use it in its computation.
merged_df["score"]=class_probability_score(merged_df["Activity"], merged_df["activity_fraction_0"], merged_df["activity_fraction_1"], prior_a=merged_df["relative_0_activity"], total_users=merged_df["bin_activity"])

In [None]:
# Scale each score by the bin's total activity so that busy bins count more.
merged_df["weighted_score"] = (merged_df["score"])*(merged_df["bin_activity"])#np.log(1+merged_df["bin_activity"]).astype(int))
# Plain mean of the scaled scores per (device, domain).
# NOTE(review): this is not normalised by the sum of the weights, so it is not a weighted average.
final_scores = merged_df.groupby(["Device_ID","Domain_Name"])["weighted_score"].mean()

In [39]:
# Wide feature matrix: one row per device, one column per domain; missing (device, domain) pairs become 0.
final_scores_pivot = final_scores.to_frame().reset_index().pivot(index="Device_ID",columns="Domain_Name").fillna(0)

In [40]:
def get_activity_per_time_bin(df,bin_hours=3):
    """Count rows per device and part of the day.

    The hour of each row's timestamp is integer-divided by ``bin_hours`` to
    obtain a time-of-day bin (``0`` covers hours ``[0, bin_hours)`` and so
    on), and the rows are counted per ``(Device_ID, bin)``.

    Args:
        df: Frame with a datetime index and a ``Device_ID`` column. It is
            not modified. (The previous implementation ignored this argument
            and read the global ``db_df`` instead.)
        bin_hours: Width of a time-of-day bin in hours; must be positive.

    Returns:
        Frame indexed by ``(Device_ID, time)`` with a single column
        ``day_part_activity`` holding the row count.

    Raises:
        ValueError: If ``bin_hours`` is not positive.
    """
    if bin_hours <= 0:
        raise ValueError(f"bin_hours must be positive, got {bin_hours}")

    # Copy only the column that is needed instead of the whole frame.
    binned = df[["Device_ID"]].copy()
    binned["time"] = df.index.to_series().dt.hour.astype(int)//bin_hours
    # Placeholder column: count() then reports the number of rows per group.
    binned["day_part_activity"] = 0
    return binned.groupby(["Device_ID","time"]).count()

# Row counts per device and time-of-day bin (default bin_hours=3) over the full data set.
activity_per_time_range = get_activity_per_time_bin(db_df)


In [41]:
# Turn counts into each device's share of its own activity per time-of-day bin.
# NOTE(review): `.values` assigns positionally, so this relies on groupby.apply returning rows in the original order — verify.
activity_per_time_range["activity_fraction"] = activity_per_time_range.groupby("Device_ID").apply(lambda x: x/x.sum()).values
activity_per_time_range = activity_per_time_range[["activity_fraction"]].reset_index()
# Wide layout: one row per device, one column per time-of-day bin.
activity_per_time_range = activity_per_time_range.pivot(index="Device_ID",columns="time",values="activity_fraction")
activity_per_time_range.columns = [f"time_{col}" for col in activity_per_time_range.columns]
# Min-max scale to [-1, 1] using the global min and max over all cells.
activity_per_time_range = (activity_per_time_range-activity_per_time_range.stack().min())/(activity_per_time_range.stack().max()-activity_per_time_range.stack().min())*2-1
# Bins in which a device has no rows are filled with 0.
# NOTE(review): after scaling, 0 is the middle of the range rather than the minimum (-1) — confirm this is intended.
activity_per_time_range = activity_per_time_range.fillna(0)


In [42]:
# Min-max scale the whole score matrix to [-1, 1] using its global min and max.
# NOTE(review): divides by zero if every value in the matrix is identical.
final_scores_pivot=(final_scores_pivot-final_scores_pivot.values.min())/(final_scores_pivot.values.max()-final_scores_pivot.values.min())*2-1


In [43]:
# One row per device: its Target plus the per-domain score features, joined on the Device_ID index.
final_features = db_df.groupby("Device_ID")["Target"].first().reset_index().set_index("Device_ID").join(final_scores_pivot)
# The two extra feature groups below are currently disabled.
# final_features = final_features.join(daily_active_days)
# final_features = final_features.join(activity_per_time_range)


In [44]:

# Create function to filter features based on mean values
def filter_low_mean_features(features_df, percentile=0.1):
    """Select the columns whose mean absolute value reaches a quantile cut-off.

    Args:
        features_df: Numeric feature frame, one column per feature.
        percentile: Quantile (between 0 and 1) of the per-column mean
            absolute values that serves as the cut-off.

    Returns:
        Index of the column labels whose mean absolute value is greater than
        or equal to the cut-off. The frame itself is not filtered.
    """
    # Mean magnitude of every feature.
    mean_magnitude = features_df.abs().mean()

    # Cut-off taken from the distribution of those magnitudes.
    cutoff = mean_magnitude.quantile(percentile)

    keep = mean_magnitude >= cutoff
    return mean_magnitude.loc[keep].index

# Apply the filtering function


In [45]:
# Convert every column label to a plain string (the pivot can leave non-string labels such as tuples).
final_features.columns = [str(col) for col in final_features.columns]

In [None]:
# Display the assembled feature table in the notebook output.
final_features

In [47]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import roc_auc_score

# # Split features and target
# X_train = final_features[final_features.index.isin(train_devices)].drop('Target', axis=1)
# significant_features = filter_low_mean_features(X_train, percentile=0.0)
# X_train = X_train[significant_features]
# y_train = final_features[final_features.index.isin(train_devices)]['Target']

# # Get feature importances



# X_test = final_features.loc[final_features.index.isin(test_device_ids),significant_features]
# y_test = final_features[final_features.index.isin(test_device_ids)]['Target']

# # Train Random Forest model
# rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# rf_model.fit(X_train, y_train)

# # Make predictions
# y_pred = rf_model.predict(X_test)

# # Calculate ROC AUC score
# roc_auc = roc_auc_score(y_test, y_pred)
# print(f'ROC AUC Score: {roc_auc:.4f}')

In [None]:
import xgboost
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score

# Training matrix: feature rows of the training devices, without the label column.
X_train = final_features[final_features.index.isin(train_devices)].drop('Target', axis=1)
y_train = final_features[final_features.index.isin(train_devices)]['Target']

# Get feature importances



# NOTE(review): X_test still contains the 'Target' column; the prediction below only uses
# `best_features`, which are drawn from X_train's columns, so the label is not fed to the model.
X_test = final_features.loc[final_features.index.isin(test_device_ids)]
y_test = final_features[final_features.index.isin(test_device_ids)]['Target']

# Regressor with a logistic objective, so predictions are values that roc_auc_score can rank.
# NOTE(review): eval_metric is given the sklearn function itself — confirm the installed xgboost accepts a callable here.
xgb_reg = xgboost.XGBRegressor(random_state=0, subsample=0.8, colsample_bytree=0.8, learning_rate= 0.1,
                               n_estimators= 150, max_depth=6, objective ='binary:logistic' ,eval_metric =roc_auc_score)
# Recursive feature elimination down to 1000 features.
# NOTE(review): step=20000 presumably removes all surplus features in a single round — check against the sklearn RFE docs.
selector = RFE(xgb_reg, n_features_to_select=1000, step=20000)
selector = selector.fit(X_train, y_train)
# Columns retained by the selector.
best_features = list(X_train.columns[selector.support_])
# selector.estimator_ is the model fitted on the retained columns only.
test_prediction = selector.estimator_.predict(X_test[best_features])
print(f'The auc for validation set: {round(roc_auc_score(y_test,test_prediction), 3)}')