## 1 Fetching Data from Supabase

In [3]:
from supabase import create_client, Client
import pandas as pd
import os as os
from dotenv import load_dotenv

load_dotenv()

# Initialize Supabase client
# Both settings are read from the environment (populated by load_dotenv() above).
url: str = os.environ.get("SUPABASE_PUBLIC_URL")
key: str = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")

# os.environ.get() returns None for an unset variable; fail fast with a readable
# message instead of handing None to create_client().
_missing_settings = [
    name
    for name, value in (("SUPABASE_PUBLIC_URL", url), ("SUPABASE_SERVICE_ROLE_KEY", key))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing Supabase configuration; set these environment variables "
        f"(e.g. in .env): {', '.join(_missing_settings)}"
    )

supabase: Client = create_client(url, key)

# Fetch data from each table
# NOTE(review): every query is an unpaginated select("*"); confirm that the
# server-side row limit cannot truncate these results.

# Lookup tables ("_"-prefixed)
_acceptance_data = supabase.table("_acceptance").select("*").execute().data
_actions_data = supabase.table("_actions").select("*").execute().data
_app_names_data = supabase.table("_app_names").select("*").execute().data
_place_types = supabase.table("_place_types").select("*").execute().data
_rel_user_location_place_types_data = supabase.table("_rel_user_location_place_types").select("*").execute().data
_sex = supabase.table("_sex").select("*").execute().data

# User tables
user_app_usage_data = supabase.table("user_app_usage").select("*").execute().data
user_behavior_data = supabase.table("user_behavior").select("*").execute().data
user_location_data = supabase.table("user_location").select("*").execute().data
users_data = supabase.table("users").select("*").execute().data

2023-11-21 13:37:14,533:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_acceptance?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,545:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_actions?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,555:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_app_names?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,568:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_place_types?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,584:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_rel_user_location_place_types?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,596:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/_sex?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,608:INFO - HTTP Request: GET https://supabase.cloud-atlas.xyz/rest/v1/user_app_usage?select=%2A "HTTP/1.1 200 OK"
2023-11-21 13:37:14,621:INFO - HTTP Request: GET https://supabase.cloud

In [129]:
# Wrap each fetched row list in its own pandas DataFrame.

# Lookup tables ("_"-prefixed)
(
    df__acceptance,
    df__actions,
    df__app_names,
    df__place_types,
    df__rel_user_location_place_types,
    df__sex,
) = (
    pd.DataFrame(rows)
    for rows in (
        _acceptance_data,
        _actions_data,
        _app_names_data,
        _place_types,
        _rel_user_location_place_types_data,
        _sex,
    )
)

# User tables
(
    df_user_app_usage,
    df_user_behavior,
    df_user_location,
    df_users,
) = (
    pd.DataFrame(rows)
    for rows in (
        user_app_usage_data,
        user_behavior_data,
        user_location_data,
        users_data,
    )
)

# Peek at the first rows of one frame as a sanity check
df_user_location.head()

Unnamed: 0,id,created_at,user_id,start_time,end_time,workplace,home
0,1,2023-11-19T16:00:21.232935+00:00,f7427faa-c131-402a-9578-b742aaf3b5bd,2023-11-19T10:01:01+00:00,2023-11-19T12:02:02+00:00,False,True


## 2 Data Preprocessing

### 2.1 Calculate/simplify data functions

In [134]:
from datetime import datetime

# Start functions

def convert_boolean_to_numeric(df_original, column_name):
    """
    Return a copy of the DataFrame with one boolean column recast as integers.

    True becomes 1 and False becomes 0. The input DataFrame is not modified.
    """
    as_integers = df_original[column_name].astype(int)

    result = df_original.copy()
    result[column_name] = as_integers
    return result

def convert_string_to_date(df_original, dob_column):
    """
    Parse a date-of-birth column and add an 'age' column in completed years.

    Parameters:
        df_original: source DataFrame; it is not modified.
        dob_column: name of the column holding dates of birth (strings or
            datetimes accepted by pd.to_datetime).

    Returns:
        A copy of the DataFrame where `dob_column` is converted to datetime and
        a new 'age' column holds the age relative to the current local date.
        Rows whose date of birth is missing get NaN as age.
    """
    df = df_original.copy()
    df[dob_column] = pd.to_datetime(df[dob_column])

    # Take "today" once so every row is measured against the same date, even if
    # the call happens to run across midnight or a year boundary.
    today = datetime.now()

    def completed_years(dob):
        if pd.isna(dob):
            return float('nan')
        # Subtract one year when this year's birthday has not happened yet.
        birthday_pending = (today.month, today.day) < (dob.month, dob.day)
        return today.year - dob.year - birthday_pending

    df['age'] = df[dob_column].apply(completed_years)
    return df


def calculate_duration_and_drop_end_time(df_original, start_time_col='start_time', end_time_col='end_time'):
    """
    Add a 'duration' column (seconds between start and end) and drop the end column.

    Parameters:
        df_original: source DataFrame; it is not modified.
        start_time_col: name of the column holding interval starts.
        end_time_col: name of the column holding interval ends; removed from
            the result.

    Returns:
        A copy of the DataFrame with a float 'duration' column in seconds and
        without `end_time_col`. `start_time_col` is left as it was.
    """
    df = df_original.copy()

    # utc=True converts tz-aware values to UTC and treats tz-naive values as
    # UTC, so naive timestamps no longer raise as they did with tz_convert.
    starts = pd.to_datetime(df[start_time_col], utc=True)
    ends = pd.to_datetime(df[end_time_col], utc=True)

    df['duration'] = (ends - starts).dt.total_seconds()
    return df.drop(columns=[end_time_col])

# Final functions

def normalize_numerical_data(df_original, column, fixed_max):
    """
    Scale one numeric column by a fixed maximum and clamp it to [0, 1].

    Returns a copy of the DataFrame; the input is not modified.
    """
    # Divide by the fixed maximum, then clamp: negatives become 0, values above
    # the maximum become 1.
    scaled = df_original[column] / fixed_max
    clamped = scaled.clip(lower=0, upper=1)

    result = df_original.copy()
    result[column] = clamped
    return result


def normalize_datetimez_data(df_original, column, fixed_min, fixed_max):
    """
    Normalizes a datetime column to a value between 0 and 1.

    Parameters:
        df_original: source DataFrame; it is not modified.
        column: name of the datetime (or datetime-like string) column.
        fixed_min: instant mapped to 0; tz-naive values are interpreted as UTC.
        fixed_max: instant mapped to 1; tz-naive values are interpreted as UTC.

    Returns:
        A copy of the DataFrame where `column` holds floats clipped to [0, 1].
        Missing timestamps stay NaN.

    Raises:
        ValueError: if `fixed_max` is not strictly later than `fixed_min`.
    """
    def as_utc(bound):
        # Put both bounds on the same footing as the column parsed with utc=True.
        stamp = pd.Timestamp(bound)
        if stamp.tzinfo is None:
            return stamp.tz_localize('UTC')
        return stamp.tz_convert('UTC')

    lower_bound = as_utc(fixed_min)
    upper_bound = as_utc(fixed_max)
    if upper_bound <= lower_bound:
        raise ValueError('fixed_max must be later than fixed_min')

    df = df_original.copy()

    # Timedelta arithmetic avoids assuming a particular integer resolution
    # (ns/us) for the timestamps and keeps missing values as NaN.
    timestamps = pd.to_datetime(df[column], utc=True)
    df[column] = (timestamps - lower_bound) / (upper_bound - lower_bound)

    # Ensure that the values do not exceed the range [0, 1]
    df[column] = df[column].clip(lower=0, upper=1)

    return df

def one_hot_encoding(df_original: pd.DataFrame, map_df: pd.DataFrame, column_to_encode: str, map_column: str, map_values: str):
    """
    Translate a column through a lookup table, then one-hot encode it.

    Every category listed in the lookup table gets an integer indicator column
    named '<column_to_encode>_<category>', including categories that never
    occur in the data. The input DataFrame is not modified.
    """
    # Lookup: raw value -> category label
    lookup = dict(zip(map_df[map_column], map_df[map_values]))

    translated = df_original.copy()
    translated[column_to_encode] = translated[column_to_encode].map(lookup)

    encoded = pd.get_dummies(translated, columns=[column_to_encode], prefix=column_to_encode)

    indicator_names = [f'{column_to_encode}_{category}' for category in map_df[map_values].unique()]

    # First make sure every expected indicator exists...
    for name in indicator_names:
        if name not in encoded.columns:
            encoded[name] = 0

    # ...then store all of them as integers.
    for name in indicator_names:
        encoded[name] = encoded[name].astype(int)

    return encoded

def one_hot_encoding_multi(df_original: pd.DataFrame, map_df: pd.DataFrame, column_to_encode: str, map_column: str, map_values: str):
    """
    Maps a column to new values and applies one-hot encoding, ensuring all categories are represented.
    This version supports a column whose cells are either a single value or a list of values,
    without creating extra rows.

    Parameters:
        df_original: source DataFrame; it is not modified.
        map_df: lookup table translating raw values to category labels.
        column_to_encode: column of `df_original` to encode; dropped from the result.
        map_column: column of `map_df` holding the raw values.
        map_values: column of `map_df` holding the category labels.

    Returns:
        A copy of the DataFrame without `column_to_encode` and with one integer
        (0/1) column '<column_to_encode>_<label>' per category. Categories from
        `map_df` come first, in lookup-table order; values that are not in the
        lookup table keep their raw value as label and follow in order of first
        appearance.
    """
    df = df_original.copy()

    # Map the column to new values
    mapping = dict(zip(map_df[map_column], map_df[map_values]))

    def labels_of(cell):
        # Treat a scalar cell as a one-element list so membership is always
        # tested against a collection (never a substring test, never `in int`).
        items = cell if isinstance(cell, list) else [cell]
        return [mapping.get(item, item) for item in items]

    labels_per_row = [set(labels_of(cell)) for cell in df[column_to_encode]]

    # A dict keeps insertion order, which makes the column order deterministic:
    # lookup-table categories first, then any extra labels seen in the data.
    categories = dict.fromkeys(map_df[map_values].unique())
    for cell in df[column_to_encode]:
        categories.update(dict.fromkeys(labels_of(cell)))

    # Passing the index explicitly keeps this valid for an empty DataFrame too.
    one_hot = pd.DataFrame(
        {
            f'{column_to_encode}_{category}': [int(category in labels) for labels in labels_per_row]
            for category in categories
        },
        index=df.index,
        dtype='int64',
    )

    # Concatenate the new columns to the original dataframe
    df = pd.concat([df, one_hot], axis=1)

    # Drop the original encoded column
    df = df.drop(columns=[column_to_encode])

    return df

def one_hot_encoding_many_to_many(df_main: pd.DataFrame, df_linking: pd.DataFrame, df_types: pd.DataFrame,
                                main_id_col, linking_main_col, linking_type_col,
                                types_id_col, types_name_col):
    """
    One-hot encodes a many-to-many relationship between two DataFrames using a linking table.

    Parameters:
        df_main: DataFrame whose rows receive the indicator columns.
        df_linking: linking table with one row per (main row, type) pair.
        df_types: lookup table of the types.
        main_id_col: key column of `df_main`.
        linking_main_col: column of `df_linking` referencing `main_id_col`.
        linking_type_col: column of `df_linking` referencing `types_id_col`.
        types_id_col: key column of `df_types`.
        types_name_col: column of `df_types` holding the type labels.

    Returns:
        `df_main` left-merged with one integer (0/1) indicator column per type.
        The merge also carries over `linking_main_col`. Rows of `df_main`
        without any linking row get 0 in every indicator column.
    """
    # Collect the distinct type ids of every main row into one list per row
    aggregated: pd.DataFrame = df_linking.groupby(linking_main_col)[linking_type_col].apply(lambda x: list(set(x))).reset_index()

    # One-hot encode the aggregated type lists
    one_hot = one_hot_encoding_multi(aggregated, df_types, linking_type_col, types_id_col, types_name_col)

    # Merge with the main DataFrame; unmatched main rows are kept (left join)
    df_main_merged = df_main.merge(one_hot, left_on=main_id_col, right_on=linking_main_col, how='left')

    # The left join leaves NaN (and float dtype) in the indicators of unlinked
    # rows; an absent link means "not of this type", i.e. 0.
    indicator_cols = [
        col for col in one_hot.columns
        if col != linking_main_col and col in df_main_merged.columns
    ]
    if indicator_cols:
        df_main_merged[indicator_cols] = df_main_merged[indicator_cols].fillna(0).astype(int)

    return df_main_merged

### 2.2 Normalize and numericalize data

In [138]:
# Apply the helper functions above to build the *_normalized frames

## df_user_app_usage
# Drop the row id / insert timestamp, then turn flag, duration and start time into numbers
df_user_app_usage_normalized = df_user_app_usage.drop(columns=['id', 'created_at'])
df_user_app_usage_normalized = convert_boolean_to_numeric(df_user_app_usage_normalized, 'should_be_blocked')
df_user_app_usage_normalized = calculate_duration_and_drop_end_time(df_user_app_usage_normalized)
df_user_app_usage_normalized = normalize_numerical_data(df_user_app_usage_normalized, 'duration', fixed_max=86400) # 24h max
# NOTE(review): datetime.now() is a naive local time and changes on every run, so the
# scaled start_time is not reproducible between runs - confirm this is intended.
df_user_app_usage_normalized = normalize_datetimez_data(df_user_app_usage_normalized, 'start_time', fixed_min=datetime(2023, 11, 15), fixed_max=datetime.now()) # scale between 2023-11-15 and now
### app_name and acceptance will be embedded

## df_user_behavior
# Only the id and time columns are removed here
df_user_behavior_normalized = df_user_behavior.drop(columns=['id', 'created_at', 'start_time', 'end_time'])
### action will be embedded

## df_user_location
# 'id' is kept until after the merge because it is the join key for the place types
df_user_location_normalized = df_user_location.drop(columns=['created_at'])
df_user_location_normalized = convert_boolean_to_numeric(df_user_location_normalized, 'workplace')
df_user_location_normalized = convert_boolean_to_numeric(df_user_location_normalized, 'home')
df_user_location_normalized = one_hot_encoding_many_to_many(df_user_location_normalized,
                                                       df__rel_user_location_place_types,
                                                       df__place_types,
                                                       'id', 'user_location', 'place_type',
                                                       'id', 'place_type')
df_user_location_normalized = df_user_location_normalized.drop(columns=['id'])

# df_users
# Derive the age from the date of birth, drop the name/date columns, scale the age, encode sex
df_users_normalized = convert_string_to_date(df_users, 'date_of_birth')
df_users_normalized = df_users_normalized.drop(columns=['date_of_birth', 'first_name', 'last_name'])
df_users_normalized = normalize_numerical_data(df_users_normalized, 'age', fixed_max=130) # max-age fixed to 130 years
df_users_normalized = one_hot_encoding(df_users_normalized, df__sex, 'sex', 'id', 'sex')


# num_acceptance_categories = df__acceptance['id'].nunique()

# display(num_acceptance_categories)

# Check the results
# display(df_user_app_usage_normalized.head())
# display(df_user_app_usage_normalized.dtypes)

# display(df__acceptance.head())
# display(df__acceptance.dtypes)

# display(df_user_behavior_normalized)

# display(df_user_location_normalized)
# display(df__rel_user_location_place_types)
# display(df__place_types.head())


# display(df_users_normalized)
# display(df__sex)


Unnamed: 0,user_id,start_time,end_time,workplace,home,user_location,place_type_electric_vehicle_charging_station,place_type_car_dealer,place_type_car_rental,place_type_car_repair,...,place_type_ferry_terminal,place_type_heliport,place_type_light_rail_station,place_type_park_and_ride,place_type_subway_station,place_type_taxi_stand,place_type_train_station,place_type_transit_depot,place_type_transit_station,place_type_truck_stop
0,f7427faa-c131-402a-9578-b742aaf3b5bd,2023-11-19T10:01:01+00:00,2023-11-19T12:02:02+00:00,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,id,created_at,user_location,place_type
0,1,2023-11-21T11:00:41.528649+00:00,1,2
1,2,2023-11-21T11:00:54.667944+00:00,1,6


Unnamed: 0,id,place_type
0,2,car_dealer
1,3,car_rental
2,4,car_repair
3,5,car_wash
4,6,electric_vehicle_charging_station


Unnamed: 0,id,age,sex_male,sex_female
0,f7427faa-c131-402a-9578-b742aaf3b5bd,0.176923,1,0


### 2.3 Merge data

In [142]:
# TODO

def optimized_merge_on_time_condition(df_app_usage, df_location):
    """
    Left-merge app-usage rows with a per-minute time grid built from location intervals.

    Parameters:
        df_app_usage: DataFrame with a 'start_time' column; it is not modified.
        df_location: DataFrame with 'start_time' and 'end_time' columns; it is
            not modified.

    Returns:
        The app-usage rows (with 'start_time' parsed to datetime) plus two
        columns from the grid: 'index' (label of the matching `df_location`
        row) and 0 (the matching grid timestamp). Rows without a match hold
        missing values in both.

    NOTE(review): matching is by exact equality between an app-usage
    'start_time' and a grid timestamp (interval start + k minutes), so a start
    time that falls inside an interval but off that grid is not matched -
    confirm whether a real interval join is wanted here.
    """
    # Work on copies / derived Series so the caller's frames keep their dtypes.
    app_usage = df_app_usage.copy()
    app_usage['start_time'] = pd.to_datetime(app_usage['start_time'])
    interval_starts = pd.to_datetime(df_location['start_time'])
    interval_ends = pd.to_datetime(df_location['end_time'])

    # One small frame per location row: the row label plus its minute grid.
    # Built explicitly so the timestamp column keeps a real datetime dtype.
    grids = [
        pd.DataFrame({'index': row_label, 0: pd.date_range(start=begin, end=finish, freq='min')})
        for row_label, begin, finish in zip(df_location.index, interval_starts, interval_ends)
    ]
    if grids:
        time_ranges = pd.concat(grids, ignore_index=True)
    else:
        # No location rows: keep the expected columns with merge-compatible dtypes.
        time_ranges = pd.DataFrame({
            'index': pd.Series(dtype=df_location.index.dtype),
            0: pd.Series(dtype=app_usage['start_time'].dtype),
        })

    # Merge the time ranges with the app-usage rows
    merged_df = pd.merge(app_usage, time_ranges, left_on='start_time', right_on=0, how='left')

    return merged_df

# Attach each user's normalized profile to that user's normalized app-usage rows
df_user_app_usage_merged = pd.merge(
    df_users_normalized,
    df_user_app_usage_normalized,
    left_on='id',
    right_on='user_id',
)

# df_user_app_usage_merged = df_user_app_usage_merged.merge(df_behavior, left_on='id', right_on='user_id', suffixes=('_location', '_behavior'))
# df_user_app_usage_merged = df_user_app_usage_merged.merge(df_app_usage, on='user_id', suffixes=('', '_app_usage'))

# Time-based merge of the raw app-usage and location frames
merged_df = optimized_merge_on_time_condition(df_user_app_usage, df_user_location)

# Show the results and the frames they were built from
for frame in (merged_df, df_user_app_usage_merged, df_user_app_usage, df_user_location):
    display(frame)

Unnamed: 0,id,created_at,app_name,start_time,end_time,user_id,acceptance,should_be_blocked
0,1,2023-11-19T16:01:15.876828+00:00,2,2023-11-19 10:00:00+00:00,2023-11-19T10:44:56+00:00,f7427faa-c131-402a-9578-b742aaf3b5bd,4,True


Unnamed: 0,id,age,sex_male,sex_female,app_name,start_time,user_id,acceptance,should_be_blocked,duration
0,f7427faa-c131-402a-9578-b742aaf3b5bd,0.176923,1,0,2,0.636203,f7427faa-c131-402a-9578-b742aaf3b5bd,4,1,0.031204


Unnamed: 0,id,created_at,app_name,start_time,end_time,user_id,acceptance,should_be_blocked
0,1,2023-11-19T16:01:15.876828+00:00,2,2023-11-19 10:00:00+00:00,2023-11-19T10:44:56+00:00,f7427faa-c131-402a-9578-b742aaf3b5bd,4,True


Unnamed: 0,id,created_at,user_id,start_time,end_time,workplace,home
0,1,2023-11-19T16:00:21.232935+00:00,f7427faa-c131-402a-9578-b742aaf3b5bd,2023-11-19 10:01:01+00:00,2023-11-19 12:02:02+00:00,False,True


## 3 TensorFlow Model

In [None]:
import tensorflow as tf

def transformer_model(num_unique_apps, num_unique_actions, num_numerical_features, embedding_dim):
    """
    Build and compile a small transformer-style binary classifier.

    Parameters:
        num_unique_apps: number of distinct app ids (the embedding gets one extra slot).
        num_unique_actions: number of distinct action ids (one extra slot as well).
        num_numerical_features: width of the numerical feature vector.
        embedding_dim: size of every token embedding.

    Returns:
        A compiled tf.keras Model with inputs 'app_input' and 'action_input'
        (variable-length id sequences) and 'numerical_input' (fixed-width
        vector), and a single sigmoid output.
    """
    layers = tf.keras.layers

    # Embedding layers for categorical variables
    app_input = layers.Input(shape=(None,), name='app_input')
    app_embedding = layers.Embedding(input_dim=num_unique_apps + 1, output_dim=embedding_dim)(app_input)

    action_input = layers.Input(shape=(None,), name='action_input')
    action_embedding = layers.Embedding(input_dim=num_unique_actions + 1, output_dim=embedding_dim)(action_input)

    # Input layer for numerical features
    numerical_input = layers.Input(shape=(num_numerical_features,), name='numerical_input')

    # Project the numerical vector to one extra token of width embedding_dim.
    # Joining along the sequence axis keeps the sequence length independent of
    # the number of numerical features.
    numerical_token = layers.Dense(embedding_dim)(numerical_input)
    numerical_token = layers.Reshape((1, embedding_dim))(numerical_token)

    # Combine everything into one token sequence: (batch, tokens, embedding_dim)
    combined = layers.Concatenate(axis=1)([app_embedding, action_embedding, numerical_token])

    # Transformer encoder block, assembled from stock layers because tf.keras
    # does not ship a TransformerEncoder layer: self-attention and a
    # feed-forward network, each with a residual connection and layer norm.
    attention = layers.MultiHeadAttention(num_heads=2, key_dim=embedding_dim)(combined, combined)
    x = layers.LayerNormalization()(layers.Add()([combined, attention]))
    feed_forward = layers.Dense(embedding_dim * 2, activation='relu')(x)
    feed_forward = layers.Dense(embedding_dim)(feed_forward)
    x = layers.LayerNormalization()(layers.Add()([x, feed_forward]))

    # Pooling & final Dense layers
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)  # Dropout layer for regularization
    output = layers.Dense(1, activation='sigmoid')(x)

    # Create and compile model (Adam lives in tf.keras.optimizers, not tf.keras.layers)
    model = tf.keras.models.Model(inputs=[app_input, action_input, numerical_input], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=['accuracy'],
    )

    return model

# Example usage: build the model with placeholder vocabulary sizes and print its layer summary
model = transformer_model(num_unique_apps=1000, num_unique_actions=100, num_numerical_features=2, embedding_dim=32)
model.summary()


## 4 Training, Evaluating, and Making Predictions

In [None]:
from sklearn.model_selection import train_test_split

# Split into train / validation / test: 20% is held out for test, then 20% of
# the remainder for validation.
# NOTE(review): `dataframe` is not defined in the visible part of the notebook - confirm
# which merged frame is meant here.
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')


# Train the model
# NOTE(review): x_train / y_train / x_val / y_val are not derived from the train/val
# splits above in the visible code - TODO build them before running this cell.
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

# Evaluate the model
# The two returned values are the loss and the 'accuracy' metric set in compile()
val_loss, val_accuracy = model.evaluate(x_val, y_val)
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')


# Predicting new data
# x_test = [app_test, action_test, numerical_test]
# NOTE(review): x_test is only sketched in the comment above; it is not assigned in the
# visible code.
predictions = model.predict(x_test)

# Convert predictions to meaningful output
# For binary classification, the output is a probability
# Threshold at 0.5: above it -> label 1, otherwise label 0
predicted_labels = [1 if p > 0.5 else 0 for p in predictions]

In [None]:
import matplotlib as plt

# `import matplotlib` on its own does not load the pyplot submodule, so import
# it explicitly instead of relying on `<alias>.pyplot` being available.
from matplotlib import pyplot

# Plot training & validation curves: one figure for accuracy, one for loss.
# Each history key has a matching 'val_<key>' entry for the validation data.
for history_key, axis_label in (('accuracy', 'Accuracy'), ('loss', 'Loss')):
    pyplot.plot(history.history[history_key])
    pyplot.plot(history.history[f'val_{history_key}'])
    pyplot.title(f'Model {history_key}')
    pyplot.ylabel(axis_label)
    pyplot.xlabel('Epoch')
    pyplot.legend(['Train', 'Val'], loc='upper left')
    pyplot.show()
