## 1 Fetching Data from Supabase

In [None]:
import sys
from dotenv import load_dotenv
import os

# Make the repository root importable so the local `server` package resolves.
# The root is taken to be the parent of the current working directory, i.e. the
# notebook is expected to be started from a direct sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so re-running this cell does not stack duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)
from server import functions_aggregated, functions_supabase, functions_basic, functions_model

# Load variables from a local .env file into the process environment
# (presumably the Supabase credentials used by auth() below -- confirm).
load_dotenv()

supabase = functions_supabase.auth()

# fetchTables returns eight table payloads; unpack them in the order it yields them.
(
    _acceptance_data,
    _actions_data,
    _app_names_data,
    _location_data,
    _sex,
    _weekdays,
    user_app_usage_data,
    users_data,
) = functions_supabase.fetchTables(supabase)

display(user_app_usage_data)

In [None]:
# Convert each fetched table payload into its own pandas DataFrame.
# The argument order mirrors the order of the returned frames.
(
    df__acceptance,
    df__actions,
    df__app_names,
    df__location,
    df__sex,
    df__weekdays,
    df_user_app_usage,
    df_users,
) = functions_basic.toPandasDataframes(
    _acceptance_data,
    _actions_data,
    _app_names_data,
    _location_data,
    _sex,
    _weekdays,
    user_app_usage_data,
    users_data,
)

# Preview the usage table to check its structure.
df_user_app_usage.head()

## 2 Data Preprocessing

## 2.1 Remove incomplete rows/entries

In [None]:
def remove_none_rows(df, column_name):
    """
    Return a copy of ``df`` without the rows whose ``column_name`` value is missing.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Name of the single column that is checked for missing
            values (``None`` / ``NaN``).

    Returns:
        A new DataFrame containing only the rows where ``column_name`` is
        populated. Surviving rows keep their original index labels.
    """
    required_columns = [column_name]
    filtered = df.dropna(axis="index", how="any", subset=required_columns)
    return filtered

# Drop usage records that have no recorded 'app_usage_time'.
df_user_app_usage = remove_none_rows(df_user_app_usage, 'app_usage_time')

# Show a preview of the filtered frame followed by its column dtypes.
display(df_user_app_usage.head(), df_user_app_usage.dtypes)

## 2.2 Calculate/simplify data functions

### 2.2.1 Normalize and numericalize data

In [None]:
# Run the shared normalisation helper; it hands back the processed usage
# frame and the processed users frame, in that order.
(
    df_user_app_usage_normalized,
    df_users_normalized,
) = functions_aggregated.normalizeAndNumericalize(
    df__acceptance,
    df__actions,
    df__app_names,
    df__location,
    df__sex,
    df__weekdays,
    df_user_app_usage,
    df_users,
)

# num_acceptance_categories = df__acceptance['id'].nunique()
# display(num_acceptance_categories)

# Preview both results (uncomment the dtypes lines for a column-type check).
display(df_user_app_usage_normalized.head(), df_users_normalized.head())
# display(df_user_app_usage_normalized.dtypes)
# display(df_users_normalized.dtypes)


### 2.2.2 Merge data

In [None]:
# Combine the normalised usage records with the normalised user attributes.
merged_df = functions_aggregated.mergeUsersAndAppUsage(
    df_user_app_usage_normalized,
    df_users_normalized,
)

# Preview the merged frame and its column dtypes.
display(merged_df.head(), merged_df.dtypes)


## 3 TensorFlow Model

In [None]:
from tensorflow.keras.models import Model

# Every column of merged_df except the label 'should_be_blocked' is a model input.
# (The previous unconditional `merged_df.columns.tolist()` assignment was a dead
# store that this line immediately overwrote, so it has been dropped.)
feature_columns = [col for col in merged_df.columns if col != 'should_be_blocked']
display(feature_columns)

# Alternative architecture without attention, kept for quick switching:
# model: Model = functions_model.build_and_compile_model(1000, 64, feature_columns)
model: Model = functions_model.build_and_compile_model_with_attention_machanism(1000, 64, feature_columns)
model.summary()

# NOTE(review): when the notebook is run top to bottom, this save happens before
# the training cell calls model.fit, so the file presumably holds untrained
# weights -- confirm this is intended, or save again after training.
model.save('../model/model.keras')

## 4. Training the Model and Making Predictions

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition -- and therefore every
# metric and plot derived from it -- is the same on each Restart & Run All.
SPLIT_SEED = 42

# Hold out 10% of the rows as the test set, then carve 20% of the remaining
# rows off as the validation set.
train, test = train_test_split(merged_df, test_size=0.1, random_state=SPLIT_SEED)
train, val = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')
print("-------------------------------------")

# Turn a DataFrame into the (named inputs, labels) pair that the model consumes.
def prepare_data(df, feature_columns, label_column):
    """
    Split ``df`` into a dict of per-feature arrays and an array of labels.

    Args:
        df: DataFrame holding both the feature columns and the label column.
        feature_columns: Names of the columns to expose as model inputs. The
            label column is skipped if it appears in this list.
        label_column: Name of the column the model is trying to predict.

    Returns:
        A ``(features, labels)`` tuple: ``features`` maps each feature name to
        that column's values, ``labels`` holds the values of ``label_column``.
    """
    features = {}
    for column in feature_columns:
        if column == label_column:
            # The target must never be fed back in as an input.
            continue
        features[column] = df[column].values

    return features, df[label_column].values


# Build the (inputs, labels) pairs for each of the three partitions.
x_train, y_train = prepare_data(train, feature_columns, 'should_be_blocked')
x_val, y_val = prepare_data(val, feature_columns, 'should_be_blocked')
x_test, y_test = prepare_data(test, feature_columns, 'should_be_blocked')

# Debug helpers -- uncomment to inspect the prepared inputs:
# print("Train")
# display(x_train)
# display(y_train)

# print("Val")
# display(x_val)
# display(y_val)

# Fit on the training partition while tracking the validation partition.
history = model.fit(
    x_train,
    y_train,
    epochs=200,
    batch_size=32,
    validation_data=(x_val, y_val),
)

# Score the fitted model on the validation partition.
# NOTE(review): the two-value unpack assumes the model was compiled with
# exactly one metric (accuracy) in addition to the loss -- confirm.
val_loss, val_accuracy = model.evaluate(x_val, y_val)
print(
    "-------------------------------------",
    f'Validation Loss: {val_loss}',
    f'Validation Accuracy: {val_accuracy}',
    "-------------------------------------",
    sep="\n",
)

# Run inference on the held-out test partition and flatten the model output
# into a 1-D array of scores.
predictions = model.predict(x_test)
predicted_values = predictions.flatten()
display(predicted_values)


## 5. Results

### 5.1 Training and validation loss and accuracy

In [None]:
import matplotlib.pyplot as plt

# display(history.history)

# One entry per figure: the history keys to read, the line formats, and the
# exact texts to show. Both figures share the same layout.
history_plots = [
    {
        'train_key': 'loss', 'train_fmt': 'b', 'train_label': 'Train Loss',  # blue
        'val_key': 'val_loss', 'val_fmt': 'r', 'val_label': 'Validation Loss',  # red
        'title': 'Training and Validation Loss', 'ylabel': 'Loss',
    },
    {
        'train_key': 'accuracy', 'train_fmt': 'g', 'train_label': 'Train Accuracy',  # green
        'val_key': 'val_accuracy', 'val_fmt': 'm', 'val_label': 'Validation Accuracy',  # magenta
        'title': 'Training and Validation Accuracy', 'ylabel': 'Accuracy',
    },
]

for spec in history_plots:
    plt.figure(figsize=(10, 5))
    # Per-epoch values recorded by model.fit for the training and validation data.
    plt.plot(history.history[spec['train_key']], spec['train_fmt'], label=spec['train_label'])
    plt.plot(history.history[spec['val_key']], spec['val_fmt'], label=spec['val_label'])
    plt.title(spec['title'])
    plt.ylabel(spec['ylabel'])
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()


### 5.2 Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Ground truth comes straight from the test partition; predicted classes are
# the model scores thresholded at 0.5.
true_labels = y_test
predictions = list(predicted_values > 0.5)

# display(true_labels)
# display(predictions)

conf_matrix = confusion_matrix(true_labels, predictions)

# Draw the matrix as an annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()

### 5.3 ROC Curve and AUC Score

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Compute ROC curve and ROC area.
# The curve is built by sweeping a threshold over continuous scores, so it must
# be fed the raw model outputs (`predicted_values`). `predictions` holds the
# already-thresholded True/False labels from the confusion-matrix cell, which
# would collapse the curve to a single operating point and distort the AUC.
fpr, tpr, _ = roc_curve(y_test, predicted_values)
roc_auc = auc(fpr, tpr)

# Plotting
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

### 5.4 Precision-Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt

# Calculate precision and recall across all decision thresholds.
# Like the ROC curve, this needs the continuous model scores
# (`predicted_values`); the thresholded True/False `predictions` list would
# yield only a single non-trivial precision/recall point.
precision, recall, _ = precision_recall_curve(y_test, predicted_values)

# Area under the precision-recall curve (recall on the x axis).
pr_auc = auc(recall, precision)

# Plotting
plt.figure()
plt.plot(recall, precision, color='blue', lw=2, label='Precision-Recall curve (area = %0.2f)' % pr_auc)
plt.fill_between(recall, precision, alpha=0.2, color='blue')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.show()

### 5.5 Feature Importance

In [None]:
import shap
import numpy as np

def model_predict(data):
    """
    Adapter that lets SHAP call the dict-input Keras model with a plain 2-D array.

    Args:
        data: 2-D array whose i-th column holds the values of the i-th key of
            the notebook-level ``x_train`` dict.

    Returns:
        Whatever ``model.predict`` returns for the re-assembled named inputs.
    """
    named_inputs = {}
    for position, feature_name in enumerate(x_train.keys()):
        # Column `position` of the matrix becomes the input called `feature_name`.
        named_inputs[feature_name] = data[:, position]
    return model.predict(named_inputs)

# Background dataset for the explainer: the first 100 training rows, stacked
# into a matrix with one column per feature (same order as x_train's keys).
background_data = np.array([values[:100] for values in x_train.values()]).T

# Kernel explainer driven by the array-to-dict adapter defined above.
explainer = shap.KernelExplainer(model_predict, background_data)

# Explain only the first 100 test rows to keep the run time manageable.
test_sample = np.array([values[:100] for values in x_test.values()]).T
shap_values = explainer.shap_values(test_sample)

In [None]:
# Plot a summary of the SHAP values computed above, labelled with the model's
# input feature names.
shap.summary_plot(
    shap_values,
    feature_names=list(x_train.keys()),
    plot_size=(15, 7),
)

### 5.6 Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator

class KerasModelWrapper(BaseEstimator):
    """
    Minimal scikit-learn style adapter around a Keras model that takes named inputs.

    scikit-learn utilities pass features as a 2-D array, while the wrapped model
    is fed a dict of per-feature arrays. The wrapper converts between the two and
    exposes ``fit``, ``score`` and ``predict``.

    Args:
        model: Compiled Keras model to train and query.
        feature_columns: Feature names, in the same order as the columns of the
            ``X`` arrays passed to ``fit`` / ``score`` / ``predict``.

    NOTE(review): ``fit`` continues training whatever weights ``model`` already
    holds. If the model was trained earlier in the notebook, the learning curve
    does not start from a fresh model -- confirm this is intended.
    """

    # Name of the label column; it is never forwarded to the model as an input.
    _LABEL_COLUMN = 'should_be_blocked'

    def __init__(self, model, feature_columns):
        self.model = model
        self.feature_columns = feature_columns

    def _to_feature_dict(self, X):
        """Map each column of the 2-D array ``X`` to its feature name."""
        # Drop the label BEFORE numbering the columns, so the indices stay
        # aligned with an X matrix that was built without the label column.
        input_columns = [col for col in self.feature_columns if col != self._LABEL_COLUMN]
        return {col: X[:, i] for i, col in enumerate(input_columns)}

    def _predict_scores(self, X):
        """Return the raw model outputs for ``X`` as a flat 1-D array."""
        return self.model.predict(self._to_feature_dict(X)).flatten()

    def fit(self, X, y):
        """Train the wrapped model on ``X`` / ``y`` and return ``self``."""
        self.model.fit(self._to_feature_dict(X), y, epochs=200, batch_size=32, verbose=0)
        return self

    def score(self, X, y):
        """Return the accuracy of the 0.5-thresholded predictions against ``y``."""
        return accuracy_score(y, self._predict_scores(X) > 0.5)

    def predict(self, X):
        """Return binary class labels (0/1) as a 1-D array of shape (n_samples,)."""
        # Flattened like score(), so scikit-learn scorers receive 1-D labels
        # instead of an (n_samples, 1) column.
        return (self._predict_scores(X) > 0.5).astype(int)


# Wrap the Keras model so scikit-learn can drive fitting and scoring.
wrapped_model = KerasModelWrapper(model, feature_columns)

# Stack the per-feature training arrays into a (samples, features) matrix,
# leaving out the label column should it be present.
learning_curve_inputs = np.array(
    [values for name, values in x_train.items() if name != 'should_be_blocked']
).T

# Cross-validated accuracy at five training-set sizes from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    wrapped_model,
    X=learning_curve_inputs,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

# Aggregate across the CV folds (axis 1): mean and spread per training size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Plot both curves with a +/- one standard deviation band around each.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(train_sizes, train_mean, label='Training score', color='blue', marker='o')
ax.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, color='blue', alpha=0.15)

ax.plot(train_sizes, test_mean, label='Cross-validation score', color='green', marker='s')
ax.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, color='green', alpha=0.15)

ax.set_title('Learning curve')
ax.set_xlabel('Training Data Size')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower right')
ax.grid()
plt.show()

### 5.7 Error Analysis

In [None]:
import pandas as pd

## Get predictions and compare them with the actual labels

# Re-run inference on the test inputs and threshold the scores at 0.5.
y_pred_probs = model.predict(x_test)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# One row per test example: true label, predicted label, and whether they agree.
error_analysis_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
error_analysis_df['Correct'] = error_analysis_df['Actual'] == error_analysis_df['Predicted']

## Analyze errors

# False positives: actual 0 but predicted 1.
is_false_positive = (error_analysis_df['Actual'] == 0) & (error_analysis_df['Predicted'] == 1)
false_positives = error_analysis_df[is_false_positive]

# False negatives: actual 1 but predicted 0.
is_false_negative = (error_analysis_df['Actual'] == 1) & (error_analysis_df['Predicted'] == 0)
false_negatives = error_analysis_df[is_false_negative]

## Visualize the errors

# Both frames are built from the same test arrays, so they share the same
# default row index and the error rows can be looked up by label.
x_test_df = pd.DataFrame(x_test)
fp_indices = false_positives.index
fn_indices = false_negatives.index

# One histogram figure per feature, overlaying the two error types.
for feature in x_test.keys():
    fp_feature_values = x_test_df.loc[fp_indices, feature]
    fn_feature_values = x_test_df.loc[fn_indices, feature]

    plt.figure(figsize=(10, 6))
    for error_label, error_values in (
        ('False Positives', fp_feature_values),
        ('False Negatives', fn_feature_values),
    ):
        plt.hist(error_values, alpha=0.5, bins=20, label=error_label)
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.title(f'Error Analysis for {feature}')
    plt.legend()
    plt.show()