## Activity Identification through Screen Text Experimentation

### Importing Required Libraries and Modules
This section imports all necessary libraries and modules for file handling, image processing, data analysis and machine learning.
- **System and file handling**: handles system operations and file management.
- **Image processing**: facilitates image processing for feature extraction and visualization tasks.
- **Data processing**: used for UI log manipulation, data operations, and analysis.
- **Machine Learning**: tools for the clustering algorithm.
- **CLIP**: pre-trained model from OpenAI, plus other imports related to feature extraction.

In [1]:
# System and file handling
import os
import shutil
import random
import gc

# Image processing and visualization
import numpy as np
from PIL import Image
import imagehash
import matplotlib.pyplot as plt

# Data processing
import pandas as pd
from scipy.optimize import linear_sum_assignment

# Machine Learning
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score

# CLIP
import torch
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

### Reading UI Log
Loads the UI log file for processing and analysis.

In [2]:
def read_ui_log_as_dataframe(log_path):
    """
    Loads a semicolon-separated UI log file from disk.

    Parameters:
    log_path (str): path of the CSV file that holds the UI log.

    Returns:
    pandas.DataFrame: the parsed UI log, one row per CSV record.
    """
    ui_log = pd.read_csv(log_path, sep=";")
    return ui_log

### Feature Extraction from Images and Text
This section includes functions to extract and combine features from images and text using the CLIP model and hashing techniques.

In [4]:
def extract_features_from_images_with_tokenizer(df, image_col, text_col, image_weight, text_weight, img_dir, log_root, header_txt, text_path_col="header_txt"):
    """
    Extracts and combines features from images and text using the CLIP model and tokenizer.

    For every row, the screenshot and its text are embedded with CLIP, each
    embedding is scaled by its weight, and both are concatenated
    (image part first) into one feature vector.

    Parameters:
    df (pandas.DataFrame): dataframe containing the data. It is modified in place.
    image_col (str): column name for image paths (relative to img_dir).
    text_col (str): column name for browser's tab text. Only used when header_txt is False.
    image_weight (float): weight for image features.
    text_weight (float): weight for text features.
    img_dir (str): directory containing the images.
    log_root (str): root directory for logs; text files are looked up in its "ocr_results" subfolder.
    header_txt (bool): if True, the text is read from the file named in text_path_col;
        otherwise the value of text_col is used directly.
    text_path_col (str): column name for ocr file paths (default is "header_txt").

    Returns:
    pandas.DataFrame: the same dataframe with a new 'combined_features' column.

    Raises:
    FileNotFoundError: if header_txt is True and a row's text file is missing.
    ValueError: if a row's image file is missing.
    """
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

    combined_features = []

    for _, row in df.iterrows():
        if header_txt:
            txt_path = os.path.join(log_root, "ocr_results", row[text_path_col])
            if not os.path.exists(txt_path):
                raise FileNotFoundError(f"Text file does not exist in: {txt_path}")
            with open(txt_path, 'r') as file:
                text = file.read()
        else:
            text = row[text_col]
        # truncation=True lets the tokenizer cut over-long text instead of failing
        text_inputs = tokenizer(text, return_tensors="pt", truncation=True)

        image_path = os.path.join(img_dir, row[image_col])
        if not os.path.exists(image_path):
            # ValueError (not FileNotFoundError) is kept so existing callers still catch it
            raise ValueError(f"Image does not exist in: {image_path}")

        # Close the screenshot's file handle as soon as its pixels are preprocessed;
        # otherwise one handle per row stays open until garbage collection.
        with Image.open(image_path) as image:
            image_inputs = processor(images=[image], return_tensors="pt")

        inputs = {
            'input_ids': text_inputs['input_ids'],
            'attention_mask': text_inputs['attention_mask'],
            'pixel_values': image_inputs['pixel_values'],
        }

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = model(**inputs)

        image_features = outputs.image_embeds.cpu().numpy().flatten() * image_weight
        text_features = outputs.text_embeds.cpu().numpy().flatten() * text_weight
        combined_features.append(np.hstack((image_features, text_features)))

    df['combined_features'] = combined_features
    return df


In [5]:
def extract_hash(df, img_dir, image_col):
    """
    Extracts hash features from images using the wavelet hash method.

    Parameters:
    df (pandas.DataFrame): dataframe containing the data. It is modified in place.
    img_dir (str): directory containing the images.
    image_col (str): column name for image paths (relative to img_dir).

    Returns:
    pandas.DataFrame: the same dataframe with a new 'combined_features' column
        holding, per row, the flattened 1-D array of the image's hash bits.
    """
    combined_features = []

    for _, row in df.iterrows():
        # Close each screenshot as soon as it has been hashed, so file handles
        # do not accumulate while iterating over the log.
        with Image.open(os.path.join(img_dir, row[image_col])) as screenshot:
            wavelet_hash = imagehash.whash(screenshot)
        combined_features.append(np.array(wavelet_hash.hash).flatten())

    df['combined_features'] = combined_features

    return df

### Image Clustering
This section includes a function to perform clustering on the extracted features from images and text. It uses agglomerative clustering to group similar images based on the provided range of cluster numbers and evaluates the clustering performance using various metrics.

In [6]:
def cluster_images(df, n_clusters_range, use_pca, n_components):
    """
    Performs clustering on the features and evaluates the clustering results.

    An agglomerative clustering is fitted for every candidate number of
    clusters; the candidate with the highest silhouette score is selected and
    its labels are written to df['activity_label'].

    Parameters:
    df (pandas.DataFrame): dataframe containing the 'combined_features' column. It is modified in place.
    n_clusters_range (tuple): arguments forwarded to range(), e.g. (2, 11) evaluates k = 2..10
        (the stop value is exclusive).
    use_pca (bool): whether to apply PCA for dimensionality reduction.
    n_components (int or float): forwarded to PCA(n_components=...) if PCA is applied.

    Returns:
    pandas.DataFrame: dataframe with the assigned cluster labels.
    dict: dictionary containing clustering scores for different numbers of clusters.
    int: optimal number of clusters based on silhouette score.
    dict: optimal clustering metrics (Silhouette score, Davies-Bouldin score, Calinski-Harabasz score).

    Raises:
    ValueError: if n_clusters_range does not contain any cluster count.
    """
    # Fail early with a clear message instead of an opaque argmax error later on
    candidate_ks = list(range(*n_clusters_range))
    if not candidate_ks:
        raise ValueError(
            f"n_clusters_range {n_clusters_range!r} does not contain any number of clusters to evaluate"
        )

    features = np.array(df['combined_features'].tolist())

    if use_pca:
        pca = PCA(n_components=n_components)
        features = pca.fit_transform(features)

    clustering_scores = {
        'n_clusters': [],
        'silhouette_score': [],
        'davies_bouldin_score': [],
        'calinski_harabasz_score': []
    }
    # Labels of every candidate are kept so the winner does not need to be re-fitted
    labels_per_k = []

    for k in candidate_ks:
        labels = AgglomerativeClustering(n_clusters=k).fit(features).labels_
        labels_per_k.append(labels)

        clustering_scores['n_clusters'].append(k)
        clustering_scores['silhouette_score'].append(silhouette_score(features, labels))
        clustering_scores['davies_bouldin_score'].append(davies_bouldin_score(features, labels))
        clustering_scores['calinski_harabasz_score'].append(calinski_harabasz_score(features, labels))

    # Highest silhouette wins; on ties np.argmax keeps the first (smallest) k
    optimal_index = np.argmax(clustering_scores['silhouette_score'])
    optimal_clusters = clustering_scores['n_clusters'][optimal_index]

    df['activity_label'] = labels_per_k[optimal_index]

    optimal_metrics = {
        'silhouette_score': clustering_scores['silhouette_score'][optimal_index],
        'davies_bouldin_score': clustering_scores['davies_bouldin_score'][optimal_index],
        'calinski_harabasz_score': clustering_scores['calinski_harabasz_score'][optimal_index]
    }

    return df, clustering_scores, optimal_clusters, optimal_metrics


### Configuration Settings for UI logs
This section defines configuration settings for various log paths, image directories, and columns. These configurations are used to set up the environment for processing different sets of UI logs.

In [31]:
# Problem 1 - Invoice Resolution - Customer details with ID
# Locations of the UI log and of its screenshots
log_root = "logs/invoice_def"
log_path = "logs/invoice_def/log.csv"
image_dir = "resources/invoice_def"
# Column names of the UI log
image_col, text_col, timestamp_col = "screenshot", "header", "timestamp"

In [24]:
# Problem 2 - Invoice Resolution - Customer details generalized
# Locations of the UI log and of its screenshots
# NOTE(review): log_path points inside 'logs/invoice_def' while log_root is
# 'logs/invoice_def_customer_view' - confirm this is intended.
log_root = "logs/invoice_def_customer_view"
log_path = "logs/invoice_def/log_customer_view.csv"
image_dir = "resources/invoice_def_customer_view"
# Column names of the UI log
image_col, text_col, timestamp_col = "screenshot", "header", "timestamp"

In [8]:
# Problem 3 - Payment Notification - Single User recording
# Locations of the UI log and of its screenshots
log_root = "logs/SC50_Rebuild"
log_path = "logs/SC50_Rebuild/log.csv"
image_dir = "resources/SC50_Rebuild"
# Column names of the UI log
image_col, text_col, timestamp_col = "screenshot", "header", "timestamp"

In [211]:
# Problem 4 - Payment Notification - Multi-user recording
# Locations of the UI log and of its screenshots
# NOTE(review): log_root and log_path are the same as in Problem 3; only
# image_dir differs - confirm this is intended for the multi-user recording.
log_root = "logs/SC50_Rebuild"
log_path = "logs/SC50_Rebuild/log.csv"
image_dir = "resources/SC50_Hybrid"
# Column names of the UI log
image_col, text_col, timestamp_col = "screenshot", "header", "timestamp"

### Auxiliary Functions for Execution
This section includes auxiliary functions used in executions. These functions handle tasks such as overwriting CSV files, moving files, clearing caches, loading fresh data, and calculating the accuracy of clustering results.

In [9]:
def overwrite_csv(df, file_path):
    """
    Overwrites an existing CSV file with the DataFrame content.

    The data is first written to a temporary sibling file ("<file_path>.tmp")
    and then moved over the target, so a failed write never destroys the
    previous version of the file. Errors are reported on stdout and not
    re-raised (best-effort save).

    Parameters:
    df (pandas.DataFrame): dataframe to be saved (written without the index).
    file_path (str): path where the CSV file will be saved.

    """
    tmp_path = f"{file_path}.tmp"
    try:
        df.to_csv(tmp_path, index=False)
        # os.replace overwrites an existing destination in a single step
        os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"Error writing CSV file: {e}")
        # Do not leave a partially written temporary file behind
        try:
            os.remove(tmp_path)
        except OSError:
            pass

def move_and_overwrite(source, destination):
    """
    Moves a file from source to destination, overwriting the destination file if it exists.

    The destination is only removed once the move is known to be possible,
    so a missing source or a move onto itself never deletes existing data.

    Parameters:
    source (str): Path of the source file.
    destination (str): Path of the destination file.

    Raises:
    FileNotFoundError: if source does not exist (destination is left untouched).

    """
    # lexists so that a dangling symlink still counts as a movable source
    if not os.path.lexists(source):
        raise FileNotFoundError(f"Source file does not exist: {source}")

    # Moving a path onto itself is a no-op; removing it first would lose the file
    same_path = (
        os.path.normcase(os.path.abspath(source))
        == os.path.normcase(os.path.abspath(destination))
    )
    if same_path:
        return

    if os.path.exists(destination):
        os.remove(destination)
    shutil.move(source, destination)
    
def clear_caches():
    """
    Runs a garbage-collection pass and, when CUDA is available,
    calls torch.cuda.empty_cache().
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def load_fresh_data():
    """
    Re-reads the UI log located at the module-level `log_path`.

    Returns:
    pandas.DataFrame: newly loaded dataframe containing the UI log data.

    """
    fresh_log = read_ui_log_as_dataframe(log_path)
    return fresh_log

def accuracy_calculation(df, activity_label, ground_truth_colname='ground_truth'):
    """
    Scores a clustering against ground truth labels with macro precision, recall and F1.

    Cluster ids are first matched one-to-one to ground truth labels so that the
    total number of rows on which matched pairs agree is as large as possible.
    Rows whose cluster could not be matched get the prediction -1. The matched
    predictions are stored in df['mapped_prediction'] and the three metrics
    are printed.

    Parameters:
    df (pandas.DataFrame): dataframe containing the data with predicted and ground truth labels.
        It is modified in place ('mapped_prediction' column).
    activity_label (str): column name for predicted activity labels.
    ground_truth_colname (str): column name for ground truth labels (default is 'ground_truth').

    Returns:
    tuple: F1-score, precision, and recall for the clustering results.

    """
    predicted_series = df[activity_label]
    truth_series = df[ground_truth_colname]
    cluster_ids = predicted_series.unique()
    truth_ids = truth_series.unique()

    # One boolean mask per ground truth label, computed once and reused for every cluster
    truth_masks = [truth_series == truth_id for truth_id in truth_ids]

    # Negative overlap counts: minimising the cost maximises the agreement
    overlap_cost = np.zeros((len(cluster_ids), len(truth_ids)))
    for r, cluster_id in enumerate(cluster_ids):
        cluster_mask = predicted_series == cluster_id
        for c, truth_mask in enumerate(truth_masks):
            overlap_cost[r, c] = -(cluster_mask & truth_mask).sum()

    matched_rows, matched_cols = linear_sum_assignment(overlap_cost)
    cluster_mapping = {}
    for r, c in zip(matched_rows, matched_cols):
        cluster_mapping[cluster_ids[r]] = truth_ids[c]

    # Clusters left without a partner map to NaN and are replaced by -1
    df['mapped_prediction'] = df[activity_label].map(cluster_mapping)
    df['mapped_prediction'] = df['mapped_prediction'].fillna(-1)

    scores = {}
    for metric_name, scorer in (('precision', precision_score), ('recall', recall_score), ('f1', f1_score)):
        scores[metric_name] = scorer(
            df[ground_truth_colname],
            df['mapped_prediction'],
            average='macro',
            zero_division=0,
        )

    precision = scores['precision']
    recall = scores['recall']
    f1 = scores['f1']
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {f1}")

    return f1, precision, recall

### Execute and Save Run
This section sets up initial configurations and parameters, runs multiple executions with different weights for image and text features, and saves the results for further analysis.

In [29]:
# Feature extraction switches
model = 'clip'
tokenize = True  # Always True
header_txt = True  # Should we use the full text?
use_hash = False  # Should we use hash?

# Clustering settings
use_pca = False
n_components = 0.95
n_clusters_range = (2, 11)

# Select directory to save results
ground_truth_colname = 'ground_truth'
case_study_name = "invoice_full"
root_dir = os.path.join("executions", case_study_name)
os.makedirs(root_dir, exist_ok=True)

results = []

# Configuration per execution: one (image_weight, text_weight) pair per run,
# numbered from 1. The pure-image and pure-text runs use the ints 1 and 0.
executions = [
    {'exec': run_number, 'image_weight': image_w, 'text_weight': text_w}
    for run_number, (image_w, text_w) in enumerate(
        [(1, 0), (0.8, 0.2), (0.6, 0.4), (0.5, 0.5), (0.4, 0.6), (0.2, 0.8), (0, 1)],
        start=1,
    )
]



In [None]:
# Run every configured execution: extract features, cluster, score and save.
# The loop variable is not called `exec`, to avoid shadowing the builtin.
for execution in executions:
    # Load the UI log data into a fresh DataFrame for this execution
    df = read_ui_log_as_dataframe(log_path)
    clear_caches()

    # Set random seeds for reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

    # Set image and text weights
    image_weight = execution['image_weight']
    text_weight = execution['text_weight']

    # Create directory for the current execution
    exec_dir = f"{case_study_name}_{image_weight}_{text_weight}"
    exec_path = os.path.join(root_dir, exec_dir)
    os.makedirs(exec_path, exist_ok=True)

    # Extract features using hash or CLIP.
    # use_hash is checked first: tokenize is always True, so checking it first
    # would make the hash branch unreachable.
    if use_hash:
        df = extract_hash(df, image_dir, image_col)
    elif tokenize:
        df = extract_features_from_images_with_tokenizer(df, image_col, text_col, image_weight, text_weight, image_dir, log_root, header_txt, text_path_col='header_txt')
    else:
        # Without features the clustering step cannot run
        raise ValueError("No feature extractor selected: enable 'tokenize' or 'use_hash'")

    # Perform clustering on the features
    df, clustering_scores, optimal_clusters, optimal_metrics = cluster_images(df, n_clusters_range, use_pca, n_components)

    # Calculate accuracy metrics for the clustering
    f1, precision, recall = accuracy_calculation(df, 'activity_label', ground_truth_colname)

    # Save the DataFrame with clustering results to CSV
    df.to_csv(os.path.join(exec_path, 'df.csv'), index=False)

    # Append the results of the current execution
    results.append({
        'exec': execution['exec'],
        'image_weight': image_weight,
        'text_weight': text_weight,
        'Silhouette': optimal_metrics['silhouette_score'],
        'Davies-Bouldin': optimal_metrics['davies_bouldin_score'],
        'Calinski-Harabasz': optimal_metrics['calinski_harabasz_score'],
        'F1-Score': f1,
        'Precision': precision,
        'Recall': recall,
    })

# Save all execution results to a CSV file
results_df = pd.DataFrame(results)
overwrite_csv(results_df, os.path.join(root_dir, 'resultados.csv'))

# Preview the DataFrame of the last execution
df.head()
