# Pre processing process
#### Sources adds in the page title a string to identify the provenance of the item. Since we don't want to consider the source in the clustering, we preprocess the data to remove that string from the page title before the embedding.

In [1]:
import argparse
import os
import json
import time
import torch

In [2]:
def assign_device(device_name):
    if device_name == "cuda":
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("CUDA is available. Using GPU.")
            print(f"CUDA Device Count: {torch.cuda.device_count()}")
            for i in range(torch.cuda.device_count()):
                print(f"Device {i}: {torch.cuda.get_device_name(i)}")
        else:
            print("CUDA is not available. Falling back to CPU.")
            device = torch.device("cpu")
    elif device_name == "mps":
        if torch.backends.mps.is_available():
            device = torch.device("mps")
            print("MPS is available. Using Apple Silicon GPU.")
        else:
            print("MPS is not available. Falling back to CPU.")
            device = torch.device("cpu")
    else:
        device = torch.device("cpu")
    # Display available devices
    print(f"Device: {device}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    print(f"MPS Available: {torch.backends.mps.is_available()}")
   
    return device

In [3]:
device = assign_device('mps')
print(device)

MPS is available. Using Apple Silicon GPU.
Device: mps
CUDA Available: False
MPS Available: True
mps


In [4]:
import os
import json
from collections import defaultdict
from difflib import SequenceMatcher 

In [24]:
def find_common_prefix(strings):
 
    if not strings:
        return ""
    prefix = strings[0]
    for string in strings[1:]:
        prefix = common_prefix_two_strings(prefix, string)
        if not prefix:
            break
    return prefix

In [19]:
def common_prefix_two_strings(str1, str2):
    matcher = SequenceMatcher(None, str1, str2)
    match = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return str1[match.a: match.a + match.size]

In [27]:
def process_directory(directory):
    file2pagetitle = {}
    for _, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(directory, file)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    if data:
                        page_title = data['<page title>']
                        file2pagetitle[file] = page_title
                    else:
                        print(f"Empty JSON file: {filepath}")
            except Exception as e:
                print(f"Error reading {filepath}: {e}")
                
    prefix = find_common_prefix(list(file2pagetitle.values()))
    
    for key, value in file2pagetitle.items():
        file2pagetitle[key] = value.replace(prefix, '')
        
    return file2pagetitle


def process_sources(root_dir):
    source2files = {}
    for _, dirnames, filenames in os.walk(root_dir):
        for dirname in dirnames:
            source2files[dirname] = process_directory(root_dir + '/' + dirname)
            
    try:
        output_filepath = os.path.join(root_dir + '/../output/preprocessed.json')
        with open(output_filepath, 'w', encoding='utf-8') as f:
            json.dump(source2files, f, ensure_ascii=False, indent=4)
        print(f"Written to {output_filepath}")
    except Exception as e:
        print(f"Error writing {output_filepath}: {e}")
    


In [28]:
root_directory = '/Users/alessandropesare/software/GitHub/ATCSBDIntegration/linkage-project/dataset/2013_monitor_specs'
process_sources(root_directory)


Error writing /Users/alessandropesare/software/GitHub/ATCSBDIntegration/linkage-project/dataset/2013_monitor_specs/../output/preprocessed.json: name 'new_data' is not defined
