Step 1: Choose & prepare your dataset for NetworkX

In [None]:
# Standard library imports
import re
from random import randint

# Third-party imports
import os 
import numpy as np
import pandas as pd
import json
from bson import loads


import matplotlib.pyplot as plt
import networkx as nx
import nx_arangodb as nxadb
from arango import ArangoClient

# LangChain related imports
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver
from langchain_openai import ChatOpenAI
from langchain_community.graphs import ArangoGraph
from langchain_community.chains.graph_qa.arangodb import ArangoGraphQAChain
from langchain_core.tools import tool

"""
This module imports necessary libraries and modules for graph processing,
data manipulation, visualization, and language model interactions.
"""

'\nThis module imports necessary libraries and modules for graph processing,\ndata manipulation, visualization, and language model interactions.\n'

In [None]:
# The cell to check and read the raw data
BASE_RAW_DATA_FOLDER: str = os.path.expanduser("~/Documents/Code/De_Alignment/data/raw/")

# Folders to view and process
folders_to_process = ["mongodump_full_snapshot", "aiidprob", "translations"]

# Store datasets into dictionary
datasets = {}

# Read json files
def read_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

# Read bson files
def read_bson(file_path):
    with open(file_path, "rb") as f:
        return loads(f.read())

# Iteration through the folders
for folder in folders_to_process:
    folder_path = os.path.join(BASE_RAW_DATA_FOLDER, folder)
    
    # Checking if the folders exist
    if not os.path.exists(folder_path):
        print(f"Folder '{folder_path}' does not exist. Skipping.")
        continue

    # Exact all files in the folders
    raw_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    # Read them
    for file in raw_files:
        file_path = os.path.join(folder_path, file)
        dataset_name = f"{folder}_{os.path.splitext(file)[0]}"

        try:
            # Check if the files extensions are correct
            if file.endswith('.csv'):
                datasets[dataset_name] = pd.read_csv(file_path)
            elif file.endswith('.json'):
                datasets[dataset_name] = pd.DataFrame(read_json(file_path))
            elif file.endswith('.bson'):
                datasets[dataset_name] = pd.DataFrame(read_bson(file_path))
            else:
                print(f"Unsupported file format: {file} in folder {folder}")
        except Exception as e:
            print(f"Error reading file {file} in folder {folder}: {e}")

# Print preview of the dtasets for verifaction
for name, df in datasets.items():
    print(f"Dataset: {name}")
    print(df.head())
    print("\n")

Unsupported file format: .DS_Store in folder mongodump_full_snapshot
Unsupported file format: license.txt in folder mongodump_full_snapshot
Folder '/Users/sylvesterduah/Documents/Code/De_Alignment/data/raw/aiidprob' does not exist. Skipping.
Folder '/Users/sylvesterduah/Documents/Code/De_Alignment/data/raw/translations' does not exist. Skipping.
Dataset: mongodump_full_snapshot_classifications_GMF
  Namespace  Incident ID  Published  \
0       GMF            1       True   
1       GMF           13       True   
2       GMF           31       True   
3       GMF           34       True   
4       GMF           36       True   

                                       Known AI Goal  \
0  Content Recommendation, Content Search, Hate S...   
1                              Hate Speech Detection   
2                                 Autonomous Driving   
3                                 AI Voice Assistant   
4                                   Face Recognition   

                           

In [16]:
def preprocess_data(dataset, dataset_name):
    """
    Custom preprocessing function to extract nodes and edges based on dataset name.
    """
    nodes = None
    edges = None

    if dataset_name == "mongodump_full_snapshot_duplicates":
        nodes = pd.concat([dataset['duplicate_incident_number'], dataset['true_incident_number']]).unique()
        edges = dataset[['duplicate_incident_number', 'true_incident_number']]
        edges['weight'] = 1  
        
    elif dataset_name == "mongodump_full_snapshot_incidents":
        dataset['Alleged deployer of AI system'] = dataset['Alleged deployer of AI system'].apply(lambda x: x[0] if isinstance(x, list) else x)
        dataset['Alleged developer of AI system'] = dataset['Alleged developer of AI system'].apply(lambda x: x[0] if isinstance(x, list) else x)
        nodes = pd.concat([dataset['Alleged deployer of AI system'], dataset['Alleged developer of AI system']]).unique()
        edges = dataset[['Alleged deployer of AI system', 'Alleged developer of AI system']]
        edges['weight'] = 1

    elif dataset_name == "mongodump_full_snapshot_reports":
        dataset['authors'] = dataset['authors'].apply(lambda x: x[0] if isinstance(x, list) else x)
        nodes = pd.concat([dataset['authors'], dataset['source_domain']]).unique()
        edges = dataset[['authors', 'source_domain']]
        edges['weight'] = 1 

    elif dataset_name == "mongodump_full_snapshot_classifications_GMF":
        nodes = pd.concat([dataset['Known AI Goal'], dataset['Known AI Technology']]).unique()
        edges = dataset[['Known AI Goal', 'Known AI Technology']]
        edges['weight'] = 1 
        
    else:
        print(f"Skipping dataset '{dataset_name}' as it does not match any predefined structure.")
        return None, None

    nodes_df = pd.DataFrame(nodes, columns=["node"])

    return nodes_df, edges

# Preprocess all datasets
processed_data = {}
for name, dataset in datasets.items():
    try:
        nodes, edges = preprocess_data(dataset, name)
        if nodes is not None and edges is not None:
            processed_data[name] = {"nodes": nodes, "edges": edges}
    except Exception as e:
        print(f"Error processing dataset '{name}': {str(e)}")
        continue

# Save processed data
processed_data_folder = "processed/"
os.makedirs(processed_data_folder, exist_ok=True)

for name, data in processed_data.items():
    try:
        # Save nodes
        data["nodes"].to_csv(os.path.join(processed_data_folder, f"{name}_nodes.csv"), index=False)

        # Save edges
        data["edges"].to_csv(os.path.join(processed_data_folder, f"{name}_edges.csv"), index=False)

        print(f"Successfully processed and saved data for '{name}'")
    except Exception as e:
        print(f"Error saving processed data for '{name}': {str(e)}")

print("Data preprocessing complete! Processed data saved in 'processed/'.")

Skipping dataset 'mongodump_full_snapshot_classifications_CSETv1_Annotator-1' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_classifications_CSETv1_Annotator-2' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_classifications_CSETv1_Annotator-3' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_submissions' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_classifications_CSETv0' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_classifications_CSETv1' as it does not match any predefined structure.
Skipping dataset 'mongodump_full_snapshot_quickadd' as it does not match any predefined structure.
Successfully processed and saved data for 'mongodump_full_snapshot_classifications_GMF'
Successfully processed and saved data for 'mongodump_full_snapshot_duplicates'
Successfully processed and save

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['weight'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['weight'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges['weight'] = 1
