In [159]:
# load data from csv use panda
import pandas as pd
import json
from collections import defaultdict

In [165]:
# Load the two verification datasets again to ensure correct handling
verification_v1 = pd.read_csv('new data/CLD data - Verification_data V1.csv')
verification_v2 =pd.read_csv('new data/CLD data - Verification_data V2.csv')

# Add version columns
verification_v1['version'] = 'V1'
verification_v2['version'] = 'V2'

# Concatenate DataFrames
verification_combined = pd.concat([verification_v1, verification_v2], ignore_index=True)


verification_combined['source_color'] = verification_combined['source_color'].fillna('0').astype('int').astype(str)
verification_combined['target_color'] = verification_combined['target_color'].fillna('0').astype('int').astype(str)
verification_combined['link_color'] = verification_combined['link_color'].fillna('0').astype('int').astype(str)
# Process participant_type column
verification_combined['participant_type'] = (
    verification_combined['participant_type'].str.strip().str.lower().str.replace(' ', '_')
)
verification_combined['group'] = verification_combined['participant_type'] + '_' + verification_combined['participant_number'].astype(str)

# Define column mapping
column_mapping = {
    'cause': 'source',
    'effect': 'target',
    'Master ID': 'source_master_id',
    'Node ID': 'source_node_id',
    'Master ID.1': 'target_master_id',
    'Node ID.1': 'target_node_id',
    'polarity': 'polarity',
    'participant_number': 'participant_number',
    'participant_type': 'participant_type',
    'source_color': 'source_color',
    'target_color': 'target_color',
    'Comment': 'comment'
}

# Rename columns
verification_combined.rename(columns=column_mapping, inplace=True)

# Function to categorize comments
def map_comment_to_type(row):
    if row['version'] == 'V1':
        return 'O'
    elif pd.isna(row['comment']):
        return 'K'
    
    short_comment = ''.join(w.capitalize()[0] for w in row['comment'].split())
    
    return short_comment

# Add types column
verification_combined['types'] = verification_combined.apply(map_comment_to_type, axis=1)


# Load Node_Mapping dataset
node_mapping = pd.read_csv('new data/CLD data - Node Mapping.csv')

# Prepare index mapping for source and target nodes

# Create node_v1_index_map
node_v1_index_map = node_mapping.set_index('Node ID 1')['Index'].to_dict()

# Create node_v2_index_map
node_v2_index_map = node_mapping.set_index('Node ID 2')['Index'].to_dict()

node_index_map = {**node_v1_index_map, **node_v2_index_map}

# Function to map node IDs to indices
def map_node_id_to_index(node_id, node_map):
    return str(node_map.get(node_id))

# Apply mapping for source_node_id and target_node_id
verification_combined['source_map_id'] = verification_combined['source_node_id'].apply(map_node_id_to_index, node_map=node_index_map)
verification_combined['target_map_id'] = verification_combined['target_node_id'].apply(map_node_id_to_index, node_map=node_index_map)


# Handle missing values
verification_combined.fillna('', inplace=True)

# Display the first few rows
verification_combined.head()

verification_combined.to_csv('new_data.csv')


  verification_combined.fillna('', inplace=True)


In [185]:
def get_nodes_from_df(df,node_history_map):

    nodes = []

    for _, row in df.iterrows():
        for node_type in ["source", "target"]:
            node_id_key = f"{node_type}_node_id"
            node_map_key = f"{node_type}_map_id"
            node_color_key = f"{node_type}_color"

            node_data = {
                "data": {
                    "id": row[node_map_key],
                    "key": row[node_id_key],
                    "label": row[node_type],
                    "participant_type": row["participant_type"],
                    "color": row[node_color_key],
                    "history": node_history_map.get(row[node_map_key], []),
                },
                "classes": "",
            }
            nodes.append(node_data)

    return nodes


def get_edges_from_df(df):
    edges = []
    for _, row in df.iterrows():
        if row["polarity"] == "positive":
            value = 1
            sign = "+"
        elif row["polarity"] == "negative":
            value = -1
            sign = "-"
        else:
            value = 0  # or some other default value
            sign = row["polarity"]  # or any other default handling

        edge_data = {
            "data": {
                # "id": f"{row['source_node_id']}-{row['target_node_id']}",
                "id": f"{row['source_map_id']}-{row['target_map_id']}",
                "label": sign,
                "value": value,
                "source": row["source_node_id"],
                "target": row["target_node_id"],
                "source": row["source_map_id"],
                "target": row["target_map_id"],
                "polarity": row["polarity"],
                "participant_type": row["participant_type"],
                "group": row["group"],
                "color": row["link_color"],
            }
        }
        edges.append(edge_data)

    return edges


# Split the dataset based on 'group' column
dataset_group_by_group = {
    group: data for group, data in verification_combined.groupby("group")
}


def process_dataframe(df):
    # Perform desired operations on the dataframe
    node_history_map = {}
    df_v1 = df[df['version'] == "V1"]

    print()

    for _, row in df_v1.iterrows():
        for node_type in ["source", "target"]:
            node_id_key = f"{node_type}_node_id"
            node_master_key = f"{node_type}_master_id"
            node_map_key = f"{node_type}_map_id"
            node_history_entry = {
                "id": row[node_map_key],
                "key": row[node_master_key],
                "label": row[node_type],
                "participant_type": row['participant_type'],
                "group": row['group'],
                "history": []
            }
     
            if row[node_type] in node_history_map:
                if node_history_entry not in node_history_map[row[node_type]]:
                    node_history_map[row[node_map_key]].append(node_history_entry)
            else:
                node_history_map[row[node_map_key]] = [node_history_entry]
    print(node_history_map)

    processed_data = {
        "final" if version == 'V2' else version: {
            "nodes": get_nodes_from_df(data, node_history_map),
            "edges": get_edges_from_df(data),
            # 'raw_data': data.to_dict(orient='records')
        }
        for version, data in df.groupby("version")
    }
    return processed_data


# Display each group as separate datasets with processed data
json_data = {
    group: process_dataframe(df) for group, df in dataset_group_by_group.items()
}


with open("new_data.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, ensure_ascii=False, indent=4)


{'30': [{'id': '30', 'key': 30, 'label': 'Legal permission to use data (Identify and Disrupt Act)', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '19': [{'id': '19', 'key': 19, 'label': 'Data used by law enforcement', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '24': [{'id': '24', 'key': 24, 'label': 'Exemptions in different acts', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '1': [{'id': '1', 'key': 1, 'label': 'Access to data from other entities', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '13': [{'id': '13', 'key': 13, 'label': 'Collection of data by law enforcement', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '69': [{'id': '69', 'key': 69, 'label': 'Social and political expectations for law enforcement to be using data and technologies to better police', 'participant_type': 'advocate', 'group': 'advocate_7', 'history': []}], '78': [{'id': '78',