In [1]:
pip install sentence-transformers scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
import warnings
warnings.filterwarnings('ignore')


In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Cluster every distinct Deployer / Developer / Victim name by the similarity
# of its sentence embedding, then write the entity -> cluster mapping to
# "Clustered_Entities.csv".

# Load the original dataset
data = pd.read_csv("AI_Incident_Dataset.csv")

# Pool the three role columns into one de-duplicated list of names.
# Missing cells are dropped first: drop_duplicates() alone keeps one NaN, which
# would then be embedded and clustered as if it were an entity. astype(str)
# guarantees the encoder is only ever handed text.
unique_entities = (
    pd.concat([data['Deployer'], data['Developer'], data['Victim']])
    .dropna()
    .astype(str)
    .drop_duplicates()
    .tolist()
)
if not unique_entities:
    raise ValueError("No entities found in the Deployer/Developer/Victim columns.")

# Initialize the sentence embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding vector per unique entity name, in the same order as the list
embeddings = embedding_model.encode(unique_entities)

# Aim for 100 clusters, but never more than there are entities: K-Means cannot
# fit more clusters than samples.
num_clusters = min(100, len(unique_entities))
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # seeded for repeatable runs
cluster_labels = kmeans.fit_predict(embeddings)

# Map each unique entity to the cluster it was assigned to
clustered_entities = pd.DataFrame({
    'Entity': unique_entities,
    'Cluster': cluster_labels
})

# Save the clustered unique entities to a CSV
clustered_entities.to_csv("Clustered_Entities.csv", index=False)

print("Clustering completed. Results saved to 'Clustered_Entities.csv'.")


Clustering completed. Results saved to 'Clustered_Entities.csv'.


In [4]:
# Preview the first five entity -> cluster assignments.
clustered_entities.head(n=5)

Unnamed: 0,Entity,Cluster
0,judiciary of italy,17
1,meta,3
2,new york city government,26
3,eric adams administration,60
4,brookdale senior living,67


In [5]:
# Collapse the entity -> cluster mapping into one row per cluster, with all of
# that cluster's entity names gathered into a single list.
clustered_rows = clustered_entities.groupby('Cluster')['Entity'].apply(list).reset_index()

# index=False matches the other CSV exports in this notebook and avoids writing
# a redundant unnamed positional-index column next to 'Cluster'.
clustered_rows.to_csv('Stacked_Clusters.csv', index=False)

In [6]:
import pandas as pd

# For every cluster, collect the entity names that occur in each role column
# (Deployer / Developer / Victim) of the original dataset, and write one row
# per cluster to "Clustered_By_Columns.csv".

# Entity -> Cluster mapping saved by the clustering step
entity_clusters = pd.read_csv("Clustered_Entities.csv")

# Load the original dataset
original_data = pd.read_csv("AI_Incident_Dataset.csv")

# Long format: one (Column, Entity) row per cell. value_vars restricts the melt
# to the three role columns that were clustered, so any other column in the
# dataset cannot leak in as a pseudo-role.
melted_data = original_data.melt(
    value_vars=["Deployer", "Developer", "Victim"],
    var_name="Column",
    value_name="Entity",
)

# Drop missing names (pandas would otherwise match NaN keys to each other in
# the merge below), then keep each entity once per role.
melted_data = (
    melted_data
    .dropna(subset=["Entity"])
    .drop_duplicates(subset=["Entity", "Column"])
)

# Attach the cluster id to every (role, entity) pair. The name `clustered_data`
# is kept for the merged frame, as in the original cell.
clustered_data = pd.merge(melted_data, entity_clusters, on="Entity", how="inner")

# One list of distinct entity names per (cluster, role). Sorting makes the
# order inside each list, and therefore the CSV, identical from run to run.
role_lists = (
    clustered_data
    .groupby(['Cluster', 'Column'])['Entity']
    .apply(lambda names: sorted(set(names), key=str))
    .unstack()  # roles become columns; a cluster with no entity in a role gets NaN
)

# Replace the NaN gaps with a fresh empty list per cell, rather than a single
# shared mutable fill value.
role_lists = role_lists.apply(
    lambda role: role.apply(lambda cell: cell if isinstance(cell, list) else [])
)

clustered_grouped = role_lists.reset_index()

# Save to a new CSV for analysis
clustered_grouped.to_csv("Clustered_By_Columns.csv", index=False)

print("Clusters stacked in Developer, Deployer, Victim columns saved to 'Clustered_By_Columns.csv'.")


Clusters stacked in Developer, Deployer, Victim columns saved to 'Clustered_By_Columns.csv'.


In [8]:
import ast


def _parse_entity_list(cell):
    """Turn a CSV cell such as "['a', 'b']" back into a Python list.

    Cells that are not strings, or that do not parse to a list literal, are
    returned unchanged.
    """
    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        return cell
    return parsed if isinstance(parsed, list) else cell


# Load the grouped data with cluster labels.
# NOTE(review): this file is not written by any cell visible in this notebook;
# presumably it is "Clustered_By_Columns.csv" with a "Cluster_label" column
# added by hand - confirm and document that step.
grouped_clusters = pd.read_csv("Clustered_By_Columns_labels.csv")

# Melt the data back to a long format to associate roles with clusters
long_format = grouped_clusters.melt(
    id_vars=["Cluster", "Cluster_label"],
    value_vars=["Developer", "Deployer", "Victim"],
    var_name="Role",
    value_name="Entities"
)

# After a CSV round trip the entity lists are plain strings, and explode()
# leaves strings untouched. Parse them back into real lists first so that the
# explode below really yields one row per entity.
long_format = long_format.assign(
    Entities=long_format["Entities"].map(_parse_entity_list)
)

# One row per entity; empty lists explode to NaN and are dropped here
long_format = long_format.explode("Entities").dropna(subset=["Entities"])

# Save the long format with roles and clusters
long_format.to_csv("Mapped_Entities_With_Roles.csv", index=False)
