# Get the graph structure based on ids

Start with a basic example graph, then try to extend the approach by year later on.

In [1]:
import pandas as pd
import numpy as np
from itertools import combinations

In [2]:
# Relative location of the BoardEx sample file (boardex_sample_1000.csv)
file_path = "../2.Initial_Graph_Building/initial_datasets/boardex_sample_1000.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to get a feel for the table's structure
data.head(5)

Unnamed: 0,Associationtype,Boardname,Companyname,Directorname,Overlapyearstart,Overlapyearend,Role,Associatedrole,Conncompanyorgtype,Boardid,...,Roletitle,Roleboardposition,Roleedflag,Overlapyearstart_int,Overlapyearend_int,Startcompanydatestartrole,Startcompanydateendrole,Conncompanydatestartrole,Conncompanydateendrole,Orgtype
0,Other,1 800 CONTACTS INC (De-listed 09/2007),Rhode Island School of Design (RISD),Steve Key,2006,2007,Independent Director (Brd) (SD),Vice Chairman,Universities,3,...,Independent Director,Brd,SD,2006,2007.0,2005-07-28,2007-09-07,,,Quoted
1,Unlisted Org,1 800 CONTACTS INC (De-listed 09/2007),Key Consulting LLC,Steve Key,2005,2007,Independent Director (Brd) (SD),Sole Proprietor (Non-Brd),Private,3,...,Independent Director,Brd,SD,2005,2007.0,2005-07-28,2007-09-07,2001-01-01,,Quoted
2,Unlisted Org,1 800 CONTACTS INC (De-listed 09/2007),JD Watkins Enterprises Inc,Steve Key,2006,2007,Independent Director (Brd) (SD),Vice Chairman/COO (Brd) (ED),Private,3,...,Independent Director,Brd,SD,2006,2007.0,2005-07-28,2007-09-07,2006-01-01,,Quoted
3,Listed Org,1 800 CONTACTS INC (De-listed 09/2007),SITEL CORP (De-listed 01/2007),Steve Key,2006,2007,Independent Director (Brd) (SD),Independent Director (Brd) (SD),Quoted,3,...,Independent Director,Brd,SD,2006,2007.0,2005-07-28,2007-09-07,2006-08-04,2007-01-31,Quoted
4,Unlisted Org,1 800 CONTACTS INC (De-listed 09/2007),JD Watkins Enterprises Inc,Steve Key,2005,2006,Independent Director (Brd) (SD),Vice Chairman/CFO (Brd) (ED),Private,3,...,Independent Director,Brd,SD,2005,2006.0,2005-07-28,2007-09-07,2001-02-01,2006-01-01,Quoted


In [3]:
# List the column labels of the loaded table
data.columns

Index(['Associationtype', 'Boardname', 'Companyname', 'Directorname',
       'Overlapyearstart', 'Overlapyearend', 'Role', 'Associatedrole',
       'Conncompanyorgtype', 'Boardid', 'Companyid', 'Directorid', 'Roletitle',
       'Roleboardposition', 'Roleedflag', 'Overlapyearstart_int',
       'Overlapyearend_int', 'Startcompanydatestartrole',
       'Startcompanydateendrole', 'Conncompanydatestartrole',
       'Conncompanydateendrole', 'Orgtype'],
      dtype='object')

In [4]:
# Keep only the three id columns needed to build the graph.
# .copy() makes this an independent DataFrame: a later cell adds a 'company'
# column to it, and assigning into a frame derived from `data` by selection
# is what triggers pandas' SettingWithCopyWarning.
graph_simplified_df = data[["Boardid", "Companyid", "Directorid"]].copy()

In [18]:
# Keep only the rows for the two directors under comparison
directors_of_interest = [335970, 335975]
is_selected = data['Directorid'].isin(directors_of_interest)
filtered_data = data[is_selected]

# For each selected director: number of distinct companies they are linked to
unique_companies_each = filtered_data.groupby('Directorid')['Companyid'].nunique()

# For each company: number of selected directors linked to it;
# a count above one means the company is shared by both directors
shared_companies = filtered_data.groupby('Companyid')['Directorid'].nunique()
is_shared = shared_companies > 1
shared_companies_count = shared_companies[is_shared].count()

(unique_companies_each, shared_companies_count)

(Directorid
 335970    8
 335975    1
 Name: Companyid, dtype: int64,
 1)

Approach: look at the pairs of directors that have a company in common.

In [32]:
import pandas as pd
import numpy as np
from itertools import combinations

# Build a weighted director-by-director adjacency matrix: the weight of a pair
# is the number of distinct 'company' keys the two directors share.
#
# Assuming graph_simplified_df is already loaded
# Step 1: Create a sorted unique company identifier
# Work on an explicit copy so that adding the 'company' column never writes
# into (or warns about writing into) a frame derived from another DataFrame.
graph_simplified_df = graph_simplified_df.copy()
graph_simplified_df['company'] = graph_simplified_df.apply(
    lambda row: '-'.join(sorted([str(row['Boardid']), str(row['Companyid'])])),
    axis=1
)  # 2-3 and 3-2 should be equivalent in graph_simplified_df['company']

# Ensure data uniqueness per director per company to avoid inflated pair counts
graph_simplified_df = graph_simplified_df.drop_duplicates(subset=['company', 'Directorid'])

# Step 2 & 3: Create combinations of directors for each company.
# combinations() emits pairs in input order, so the ids are sorted first: this
# gives every pair one canonical orientation (smaller id, larger id). Without
# it the same two directors could show up as (A, B) for one company and (B, A)
# for another, be counted as two different pairs in Step 4, and then overwrite
# each other's weight in the matrix instead of adding up.
pair_records = []
for _, director_ids in graph_simplified_df.groupby('company')['Directorid']:
    # Missing ids cannot be sorted reliably and were dropped by the Step 4
    # groupby anyway, so discard them up front.
    members = sorted(director_ids.dropna().unique())
    # A company with fewer than two directors contributes no pairs.
    pair_records.extend(combinations(members, 2))
director_pairs = pd.DataFrame(pair_records, columns=['Director1', 'Director2'])

# Step 4: Count occurrences of each pair
director_pairs = director_pairs.groupby(['Director1', 'Director2']).size().reset_index(name='weight')

# Step 5: Create the adjacency matrix
if not director_pairs.empty:
    directors = np.unique(director_pairs[['Director1', 'Director2']].to_numpy())

    # Translate director ids into row/column positions once, then fill a plain
    # numpy array; this avoids a Python-level loop and avoids writing through
    # DataFrame.values, which is not guaranteed to be a writable view.
    director_positions = pd.Index(directors)
    row_pos = director_positions.get_indexer(director_pairs['Director1'])
    col_pos = director_positions.get_indexer(director_pairs['Director2'])
    pair_weights = director_pairs['weight'].to_numpy()

    weights = np.zeros((len(directors), len(directors)))
    # The graph is undirected, so mirror every weight across the diagonal
    weights[row_pos, col_pos] = pair_weights
    weights[col_pos, row_pos] = pair_weights

    # Ensure no self-loops
    np.fill_diagonal(weights, 0)

    adj_matrix = pd.DataFrame(weights, columns=directors, index=directors)

    print(adj_matrix)
else:
    print("No director pairs were found to have shared companies.")


         32412    34066    36354    71107    86917    87882    140120   \
32412        0.0      0.0      1.0      0.0      0.0      0.0      0.0   
34066        0.0      0.0      0.0      1.0      0.0      0.0      0.0   
36354        1.0      0.0      0.0      0.0      0.0      0.0      0.0   
71107        0.0      1.0      0.0      0.0      0.0      0.0      0.0   
86917        0.0      0.0      0.0      0.0      0.0      0.0      0.0   
...          ...      ...      ...      ...      ...      ...      ...   
1475128      0.0      0.0      0.0      0.0      0.0      1.0      0.0   
1604806      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1664742      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
1863745      0.0      0.0      0.0      1.0      0.0      0.0      0.0   
2157370      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

         310234   320417   327069   ...  1313362  1313363  1313364  1330357  \
32412        0.0      0.0      0

In [33]:
# Spot-check a single entry: the weight recorded between directors 335970 and 335975
adj_matrix.loc[335970, 335975]

1.0

In [34]:
# Display the director-by-director adjacency matrix (rows and columns are director ids)
adj_matrix

Unnamed: 0,32412,34066,36354,71107,86917,87882,140120,310234,320417,327069,...,1313362,1313363,1313364,1330357,1391443,1475128,1604806,1664742,1863745,2157370
32412,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34066,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
36354,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71107,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
86917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1475128,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1604806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1664742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1863745,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Write the adjacency matrix to test.csv, a path relative to the current working directory.
# NOTE(review): the recorded run of this cell failed with PermissionError -- presumably
# test.csv was open in another program or the directory was not writable; confirm before re-running.
adj_matrix.to_csv("test.csv")

PermissionError: [Errno 13] Permission denied: 'test.csv'

In [36]:
from scipy.sparse import lil_matrix, triu

In [37]:
# # Assuming graph_simplified_df is already loaded
# # Step 1: Create a sorted unique company identifier
# graph_simplified_df['company'] = graph_simplified_df.apply(
#     lambda row: '-'.join(sorted([str(row['Boardid']), str(row['Companyid'])])),
#     axis=1
# )  # 2-3 and 3-2 should be equivalent in graph_simplified_df['company']

# # Ensure data uniqueness per director per company to avoid inflated pair counts
# graph_simplified_df = graph_simplified_df.drop_duplicates(subset=['company', 'Directorid'])

# # Step 2 & 3: Create combinations of directors for each company, only if there are at least two directors
# director_pairs = graph_simplified_df.groupby('company')['Directorid'].apply(
#     lambda x: pd.DataFrame(combinations(x.unique(), 2), columns=['Director1', 'Director2']) if len(x) > 1 else pd.DataFrame(columns=['Director1', 'Director2'])
# ).reset_index(drop=True)

# # Step 4: Count occurrences of each pair
# director_pairs = director_pairs.groupby(['Director1', 'Director2']).size().reset_index(name='weight')

# # Step 5: Create the sparse adjacency matrix
# if not director_pairs.empty:
#     directors = np.unique(director_pairs[['Director1', 'Director2']])
#     director_index = {director: idx for idx, director in enumerate(directors)}
#     size = len(directors)
#     adj_matrix = lil_matrix((size, size))

#     # Populate the adjacency matrix
#     for index, row in director_pairs.iterrows():
#         i = director_index[row['Director1']]
#         j = director_index[row['Director2']]
#         adj_matrix[i, j] = row['weight']
#         adj_matrix[j, i] = row['weight']

#     # Ensure no self-loops
#     adj_matrix.setdiag(0)

#     # Convert to a more efficient sparse format if needed
#     adj_matrix = adj_matrix.tocsr()

#     # # If you want to visualize or convert the sparse matrix to a dense format
#     # dense_matrix = adj_matrix.todense()
#     # print(dense_matrix)

# else:
#     print("No director pairs were found to have shared companies.")

In [38]:

# # If you want to visualize or convert the sparse matrix to a dense format
# dense_matrix = adj_matrix.todense()
# print(dense_matrix)

In [39]:
import networkx as nx

In [40]:
# Turn the adjacency matrix into a networkx graph; its nodes are the director ids
# used as row/column labels of adj_matrix.
G = nx.from_pandas_adjacency(adj_matrix)
# Printing the graph gives a one-line summary with the node and edge counts
print(G)

Graph with 121 nodes and 847 edges


# Graph and density graph details



In [41]:
# Density of G as computed by networkx. The recorded value (about 0.1167) equals
# 847 edges divided by the 121 * 120 / 2 possible director pairs.
density = nx.density(G)
print("Density of the graph:", density)


Density of the graph: 0.11666666666666667


In [42]:
# Local clustering coefficient of every node in G, returned as a dict
clustering = nx.clustering(G)
print("Local clustering for each node:", clustering)


# The dict keys are the node labels of G, i.e. the director ids taken from adj_matrix.

Local clustering for each node: {32412: 0, 34066: 1.0, 36354: 0, 71107: 0.4, 86917: 0, 87882: 1.0, 140120: 0.3333333333333333, 310234: 1.0, 320417: 0.3333333333333333, 327069: 0, 333598: 1.0, 335970: 1.0, 335975: 1.0, 335980: 1.0, 335983: 1.0, 335987: 1.0, 335989: 1.0, 335990: 1.0, 335993: 1.0, 335996: 1.0, 335999: 1.0, 336002: 1.0, 336004: 1.0, 336007: 1.0, 336008: 1.0, 336011: 1.0, 336012: 1.0, 336021: 1.0, 336024: 1.0, 336025: 1.0, 336027: 1.0, 340891: 0, 341397: 0, 341468: 1.0, 341621: 0, 341811: 0, 341813: 0, 341837: 1.0, 341979: 1.0, 370490: 0, 371708: 1.0, 371960: 1.0, 373892: 1.0, 373894: 1.0, 373919: 1.0, 374153: 0.9261538461538461, 374182: 1.0, 374225: 1.0, 374276: 0, 374438: 1.0, 374588: 1.0, 374862: 1.0, 374881: 0.5376955903271693, 374893: 1.0, 374895: 1.0, 374896: 1.0, 441854: 1.0, 493920: 1.0, 493939: 1.0, 500955: 0.7142857142857143, 500997: 1.0, 504236: 0.8603988603988604, 505371: 0.9259259259259259, 506540: 1.0, 508736: 1.0, 509294: 1.0, 514534: 1.0, 544197: 1.0, 551393

In [43]:
# Degree centrality of every node in G, returned as a dict keyed by director id.
# NOTE(review): the recorded values look like degree / (number of nodes - 1),
# e.g. 1/120 = 0.00833 -- confirm against the networkx documentation.
degree_centrality = nx.degree_centrality(G)
print("Degree centrality for each node:", degree_centrality)


Degree centrality for each node: {32412: 0.008333333333333333, 34066: 0.025, 36354: 0.008333333333333333, 71107: 0.041666666666666664, 86917: 0.016666666666666666, 87882: 0.016666666666666666, 140120: 0.025, 310234: 0.016666666666666666, 320417: 0.03333333333333333, 327069: 0.008333333333333333, 333598: 0.025, 335970: 0.21666666666666667, 335975: 0.21666666666666667, 335980: 0.21666666666666667, 335983: 0.21666666666666667, 335987: 0.05, 335989: 0.21666666666666667, 335990: 0.21666666666666667, 335993: 0.21666666666666667, 335996: 0.21666666666666667, 335999: 0.21666666666666667, 336002: 0.21666666666666667, 336004: 0.21666666666666667, 336007: 0.21666666666666667, 336008: 0.05, 336011: 0.05, 336012: 0.21666666666666667, 336021: 0.21666666666666667, 336024: 0.21666666666666667, 336025: 0.21666666666666667, 336027: 0.21666666666666667, 340891: 0.008333333333333333, 341397: 0.008333333333333333, 341468: 0.016666666666666666, 341621: 0.008333333333333333, 341811: 0.008333333333333333, 341

In [44]:
# Betweenness centrality of every node in G, returned as a dict keyed by director id.
# Called with networkx's default arguments.
betweenness_centrality = nx.betweenness_centrality(G)
print("Betweenness centrality for each node:", betweenness_centrality)


Betweenness centrality for each node: {32412: 0.0, 34066: 0.0, 36354: 0.0, 71107: 0.0008403361344537815, 86917: 0.00014005602240896358, 87882: 0.0, 140120: 0.00028011204481792715, 310234: 0.0, 320417: 0.0003501400560224089, 327069: 0.0, 333598: 0.0, 335970: 0.0, 335975: 0.0, 335980: 0.0, 335983: 0.0, 335987: 0.0, 335989: 0.0, 335990: 0.0, 335993: 0.0, 335996: 0.0, 335999: 0.0, 336002: 0.0, 336004: 0.0, 336007: 0.0, 336008: 0.0, 336011: 0.0, 336012: 0.0, 336021: 0.0, 336024: 0.0, 336025: 0.0, 336027: 0.0, 340891: 0.0, 341397: 0.0, 341468: 0.0, 341621: 0.0, 341811: 0.0, 341813: 0.0, 341837: 0.0, 341979: 0.0, 370490: 0.0, 371708: 0.0, 371960: 0.0, 373892: 0.0, 373894: 0.0, 373919: 0.0, 374153: 0.002637721755368814, 374182: 0.0, 374225: 0.0, 374276: 4.668534080298786e-05, 374438: 0.0, 374588: 0.0, 374862: 0.0, 374881: 0.05280112044817927, 374893: 0.0, 374895: 0.0, 374896: 0.0, 441854: 0.0, 493920: 0.0, 493939: 0.0, 500955: 0.0226890756302521, 500997: 0.0, 504236: 0.008029878618113911, 5053

In [None]:
# TODO: do the relevant follow-up workflow for these graph metrics