In [None]:
# %matplotlib notebook
import copy
import os
import sys

import igraph
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import gridspec
from mpl_toolkits.mplot3d import Axes3D
from scipy.cluster import hierarchy
from tqdm.notebook import tqdm as tqdm

Looking for link files...

In [None]:
# Run parameters. The default below is meant to be overwritten from outside
# when the notebook is executed for another simulation run.
Filename = "box_Viscosity_trial22_8-1_Long"

In [None]:
# Idea for future batch processing (not active):
# data_folder = './Ensemble-node-links'
# for fname in os.listdir(data_folder):
#     path = os.path.join(data_folder,fname)
#     if os.path.isdir(path):
#         path_true = path;

# Single-run processing: `path_true` is the '<Filename>/<Filename>-NL' folder
# that contains the 'Links' and 'Coords' files of one simulation.
# Locations used previously:
# path_true = '/Volumes/G-DRIVE USB/Experimental Data - Holt Lab/Simulation Results/Box simulation results/box Visocsity trial22 (newjar5)/synDrop_whole_system_simulation/' + Filename + '/'+ Filename + '-NL'
# path_true = '/Users/tongshu/Documents/Lab project 2020/Levy Droplets/Simulations/Example data for analysis debugging/' + Filename + '/'+ Filename + '-NL'
path_true = f'/Users/shut01/Documents/Levy simulation folder/Re-analyze simulation data/{Filename}/{Filename}-NL'

In [None]:
# Collect the per-frame 'Links' files found in `path_true`.
# Only extract the files with time *.**000s (third character after the first
# '.' in the file name is '0') to avoid mistakes after too many files processed.
files = [
    os.path.join(path_true, f)
    for f in os.listdir(path_true)
    if 'Links' in f and f.split('.')[1][2] == '0'
]

# Sort frames chronologically. The time stamp is the text that follows 'Links'
# in the file name; sorting on its numeric value (instead of the raw string)
# keeps e.g. 10.0 s after 2.0 s.
files.sort(key=lambda p: float(os.path.basename(p).split('Links')[1]))
print(f'Number of files found: {len(files)}')

Creating a dataframe with time and file paths for links and coordinates...

In [None]:
# One row per frame: the 'Links' file, the matching 'Coords' file (same path
# with 'Links' replaced by 'Coords') and the time parsed from the text that
# follows 'Links' in the path.
frame_columns = {'links_file_path': files}
frame_columns['coord_file_path'] = [path.replace('Links','Coords') for path in files]
frame_columns['time'] = [float(path.split('Links')[1]) for path in files]
df = pd.DataFrame(frame_columns)

print(df.shape)
df.head()
# To inspect the complete table instead of the first rows:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     print(df)

Specifying the initial parameters of the simulation; check these every time the simulation conditions change...

In [None]:
# Side length of the cubic simulation box in um (0.86 um for the larger system).
L = 0.86
# Total number of A (hexamer) nodes; they occupy node indices 0 .. N_A-1.
N_A = 390
# Total number of B (dimer) nodes; they occupy node indices N_A .. N_A+N_B-1.
N_B = int(3 * N_A)

Loading link files and converting into graphs...

In [None]:
# Build one undirected igraph.Graph per frame and store it in df['graph'].
# Each graph carries:
#   * edge attribute 'length'        - Euclidean distance between the two end nodes
#   * vertex attribute 'diffusivity' - taken from the coordinate file
#   * vertex attribute 'coordinate'  - (x, y, z) tuple
# NOTE(review): vertex attributes are assigned positionally, so the coordinate
# file is assumed to list nodes 0..n-1 in order - confirm against the generator.
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    # Load and format the data
    df_links = pd.read_csv(row['links_file_path'],
                           header = None,
                           names = ['source','target'])
    df_coord = pd.read_csv(row['coord_file_path'],
                           header = None,
                           # There is an extra comma that can be removed at
                           # data generation time
                           names = ['node','x','y','z','diffusivity','dummy'],
                           index_col = 0)
    df_coord = df_coord.drop(columns=['dummy'])

    # Convert to a graph
    nnodes = df_coord.shape[0]
    g = igraph.Graph(nnodes, directed=False)
    edges = list(zip(df_links.source.tolist(), df_links.target.tolist()))

    # Edge lengths for the whole frame in one vectorized step: look up the end
    # point coordinates by node label, then take the row-wise Euclidean norm.
    xyz = df_coord[['x','y','z']]
    delta = xyz.loc[df_links.source.values].values - xyz.loc[df_links.target.values].values
    length = np.sqrt(np.power(delta, 2).sum(axis=1)).tolist()

    g.add_edges(edges)
    g.es['length'] = length
    g.vs['diffusivity'] = df_coord.diffusivity.values
    g.vs['coordinate'] = list(zip(df_coord.x.values, df_coord.y.values, df_coord.z.values))
    df.loc[index,'graph'] = g

After incorporating the link and coordinate information into the graph column, the nodes that do not appear in the link files need to be removed, since they are ribosomes...

In [None]:
# Keep only the first N_A+N_B vertices (hexamers followed by dimers) of every
# frame; any vertices stored after them are deleted from the graph.
# The check is made per graph so that it does not depend on a loop variable
# left over from the loading cell.
numUseful_nodes = N_A+N_B  #Identify the nodes that locate at the front part of nodes file, including dimer and hexamer
for g_temp in df.graph:
    if g_temp.vcount() > numUseful_nodes: #If number of total nodes is larger than summation of total dimer and hexamer nodes
        # delete node numbers from 'numUseful_nodes' up to the total number of nodes
        g_temp.delete_vertices(list(range(numUseful_nodes, g_temp.vcount())))

Calculate how the molecular concentration within fixed sub-volumes of the box changes with time...

In [None]:
# Local concentration in the eight octants of the box.
# Each investigated region is a cube centred at (+-L/4, +-L/4, +-L/4) with
# half side r_box = L/4 (side L/2). Concentrations are converted with
# 602 molecules per um^3 per uM.
r_box = L/4
r_center = [[L/4,L/4,L/4],[-L/4,L/4,L/4],[L/4,-L/4,L/4],[-L/4,-L/4,L/4],[L/4,L/4,-L/4],[-L/4,L/4,-L/4],[L/4,-L/4,-L/4],[-L/4,-L/4,-L/4]] #center of the investigate areas
# N_center is a list of lists: N_center[i][t] is the number of nodes found in
# the region centred at r_center[i] in frame t.
N_center = [[] for _ in r_center]

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
# for index, row in tqdm(df.head(n=5).iterrows(), total=5):
    g = row['graph']
    Coord = g.vs['coordinate']
    coord_array = np.asarray(Coord, dtype=float).reshape(-1, 3)
    for i, r_center_temp in enumerate(r_center):
        # Alternative (not used): count nodes inside a sphere of radius L/4 by
        # comparing the Euclidean distance to the centre with r_box.
        # Used here: count nodes inside the cube of side L/2, i.e. the largest
        # absolute coordinate offset from the centre is below r_box.
        length_max = np.abs(coord_array - np.asarray(r_center_temp, dtype=float)).max(axis=1)
        N_center[i].append(int(np.count_nonzero(length_max < r_box)))

# check = [sum(x) for x in zip(*N_center)]
# print(check)

fig, ax = plt.subplots(1,1, figsize=(6,4))
for i in range(len(r_center)):
#     concen_temp = [k*3/(4*pow(r_box,3)*602*3.14) for k in N_center[i]] #Calculate concentration within the sphere with uM unit
    concen_temp = [k/(pow(r_box*2,3)*602) for k in N_center[i]] #Calculate concentration within the cubic with uM unit
    ax.plot(df.time,concen_temp,label=f'center at {r_center[i]}')

# Average concentration of all molecules within the system; the molecule count
# is taken from the last frame processed above.
ave_con = len(Coord)/(pow(L,3)*602)
ax.plot(df.time,ave_con*np.ones(len(df.time)),label='system average')

# Axis decoration is applied once, after every line (including the average)
# has been drawn, so that the legend lists all of them.
ax.set_ylabel(f'Molecular concentration within cubic \n with length L/4={r_box} (µM)', fontsize=15)
ax.set_xlabel('Time (s)', fontsize=15)
ax.legend(loc='center left', bbox_to_anchor=(1, 0.9))
ax.set_title(f'Dynamic of local molecular concentration')
plt.show()


Calculate number of neighbors based on elements directly from graph or within largest cluster in each time frame

In [None]:
# Here the first part of nodes (A) are hexamers and remainings (B) are dimers.
# For every frame this cell
#   * counts the direct neighbours of every node,
#   * records the node indices of the largest connected component,
#   * computes the shortest-path length between all node pairs (weighted by
#     edge length) and the number of nodes on the unweighted shortest path,
#   * reorders the nodes with a single-linkage dendrogram and plots the
#     reordered matrices.
# Results consumed by later cells: N_neighbor_time, giant_node, ORDER and
# N_CONNECT_TOTAL (the latter two are per-frame blocks stacked along axis 0).
N_neighbor_time = [] #At each time points, number of neighbors of all nodes
giant_node = [] #At each time points, node index within the largest cluster
N_CONNECT_TOTAL = [] #Record number of nodes in the shortest path connecting two nodes in all time points
ORDER = [] #Record nodes order in the rearranged cluster graphs

sys.setrecursionlimit(10000) #Required for much larger system as trail20_(5-5) which has 1170 type A molecules

# Private copy of the default colormap that draws NaN entries in white. It is
# passed explicitly to imshow so the shared default colormap is left untouched.
current_cmap = copy.copy(plt.get_cmap())
current_cmap.set_bad(color='white')

for index, row in tqdm(df.iterrows(), total=df.shape[0]):
# for index, row in tqdm(df.head(n=5).iterrows(), total=5):
    g = row['graph']
    N_neighbor = [len(g.neighborhood(i))-1 for i in range(0,g.vcount())] #The first value in the g.neighborhood is the node index itself
    N_neighbor_time.append(N_neighbor)

    Direct_neighbor = np.zeros((g.vcount(),g.vcount())) #Direct_neighbor[i,j] is 1 if (i,j) are direct neighbors
    for i in range(0,g.vcount()):
        for j in g.neighborhood(i)[1:]:
            Direct_neighbor[i][j] = 1

    ccs = g.clusters()
    ccslistsize = list(ccs.sizes())
    giant_node.append(ccs[ccslistsize.index(max(ccslistsize))])

    # Calculate shortest paths length between nodes
    spl_total1 = []
    N_connect_total = []
    for v_source in range(0,g.vcount()):
        spl = g.shortest_paths_dijkstra(v_source, g.vs(), weights=g.es['length'])
        spl_total1.append(spl)
        N_connect = [float(len(i)) for i in g.get_shortest_paths(v_source, g.vs())] #Number of nodes connecting v_source and v_target (including v_source and v_target)
        N_connect_total.append(N_connect)
    distances = np.array(spl_total1).reshape((g.vcount(),g.vcount()))
    N_connect_total = np.array(N_connect_total).reshape((g.vcount(),g.vcount()))

    # Replace infinities (pairs with no connecting path) with the largest finite
    # distance; remember where they were so only those entries are blanked later.
    disconnected = np.isinf(distances)
    distances[disconnected] = distances[~disconnected].max()

    # Clustering
    threshold = 1
    fig, axs = plt.subplots(1,5,figsize=(35,6))
    # NOTE(review): scipy's linkage() treats a 2-D input as observation vectors
    # (one row per node), not as a precomputed distance matrix. Left unchanged
    # because the node order used by later cells depends on it - confirm intent.
    linkage = hierarchy.linkage(distances, method="single")
    clusters = hierarchy.fcluster(linkage, threshold, criterion="distance")
    dend = hierarchy.dendrogram(linkage, color_threshold=threshold, ax=axs[4])
    order = dend['leaves']
    ORDER = np.append(ORDER,order,axis=0)

    # Apply the dendrogram leaf order to rows and columns of every matrix
    distances = distances[order,:][:,order]
    disconnected = disconnected[order,:][:,order]
    N_connect_total = N_connect_total[order,:][:,order]
    Direct_neighbor = Direct_neighbor[order,:][:,order]

    # Store a copy: N_connect_total is modified in place for plotting below and
    # the stored blocks must keep 0 for "no path" in every frame.
    if len(N_CONNECT_TOTAL)==0:
        N_CONNECT_TOTAL = N_connect_total.copy()
    else:
        N_CONNECT_TOTAL = np.append(N_CONNECT_TOTAL,N_connect_total,axis=0)

    #Calculate the direct neighbor numbers and the label for each nodes
    N_direct_neighbor = [N_neighbor[i] for i in order]
    N_direct_neighbor_matrix = np.tile(np.array([N_direct_neighbor]).transpose(),(1,g.vcount()))
    N_direct_neighbor_per = []
    N_label = []
    for i in range(0,len(order)):
        if order[i]<N_A:
            N_direct_neighbor_per.append(N_direct_neighbor[i]/6) #fraction of the 6 possible neighbors of an A node
            N_label.append(1) #'A' is labled as 1
        else:
            N_direct_neighbor_per.append(N_direct_neighbor[i]/2) #fraction of the 2 possible neighbors of a B node
            N_label.append(0) #'B' is labled as 0
    N_direct_neighbor_per_matrix = np.tile(np.array([N_direct_neighbor_per]).transpose(),(1,g.vcount()))
    N_label_matrix = np.tile(np.array([N_label]).transpose(),(1,g.vcount()))

#   Plot all the analysis results
    #After clustering, convert the entries of node pairs without a connecting path to nan and plot them as white in the colormap
    distances[disconnected] = np.nan
    no_connect_index = np.where(N_connect_total == 0)
    N_connect_total[no_connect_index] = np.nan

    [x,y] = np.where(Direct_neighbor == 1) #Extract x,y node position for those are direct neighbors

    sc0 = axs[0].imshow(distances, cmap=current_cmap)
#     sc1 = axs[1].imshow(N_label_matrix)
    sc1 = axs[1].scatter(x, y, s=0.1)
    sc2 = axs[2].imshow(N_direct_neighbor_per_matrix, cmap=current_cmap)
    sc3 = axs[3].imshow(N_connect_total, cmap=current_cmap)

    cbar0 = fig.colorbar(sc0, ax=axs[0],extend='neither')
    sc0.set_clim(vmin=0,vmax=1.1)

    axs[1].set_xlim(0,Direct_neighbor.shape[0])
    axs[1].set_ylim(0,Direct_neighbor.shape[0])
    axs[1].invert_yaxis()
    axs[1].set_aspect('equal')
#     cbar1 = fig.colorbar(sc1, ax=axs[1],ticks=[0, 1])
# #     cbar1.ax.set_yticklabels(['B', 'A'])
#     cbar1.ax.set_yticklabels(['N','Y'])

    cbar2 = fig.colorbar(sc2, ax=axs[2],extend='neither')
    sc2.set_clim(vmin=0,vmax=1)

    cbar3 = fig.colorbar(sc3, ax=axs[3],extend='neither')
    sc3.set_clim(vmin=0,vmax=53)

    axs[2].xaxis.set_visible(False)
    cbar0.set_label(r'Topological shortest distance ($\mu m$)', fontsize=18)
#     cbar1.set_label('Node type', fontsize=14)
    axs[1].yaxis.set_label_position('right')
    axs[1].set_ylabel('Direct Neighbors', fontsize=18)
    cbar2.set_label('precentage of direct neighbors', fontsize=18)
    cbar3.set_label('Number of nodes connecting path', fontsize=18)
    axs[0].set_title(f't = {df.time[index]:.3f} s', fontsize=18)
    axs[3].set_title(f't = {df.time[index]:.3f} s', fontsize=18)
    axs[4].set_xlabel("Node",fontsize=19)
#     axs[4].set_ylabel("Dissimilarity")
#     plt.subplots_adjust(wspace = 0.3)
    plt.show()

Plot individual node clusters in 3D from their coordinates: the 5 (or 6) largest clusters at each time point, and the clusters tracked forward in time starting from the 3 largest clusters of the first frame...

In [None]:
# Forward cluster tracking.
# For every frame (in time order) this cell rebuilds the connected clusters from
# the reordered matrices stored in ORDER / N_CONNECT_TOTAL, plots the five
# largest clusters, and follows three clusters through time: tracking starts
# from the three largest clusters of the first frame and, in each later frame,
# continues with the cluster sharing the most nodes with the previously tracked one.
# NOTE(review): the cluster reconstruction assumes the stored node order keeps
# every connected cluster contiguous, and at least five clusters per frame are
# assumed when picking the largest ones - confirm for new data sets.
R = [] #radius of the largest cluster (median node distance from its centroid)
MIU2_XY = [] #mixed central moment sum(dx*dy) divided by (number of nodes)^2; 0 suggests circular shape
MIU2_XZ = [] #same for the x,z directions
MIU2_YZ = [] #same for the y,z directions
Cluster_size = [] #Number of nodes within the tracked cluster at each time frames
# for index, row in tqdm(df.head(n=5).iterrows(), total=5):
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    g = row['graph']
    Coord = g.vs['coordinate']
    order = ORDER[index*g.vcount():(index+1)*g.vcount()]
    n_connect_total = N_CONNECT_TOTAL[index*g.vcount():(index+1)*g.vcount()]

    ccs = g.clusters()
    N_cluster = len(ccs) #N_cluster is the number of clusters within each frame
    Ni_index = [] #Store the actual node index (not index directly from n_connect_total matrix) before clustering using order array
    check_temp_cluster = 0 #Checkpoint for the row index between different squares in graph
    for j in range(0,N_cluster):
        # Columns reachable from the first row of the current diagonal block.
        # "No path" is stored either as 0 or as NaN, so both are excluded.
        block_row = n_connect_total[check_temp_cluster]
        temp_cluster = np.flatnonzero(~np.isnan(block_row) & (block_row != 0))
        Ni_index.append(order[temp_cluster])
        check_temp_cluster = check_temp_cluster+len(temp_cluster)

    Len_Ni_index = [len(i) for i in Ni_index]

    #Find the cluster index that have the largest number of common node compared to the cluster in the previous frame, track 3 clusters
    if index !=0:
        common_node_N1 = [len(set(N1_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        common_node_N2 = [len(set(N2_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        common_node_N3 = [len(set(N3_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        max_index_N1 = common_node_N1.index(max(common_node_N1)) #node cluster index that has largest number of common nodes with cluster in the previous timeframe
        max_index_N2 = common_node_N2.index(max(common_node_N2))
        max_index_N3 = common_node_N3.index(max(common_node_N3))

    #Find first five/six largest cluster at each time points
    Sort_Len = np.argsort(Len_Ni_index) #Returns the indices that would sort the array in ascending order
    N1_max_index = Ni_index[Sort_Len[len(Sort_Len)-1]]
    N2_max_index = Ni_index[Sort_Len[len(Sort_Len)-2]]
    N3_max_index = Ni_index[Sort_Len[len(Sort_Len)-3]]
    N4_max_index = Ni_index[Sort_Len[len(Sort_Len)-4]]
    N5_max_index = Ni_index[Sort_Len[len(Sort_Len)-5]]
#     N6_max_index = Ni_index[Sort_Len[len(Sort_Len)-6]]

    #Define the connected cluster at this current time frame and is used for comparison at next time frame, track 3 clusters
    if index == 0:
        N1_max_track_index_pre = N1_max_index  #Tracking starts from the three largest clusters of the first frame
        N2_max_track_index_pre = N2_max_index
        N3_max_track_index_pre = N3_max_index
    else:
        N1_max_track_index_pre = Ni_index[max_index_N1]
        N2_max_track_index_pre = Ni_index[max_index_N2]
        N3_max_track_index_pre = Ni_index[max_index_N3]

    Cluster_size.append(len(N1_max_track_index_pre))

    # Node indices are stored as floats (ORDER is a float array), hence int(i).
    N1_max_coord = [Coord[int(i)] for i in N1_max_index]
    N2_max_coord = [Coord[int(i)] for i in N2_max_index]
    N3_max_coord = [Coord[int(i)] for i in N3_max_index]
    N4_max_coord = [Coord[int(i)] for i in N4_max_index]
    N5_max_coord = [Coord[int(i)] for i in N5_max_index]
#     N6_max_coord = [Coord[int(i)] for i in N6_max_index]

    # Shape descriptors of the largest cluster
    miu1 = np.average(N1_max_coord,axis=0) #centroid
    r_vector = N1_max_coord-miu1 #node positions relative to the centroid
    r = np.median([np.sqrt(np.power(i,2).sum()) for i in r_vector]) #or max(...) or np.median(...) or np.average(...)
    n_nodes_sq = len(r_vector)**2 #squared node count ('^' would be bitwise XOR in Python)
    miu2_xy = sum([i[0]*i[1] for i in r_vector])/n_nodes_sq
    miu2_xz = sum([i[0]*i[2] for i in r_vector])/n_nodes_sq
    miu2_yz = sum([i[1]*i[2] for i in r_vector])/n_nodes_sq

    R.append(r)
    MIU2_XY.append(miu2_xy)
    MIU2_XZ.append(miu2_xz)
    MIU2_YZ.append(miu2_yz)

    #Plot scatter plot for the first five/six largest clusters at each time points
    fig = plt.figure(figsize=(14,5))
    ax1 = fig.add_subplot(1,2,1, projection='3d')
    for cluster_coord, colour in zip([N1_max_coord, N2_max_coord, N3_max_coord, N4_max_coord, N5_max_coord],
                                     ['r', 'b', 'g', 'c', 'm']):
        ax1.scatter([p[0] for p in cluster_coord], [p[1] for p in cluster_coord], [p[2] for p in cluster_coord],
                    c=colour, marker='o', alpha=0.7, s=8, linewidths=0)
    ax1.set_title(f't = {df.time[index]:.3f} s')

    #Plot scatter plot for the tracked clusters (seeded by the three largest clusters of the first frame) over time
    ax2 = fig.add_subplot(1,2,2, projection='3d')
    connected_coord_N1 = [Coord[int(i)] for i in N1_max_track_index_pre]
    connected_coord_N2 = [Coord[int(i)] for i in N2_max_track_index_pre]
    connected_coord_N3 = [Coord[int(i)] for i in N3_max_track_index_pre]
    for cluster_coord, colour in zip([connected_coord_N1, connected_coord_N2, connected_coord_N3],
                                     ['r', 'b', 'g']):
        ax2.scatter([p[0] for p in cluster_coord], [p[1] for p in cluster_coord], [p[2] for p in cluster_coord],
                    c=colour, marker='o', alpha=0.7, s=8, linewidths=0)
    ax2.set_title(f't = {df.time[index]:.3f} s')

    # Identical ticks and axis labels on both 3D panels
    ticks = np.arange(-0.5, 0.5, 0.2)
    for ax3d in (ax1, ax2):
        ax3d.set_xticks(ticks)
        ax3d.set_yticks(ticks)
        ax3d.set_zticks(ticks)
        ax3d.set_xlabel(r'Box X ($\mu m$)')
        ax3d.set_ylabel(r'Box Y ($\mu m$)')
        ax3d.set_zlabel(r'Box Z ($\mu m$)')

    plt.show()

#Plot radius and mixed central moments with time for the largest cluster, indicating its size and circularity in the x/y/z planes
fig, axs = plt.subplots(1,2, figsize=(15,6))
axs[0].plot(df.time,R)
axs[0].set_ylabel(r'Radius of first cluster ($\mu m$)', fontsize=18)
axs[0].set_xlabel('Time (s)', fontsize=18)
axs[0].set_ylim(0,0.18)

axs[1].plot(df.time,MIU2_XY,color='blue', label='X,Y direction')
axs[1].plot(df.time,MIU2_XZ,color='red', label='X,Z direction')
axs[1].plot(df.time,MIU2_YZ,color='green', label='Y,Z direction')
axs[1].set_ylabel('Normalized first central moment', fontsize=18)
axs[1].set_xlabel('Time (s)', fontsize=18)
axs[1].legend(loc='center left', bbox_to_anchor=(1, 0.9))
# NOTE(review): these limits were chosen while the moments were divided by an
# XOR-based value; with the squared node count they may need retuning.
axs[1].set_ylim(-0.012,0.012)

#Plot the number of nodes within the first tracked cluster over time
dt = np.diff(df.time)
dCs = np.diff(Cluster_size)
fig,axs = plt.subplots(1,2, figsize=(15,6))
axs[0].plot(df.time,Cluster_size)
axs[0].set_ylabel(r'Number of nodes within first tracked cluster', fontsize=18)
axs[0].set_xlabel('Time (s)', fontsize=18)
# ax[0].set_ylim(0,280)

axs[1].plot(df.time[1:],[x/y for x, y in zip(dCs, dt)])
axs[1].set_ylabel(r'First derivation of node number', fontsize=18)
axs[1].set_xlabel('Time (s)', fontsize=18)
plt.show()

Plot individual node clusters in 3D from their coordinates: the 5 (or 6) largest clusters at each time point, and the clusters tracked backward in time starting from the 3 largest clusters of the last frame...

In [None]:
# Backward cluster tracking.
# Same analysis as the forward tracking, but the frames are visited from the
# last to the first: tracking starts from the three largest clusters of the
# last frame and, in each earlier frame, continues with the cluster sharing
# the most nodes with the cluster tracked in the frame processed just before.
# NOTE(review): the cluster reconstruction assumes the stored node order keeps
# every connected cluster contiguous, and at least five clusters per frame are
# assumed when picking the largest ones - confirm for new data sets.
R = [] #radius of the largest cluster (median node distance from its centroid)
MIU2_XY = [] #mixed central moment sum(dx*dy) divided by (number of nodes)^2; 0 suggests circular shape
MIU2_XZ = [] #same for the x,z directions
MIU2_YZ = [] #same for the y,z directions
Cluster_size = [] #Number of nodes within the tracked cluster at each time frames

df_copy = df.copy()  #Copy dataframe 'df' to a new dataframe 'df_copy' to avoid changing original dataframe
reversed_df = df_copy.loc[::-1] #Reverse copied dataframe in terms of index and saved to 'reversed_df' dataframe
# for index, row in tqdm(reversed_df.head(n=5).iterrows(), total=5):
for index, row in tqdm(reversed_df.iterrows(), total=reversed_df.shape[0]):
#     print(row.time, reversed_df.time[index], index, reversed_df.shape[0])
    g = row['graph']
    Coord = g.vs['coordinate']
    # `index` keeps the original frame label, so it still addresses the right
    # block of the stacked ORDER / N_CONNECT_TOTAL arrays.
    order = ORDER[index*g.vcount():(index+1)*g.vcount()]
    n_connect_total = N_CONNECT_TOTAL[index*g.vcount():(index+1)*g.vcount()]

    ccs = g.clusters()
    N_cluster = len(ccs) #N_cluster is the number of clusters within each frame
    Ni_index = [] #Store the actual node index (not index directly from n_connect_total matrix) before clustering using order array
    check_temp_cluster = 0 #Checkpoint for the row index between different squares in graph
    for j in range(0,N_cluster):
        # Columns reachable from the first row of the current diagonal block.
        # "No path" is stored either as 0 or as NaN, so both are excluded.
        block_row = n_connect_total[check_temp_cluster]
        temp_cluster = np.flatnonzero(~np.isnan(block_row) & (block_row != 0))
        Ni_index.append(order[temp_cluster])
        check_temp_cluster = check_temp_cluster+len(temp_cluster)

    Len_Ni_index = [len(i) for i in Ni_index]

    #Find the cluster index that have the largest number of common node compared to the cluster in the previously processed (later) frame, track 3 clusters
    if index != reversed_df.shape[0]-1:
        common_node_N1 = [len(set(N1_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        common_node_N2 = [len(set(N2_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        common_node_N3 = [len(set(N3_max_track_index_pre).intersection(set(i_node))) for i_node in Ni_index]
        max_index_N1 = common_node_N1.index(max(common_node_N1)) #node cluster index that has largest number of common nodes with the cluster tracked in the previously processed frame
        max_index_N2 = common_node_N2.index(max(common_node_N2))
        max_index_N3 = common_node_N3.index(max(common_node_N3))

    #Find first five/six largest cluster at each time points
    Sort_Len = np.argsort(Len_Ni_index) #Returns the indices that would sort the array in ascending order
    N1_max_index = Ni_index[Sort_Len[len(Sort_Len)-1]]
    N2_max_index = Ni_index[Sort_Len[len(Sort_Len)-2]]
    N3_max_index = Ni_index[Sort_Len[len(Sort_Len)-3]]
    N4_max_index = Ni_index[Sort_Len[len(Sort_Len)-4]]
    N5_max_index = Ni_index[Sort_Len[len(Sort_Len)-5]]
#     N6_max_index = Ni_index[Sort_Len[len(Sort_Len)-6]]

    #Define the connected cluster at this current time frame and is used for comparison at the next processed frame, track 3 clusters
    if index == reversed_df.shape[0]-1:
        N1_max_track_index_pre = N1_max_index  #Tracking starts from the three largest clusters of the last frame
        N2_max_track_index_pre = N2_max_index
        N3_max_track_index_pre = N3_max_index
    else:
        N1_max_track_index_pre = Ni_index[max_index_N1]
        N2_max_track_index_pre = Ni_index[max_index_N2]
        N3_max_track_index_pre = Ni_index[max_index_N3]

    Cluster_size.append(len(N1_max_track_index_pre))

    # Node indices are stored as floats (ORDER is a float array), hence int(i).
    N1_max_coord = [Coord[int(i)] for i in N1_max_index]
    N2_max_coord = [Coord[int(i)] for i in N2_max_index]
    N3_max_coord = [Coord[int(i)] for i in N3_max_index]
    N4_max_coord = [Coord[int(i)] for i in N4_max_index]
    N5_max_coord = [Coord[int(i)] for i in N5_max_index]
#     N6_max_coord = [Coord[int(i)] for i in N6_max_index]

    # Shape descriptors of the largest cluster
    miu1 = np.average(N1_max_coord,axis=0) #centroid
    r_vector = N1_max_coord-miu1 #node positions relative to the centroid
    r = np.median([np.sqrt(np.power(i,2).sum()) for i in r_vector]) #or max(...) or np.median(...) or np.average(...)
    n_nodes_sq = len(r_vector)**2 #squared node count ('^' would be bitwise XOR in Python)
    miu2_xy = sum([i[0]*i[1] for i in r_vector])/n_nodes_sq
    miu2_xz = sum([i[0]*i[2] for i in r_vector])/n_nodes_sq
    miu2_yz = sum([i[1]*i[2] for i in r_vector])/n_nodes_sq

    R.append(r)
    MIU2_XY.append(miu2_xy)
    MIU2_XZ.append(miu2_xz)
    MIU2_YZ.append(miu2_yz)

    #Plot scatter plot for the first five/six largest clusters at each time points
    fig = plt.figure(figsize=(14,5))
    ax1 = fig.add_subplot(1,2,1, projection='3d')
    for cluster_coord, colour in zip([N1_max_coord, N2_max_coord, N3_max_coord, N4_max_coord, N5_max_coord],
                                     ['r', 'b', 'g', 'c', 'm']):
        ax1.scatter([p[0] for p in cluster_coord], [p[1] for p in cluster_coord], [p[2] for p in cluster_coord],
                    c=colour, marker='o', alpha=0.7, s=8, linewidths=0)
    ax1.set_title(f't = {reversed_df.time[index]:.3f} s')

    #Plot scatter plot for the tracked clusters (seeded by the three largest clusters of the last frame) over time
    ax2 = fig.add_subplot(1,2,2, projection='3d')
    connected_coord_N1 = [Coord[int(i)] for i in N1_max_track_index_pre]
    connected_coord_N2 = [Coord[int(i)] for i in N2_max_track_index_pre]
    connected_coord_N3 = [Coord[int(i)] for i in N3_max_track_index_pre]
    for cluster_coord, colour in zip([connected_coord_N1, connected_coord_N2, connected_coord_N3],
                                     ['r', 'b', 'g']):
        ax2.scatter([p[0] for p in cluster_coord], [p[1] for p in cluster_coord], [p[2] for p in cluster_coord],
                    c=colour, marker='o', alpha=0.7, s=8, linewidths=0)
    ax2.set_title(f't = {reversed_df.time[index]:.3f} s')

    # Identical ticks and axis labels on both 3D panels
    ticks = np.arange(-0.5, 0.5, 0.2)
    for ax3d in (ax1, ax2):
        ax3d.set_xticks(ticks)
        ax3d.set_yticks(ticks)
        ax3d.set_zticks(ticks)
        ax3d.set_xlabel(r'Box X ($\mu m$)')
        ax3d.set_ylabel(r'Box Y ($\mu m$)')
        ax3d.set_zlabel(r'Box Z ($\mu m$)')

    plt.show()

#Plot radius and mixed central moments with time for the largest cluster, indicating its size and circularity in the x/y/z planes
fig, axs = plt.subplots(1,2, figsize=(15,6))
axs[0].plot(reversed_df.time,R)
axs[0].set_ylabel(r'Radius of first cluster ($\mu m$)', fontsize=18)
axs[0].set_xlabel('Time (s)', fontsize=18)
axs[0].set_ylim(0,0.18)

axs[1].plot(reversed_df.time,MIU2_XY,color='blue', label='X,Y direction')
axs[1].plot(reversed_df.time,MIU2_XZ,color='red', label='X,Z direction')
axs[1].plot(reversed_df.time,MIU2_YZ,color='green', label='Y,Z direction')
axs[1].set_ylabel('Normalized first central moment', fontsize=18)
axs[1].set_xlabel('Time (s)', fontsize=18)
axs[1].legend(loc='center left', bbox_to_anchor=(1, 0.9))
# NOTE(review): these limits were chosen while the moments were divided by an
# XOR-based value; with the squared node count they may need retuning.
axs[1].set_ylim(-0.012,0.012)

#Plot the number of nodes within the first tracked cluster over time
# Both differences are taken in reversed frame order, so their ratio is still
# the change in node number per unit of time.
dt = np.diff(reversed_df.time)
dCs = np.diff(Cluster_size)
fig,axs = plt.subplots(1,2, figsize=(15,6))
axs[0].plot(reversed_df.time,Cluster_size)
axs[0].set_ylabel(r'Number of nodes within first tracked cluster', fontsize=18)
axs[0].set_xlabel('Time (s)', fontsize=18)
# ax[0].set_ylim(0,280)

axs[1].plot(reversed_df.time[1:],[x/y for x, y in zip(dCs, dt)])
axs[1].set_ylabel(r'First derivation of node number', fontsize=18)
axs[1].set_xlabel('Time (s)', fontsize=18)
plt.show()

Calculate the neighbor properties of all nodes as well as the nodes within the largest cluster...

In [None]:
# Node indices are laid out as [type-A nodes | type-B nodes]
indexA = np.arange(N_A)             # node index numbers of type-A molecules
indexB = np.arange(N_A, N_A + N_B)  # node index numbers of type-B molecules

# Re-arrange the per-frame neighbor counts (N_neighbor_time[frame][node]) into
# per-node time series, plus the same series normalized by the divisor used for
# that node type (6 for type A, 2 for type B).
N_neighbor_node = []         # N_neighbor_node[node][frame]: number of neighbors
N_neighbor_node_percen = []  # same, divided by 6 (type A) or 2 (type B)
n_frames = len(N_neighbor_time)
for node_index in range(len(N_neighbor_time[0])):
    N_neighbor_indi_node = [N_neighbor_time[time_index][node_index] for time_index in range(n_frames)]
    N_neighbor_node.append(N_neighbor_indi_node)
    binding_sites = 6 if node_index in indexA else 2
    N_neighbor_node_percen.append([x / binding_sites for x in N_neighbor_indi_node])

# For every frame, mean and std of the neighbor count over the nodes listed in
# giant_node[frame], kept separately as [type A, type B].
N_neighbor_node_giant = []      # [mean_A, mean_B] per frame
N_neighbor_node_giant_std = []  # [std_A, std_B] per frame
for time_index, N_giant_node in enumerate(giant_node):
    N_neighbor_tempA = []
    N_neighbor_tempB = []
    for i in N_giant_node:
        bucket = N_neighbor_tempA if i in indexA else N_neighbor_tempB
        bucket.append(N_neighbor_node[i][time_index])
    N_neighbor_node_giant.append([np.average(N_neighbor_tempA), np.average(N_neighbor_tempB)])
    N_neighbor_node_giant_std.append([np.std(N_neighbor_tempA), np.std(N_neighbor_tempB)])

Plot number of neighbors for all nodes or nodes within largest cluster over time ...

In [None]:
# --- Neighbor count averaged over ALL nodes of each type, with a +/- 1 std band ---
neighbors_A = [N_neighbor_node[i] for i in indexA]
neighbors_B = [N_neighbor_node[i] for i in indexB]
mean_A, std_A = np.average(neighbors_A, axis=0), np.std(neighbors_A, axis=0)
mean_B, std_B = np.average(neighbors_B, axis=0), np.std(neighbors_B, axis=0)

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.set_ylim(0, 6.5)
ax.plot(df.time, mean_A, color='blue', label='hexamer neighbors')
ax.plot(df.time, mean_B, color='red', label='dimer neighbors')
ax.fill_between(df.time, y1=mean_A - std_A, y2=mean_A + std_A, alpha=0.3, facecolor='gray')
ax.fill_between(df.time, y1=mean_B - std_B, y2=mean_B + std_B, alpha=0.3, color='gray')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.9))
ax.set_ylabel('Number of neighbors', fontsize=18)
ax.set_xlabel('Time (s)', fontsize=18)
plt.title('Average of all nodes')
plt.show()

# --- Same plot restricted to the nodes of the largest cluster in each frame ---
# N_neighbor_node_giant / _std hold one [type A, type B] pair per frame.
giant_mean_A = [frame_mean[0] for frame_mean in N_neighbor_node_giant]
giant_mean_B = [frame_mean[1] for frame_mean in N_neighbor_node_giant]
giant_low_A = [m[0] - s[0] for m, s in zip(N_neighbor_node_giant, N_neighbor_node_giant_std)]
giant_high_A = [m[0] + s[0] for m, s in zip(N_neighbor_node_giant, N_neighbor_node_giant_std)]
giant_low_B = [m[1] - s[1] for m, s in zip(N_neighbor_node_giant, N_neighbor_node_giant_std)]
giant_high_B = [m[1] + s[1] for m, s in zip(N_neighbor_node_giant, N_neighbor_node_giant_std)]

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.set_ylim(0, 6.5)
ax.plot(df.time, giant_mean_A, color='blue', label='hexamer neighbors')
ax.plot(df.time, giant_mean_B, color='red', label='dimer neighbors')
ax.fill_between(df.time, y1=giant_low_A, y2=giant_high_A, alpha=0.3, facecolor='gray')
ax.fill_between(df.time, y1=giant_low_B, y2=giant_high_B, alpha=0.3, facecolor='gray')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.9))
ax.set_ylabel('Number of neighbors', fontsize=18)
ax.set_xlabel('Time (s)', fontsize=18)
plt.title('Average of the nodes within largest cluster')
plt.show()

# Plot autocorrelation of number of neighbors for all nodes over time
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

# Largest lag shown: at most 100 frames. Axes.acorr requires maxlags to be
# strictly smaller than the series length (one sample per row of df), so cap it
# at df.shape[0]-1. The previous `>= 100` test yielded maxlags == 100 for a
# series of exactly 100 samples, which acorr rejects with a ValueError.
MAXLAG = min(100, df.shape[0] - 1)

axes[0].acorr(np.average([N_neighbor_node[i] for i in indexA], axis=0), maxlags=MAXLAG, color='blue', label='hexamer neighbors')
axes[1].acorr(np.average([N_neighbor_node[i] for i in indexB], axis=0), maxlags=MAXLAG, color='red', label='dimer neighbors')
axes[0].set_ylabel('Autocorrelation of \n node neighbor numbers', fontsize=18)
axes[0].set_xlabel('Time lag', fontsize=18)
axes[1].set_xlabel('Time lag', fontsize=18)
axes[0].set_title('Average of all hexamer neighbors')
axes[1].set_title('Average of all dimer neighbors')
plt.show()

# Plot autocorrelation of number of neighbors of nodes within largest cluster in the last frame over time
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
Final_giant_node = giant_node[-1]  # node indices of the largest cluster in the last frame
Final_giant_neighbor_A = []  # per type-A node (hexamer) of that cluster: neighbor count at all time points
Final_giant_neighbor_B = []  # per type-B node (dimer) of that cluster: neighbor count at all time points
for node_index in Final_giant_node:
    if node_index in indexA:
        Final_giant_neighbor_A.append(N_neighbor_node[node_index])
    else:
        Final_giant_neighbor_B.append(N_neighbor_node[node_index])
axes[0].acorr(np.average(Final_giant_neighbor_A, axis=0), maxlags=MAXLAG, color='blue', label='hexamer neighbors')
axes[1].acorr(np.average(Final_giant_neighbor_B, axis=0), maxlags=MAXLAG, color='red', label='dimer neighbors')
axes[0].set_ylabel('Autocorrelation of \n node neighbor numbers \n within last largest giant cluster', fontsize=18)
axes[0].set_xlabel('Time lag', fontsize=18)
axes[1].set_xlabel('Time lag', fontsize=18)
axes[0].set_title('Average of hexamer neighbors \n within last largest giant cluster')
axes[1].set_title('Average of dimer neighbors \n within last largest giant cluster')
plt.show()

# Plot colormap distribution of percentage neighbors and diffusivity for nodes within largest cluster.
# One five-panel figure is produced per frame (row of df):
#   1. 3D scatter of the largest cluster, colored by percentage of direct neighbors
#   2. X/Y projection, colored by percentage of direct neighbors
#   3. X/Y projection, colored by node diffusivity
#   4. X/Y projection with type-A and type-B nodes in different colors
#   5. node diffusivity vs. percentage of neighbors, per node type

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9; plt.get_cmap is the stable spelling
cmap = plt.get_cmap('jet')
ticks = np.arange(-0.5, 0.5, 0.2)


def _format_xy_projection(ax, time_s, tick_positions):
    """Apply the title, box limits, ticks and X/Y axis labels shared by the 2D projection panels."""
    ax.set_title(f't = {time_s:.3f} s')
    ax.set_xlim(-0.5, 0.5)
    ax.set_ylim(-0.5, 0.5)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xlabel(r'Box X ($\mu m$)')
    # The vertical axis shows the second coordinate; it was previously mislabeled 'Box X' on two panels
    ax.set_ylabel(r'Box Y ($\mu m$)')


for index, row in tqdm(df.iterrows(), total=df.shape[0]):
# for index, row in tqdm(df.head(n=5).iterrows(), total=5):
    g = row['graph']
    Coord = g.vs['coordinate']
    diffusivity = g.vs['diffusivity']  # fetch the vertex attribute list once per frame
    frame_time = df.time[index]

    # NOTE: `index` is used both as a df label and as a position into giant_node /
    # N_neighbor_node_percen, i.e. this relies on df having a 0..n-1 index.
    N_giant_node = giant_node[index]
    N_giant_coord = [Coord[i] for i in N_giant_node]
    N_giant_node_percen = [N_neighbor_node_percen[i][index] for i in N_giant_node]
    N_giant_diffusion = [diffusivity[i] for i in N_giant_node]

    # Split the cluster's nodes by molecule type
    N_giant_node_A = [i for i in N_giant_node if i in indexA]
    N_giant_node_B = [i for i in N_giant_node if i not in indexA]
    N_giant_coord_A = [Coord[i] for i in N_giant_node_A]
    N_giant_coord_B = [Coord[i] for i in N_giant_node_B]
    N_giant_node_percen_A = [N_neighbor_node_percen[i][index] for i in N_giant_node_A]
    N_giant_node_percen_B = [N_neighbor_node_percen[i][index] for i in N_giant_node_B]
    N_giant_diffusion_A = [diffusivity[i] for i in N_giant_node_A]
    N_giant_diffusion_B = [diffusivity[i] for i in N_giant_node_B]

    #First subplot - 3d node images
    fig = plt.figure(figsize=(30,4))
    gs = gridspec.GridSpec(1, 5, width_ratios=[1.5, 1.2, 1.2, 0.8, 0.8])
    ax3D = fig.add_subplot(gs[0], projection='3d')
    sc3D = ax3D.scatter(
        [p[0] for p in N_giant_coord], [p[1] for p in N_giant_coord], [p[2] for p in N_giant_coord],
        c=N_giant_node_percen, cmap=cmap, marker='o', alpha=0.7, s=8, linewidths=0, depthshade=0)
    ax3D.set_title(f't = {frame_time:.3f} s')
    cbar = plt.colorbar(sc3D,extend='neither')
    sc3D.set_clim(vmin=0,vmax=1)
    cbar.set_label('percentage of direct neighbors', fontsize=14)

    def forceUpdate(event): #Solve the problem of point color changing in 3d scatter plot compared to 2d scatter plot
        global sc3D
        sc3D.changed()
    fig.canvas.mpl_connect('draw_event', forceUpdate)

    ax3D.set_xlim(-0.5,0.5)
    ax3D.set_ylim(-0.5,0.5)
    ax3D.set_zlim(-0.5,0.5)
    ax3D.set_xticks(ticks)
    ax3D.set_yticks(ticks)
    ax3D.set_zticks(ticks)

    ax3D.set_xlabel(r'Box X ($\mu m$)')
    ax3D.set_ylabel(r'Box Y ($\mu m$)')
    ax3D.set_zlabel(r'Box Z ($\mu m$)')

    #Second subplot - projection on x/y axis with node neighbor percentage as colorcode
    ax = fig.add_subplot(gs[1])
    sc = ax.scatter(
        [p[0] for p in N_giant_coord], [p[1] for p in N_giant_coord],
        c=N_giant_node_percen, cmap=cmap, marker='o', s=5, linewidths=0)
    cbar = plt.colorbar(sc,extend='neither')
    sc.set_clim(vmin=0,vmax=1)
    cbar.set_label('percentage of direct neighbors', fontsize=14)
    _format_xy_projection(ax, frame_time, ticks)

    #Third subplot - projection on x/y axis with diffusivity as colorcode
    # Diffusivity is divided by 1e-12 to match the um^2/s unit on the colorbar label
    ax = fig.add_subplot(gs[2])
    sc = ax.scatter(
        [p[0] for p in N_giant_coord], [p[1] for p in N_giant_coord],
        c=[d/(1e-12) for d in N_giant_diffusion], cmap=cmap, marker='o', s=5, linewidths=0)
    cbar = plt.colorbar(sc,extend='neither')
    sc.set_clim(vmin=0,vmax=1.0)
    cbar.set_label(r'Avg. node diffusivity ($\mu m^2/s$)', fontsize=14)
    _format_xy_projection(ax, frame_time, ticks)

    #Fourth subplot - A/B node separation projection on x/y axis
    # (no cmap here: a constant color is used, so a colormap would be ignored anyway)
    ax = fig.add_subplot(gs[3])
    sc = ax.scatter([p[0] for p in N_giant_coord_A], [p[1] for p in N_giant_coord_A], c='m', marker='o', label='A: Hexamer', s=5, alpha=0.5)
    sc = ax.scatter([p[0] for p in N_giant_coord_B], [p[1] for p in N_giant_coord_B], c='y', marker='o', label='B: Dimer', s=5, alpha=0.5)
#     ax.legend(loc='center left', bbox_to_anchor=(1, 0.9))
    _format_xy_projection(ax, frame_time, ticks)

    #Fifth subplot - Plots of number of neighbors for each node vs. node diffusivity
    ax = fig.add_subplot(gs[4])
    sc = ax.plot(N_giant_node_percen_A, [d/(1e-12) for d in N_giant_diffusion_A], 'mo', label='A: Hexamer', alpha=0.5)
    sc = ax.plot(N_giant_node_percen_B, [d/(1e-12) for d in N_giant_diffusion_B], 'yo', label='B: Dimer', alpha=0.5)
    ax.set_title(f't = {frame_time:.3f} s')

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.9))
    ax.set_ylim(-0.005,1.0)

    ax.set_xlabel(r'Percentage of node neighbors')
    ax.set_ylabel(r'Avg. node diffusivity ($\mu m^2/s$)')

    plt.show()
    

Calculating topological properties of largest cluster...

In [None]:
# Per-frame statistics of the connected components (CCs) of each graph, written
# back into df: giant_size, giant_diameter, mean_others_size, std_others_size
# and others_size (histogram of the sizes of all non-giant components).
df['others_size'] = None  # object column: each cell will hold a numpy array

for index, row in tqdm(df.iterrows(), total=df.shape[0]):

    g = row['graph']

    # Connected components and the largest ("giant") one
    ccs = g.clusters()
    giant = ccs.giant()
    giant_vcount = giant.vcount()

    # Size of the giant component in nodes (giant.ecount() would give #connections instead)
    df.loc[index, 'giant_size'] = giant_vcount

    # Diameter = longest shortest path between any pair of nodes, here weighted
    # by the edge attribute 'length'
    df.loc[index, 'giant_diameter'] = giant.diameter(weights=giant.es['length'])

    # Sizes of all components (same as [sub.vcount() for sub in ccs.subgraphs()]),
    # minus one entry equal to the giant component's size
    sizes = ccs.sizes()
    sizes.remove(giant_vcount)

    if sizes:
        others_mean, others_std = np.mean(sizes), np.std(sizes)
    else:
        # No other component in this frame
        others_mean, others_std = None, None
    df.loc[index, 'mean_others_size'] = others_mean
    df.loc[index, 'std_others_size'] = others_std

    # Frequency of clusters with size 0, 1, 2..
    hist = np.bincount(sizes)
    df.at[index, 'others_size'] = hist

Show results...

In [None]:
# Number of leading frames shown in the plots of this cell
N_FRAMES_SHOWN = 77
time_shown = df.time[:N_FRAMES_SHOWN]
giant_size_shown = df.giant_size[:N_FRAMES_SHOWN]

# Number of nodes within the largest cluster vs. time
# ymax = 1.1*df.giant_size.max()
fig, ax1 = plt.subplots(1,1, figsize=(6,4))
ax1.plot(time_shown, giant_size_shown, color='blue', label='Number of nodes within largest cluster')
ax1.set_ylabel("Number of nodes \n within largest cluster", fontsize=18, color='blue')
ax1.set_xlabel('Time (s)', fontsize=18)
# ax1.set_ylim(0,280)
ax1.set_ylim(0,250)
plt.show()

# Weighted (topological) diameter of the largest cluster vs. time.
# x and y now use the same frames; previously x was df.time[1:78], which shifted
# the curve by one frame and failed when df had fewer than 78 rows.
fig, ax2 = plt.subplots(1,1, figsize=(6,4))
ax2.set_ylabel('Topological diameter \n of largest cluster (µm)', fontsize=18, color='red')
ax2.plot(time_shown, df.giant_diameter[:N_FRAMES_SHOWN], color='red', label='Topological diameter of largest cluster')
ax2.set_ylim(0,1.2)
# ax1.legend(loc='center left', bbox_to_anchor=(1.15, 0.9))
# ax2.legend(loc='center left', bbox_to_anchor=(1.15, 0.8))
plt.show()  # was `plt.show` without parentheses, i.e. never called
# fig.savefig('./Ensemble-node-links/box_Viscosity_trail19_(1)_nodelinks/giant cluster trail19 (1).pdf')

#Plot first derivative of nodes number within largest cluster vs. time
dt = np.diff(time_shown)
dGDs = np.diff(giant_size_shown)
fig, ax = plt.subplots(1,1, figsize=(6,4))
# The finite difference has one sample fewer than the series, so it is plotted from the second frame on
ax.plot(df.time[1:N_FRAMES_SHOWN], [x/y for x, y in zip(dGDs, dt)])
ax.set_ylabel("First derivative of node number \n within largest cluster", fontsize=18)
ax.set_xlabel('Time (s)', fontsize=18)
plt.show()

Plotting the mean size (± std) of the remaining connected components (giant cluster excluded) over time...

In [None]:
# Mean size of the non-giant connected components over time, with a +/- 1 std band.
# xmax = df.time.max()
xmax = 0.77
ymax = 1.1*df.giant_size.max()

# Only rows without any missing value are plotted
df_sub = df.dropna()
band_low = df_sub.mean_others_size - df_sub.std_others_size
band_high = df_sub.mean_others_size + df_sub.std_others_size

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
ax.fill_between(df_sub.time, y1=band_low, y2=band_high, alpha=0.3, color='gray')
ax.plot(df_sub.time, df_sub.mean_others_size, '-o', color='black')
ax.set_xlabel('Time (s)', fontsize=18)
ax.set_ylabel('Size of remaining CCs', fontsize=18)
ax.set_xlim(0, xmax)
ax.set_ylim(0, 30)
plt.show()

In [None]:
# Heatmap of the cluster-size distribution over time (giant cluster excluded).
# Rows are cluster sizes, columns are frames, color is ln(1 + number of clusters).

# Length of every non-empty size histogram among the first 77 frames; the
# largest one sets the number of rows of the heatmap.
nmax = [
    len(df.others_size[index])
    for index in df.index[:77]
    if df.others_size[index].size
]

# Empty heatmap: one column per frame of df
data = np.zeros((np.max(nmax), df.shape[0]), dtype=np.uint64)

# Copy each frame's histogram into its column
for i, index in enumerate(df.index[:77]):
    hist = df.others_size[index]
    data[:len(hist), i] = hist

fig, ax = plt.subplots(1, 1, figsize=(12, 4))
# Only cluster sizes below 40 and the first 77 frames are displayed
sc0 = ax.imshow(np.log(1 + data[:40, :77]), cmap='jet')
# The x axis counts frames; the label gives the spacing between the first two times of df_sub
ax.set_xlabel(f'Time (x {np.diff(df_sub.time)[0]:.3f} s)', fontsize=18)
ax.set_ylabel('Cluster size', fontsize=18)
clbar = fig.colorbar(sc0, extend='neither')
sc0.set_clim(vmin=0, vmax=6.8)
clbar.set_label('ln(1 + #Clusters)', fontsize=14)
plt.show()

Fraction of neighbors that remain the same between consecutive frames

In [None]:
# For every pair of consecutive rows of df, compare each node's neighborhood in
# the two graphs and store the mean/std of the per-node overlap fraction in the
# later row (columns 'frac_sim_neighs_avg' / 'frac_sim_neighs_std').
# Only rows df.index[1:] are written by this loop.
for idxi, idxj in tqdm(zip(df.index[:-1],df.index[1:]), total=df.shape[0]-1):
    
    # Load graph at time   t: gi
    # Load graph at time t+1: gj
    gi = df.graph[idxi]
    gj = df.graph[idxj]
    
    # Get list of neighbors (one list per vertex)
    # NOTE(review): with igraph's default arguments, neighborhood() presumably
    # includes the vertex itself in its own list; if so, every overlap below is
    # at least 1 and the fraction is biased upwards (isolated nodes score 1.0).
    # Confirm whether that is intended.
    neighi = gi.neighborhood()
    neighj = gj.neighborhood()
        
    fraction = []
    
    # Check the fraction of nodes with unchanged neighborhood
    # zip pairs the k-th list of both graphs, i.e. this assumes both graphs use the same vertex order
    for nik, njk in zip(neighi,neighj):
        
        # Intersection between the two sets of neighs
        common = set(njk).intersection(set(nik))
        
        # Overlap normalized by the larger of the two neighborhoods
        fraction.append(len(common) / np.max([len(nik),len(njk)]))
        
    df.loc[idxj,'frac_sim_neighs_avg'] = np.mean(fraction)
    df.loc[idxj,'frac_sim_neighs_std'] =  np.std(fraction)

# Plot results: mean fraction over time with a +/- 1 std band
# xmax = df.time.max()
xmax = 0.77
fig, ax = plt.subplots(1,1, figsize=(6,4))
ax.set_ylabel('Fraction of unchanged neighbors', fontsize=18)
ax.set_xlabel('Time (s)', fontsize=18)
ax.set_xlim(0,xmax)
ax.set_ylim(0.6,1.1)
ax.fill_between(
    df.time,
    y1 = df.frac_sim_neighs_avg-df.frac_sim_neighs_std,
    y2 = df.frac_sim_neighs_avg+df.frac_sim_neighs_std, alpha=0.3, color='gray')
ax.plot(df.time,df.frac_sim_neighs_avg, '-o', color='black')

# Plot zoom-in results from 0-0.25s
# The data are restricted to the first 25 rows of df; the x-limits are set to 0-0.25 separately
fig, ax_zoom = plt.subplots(1,1, figsize=(1,4))
# ax_zoom.set_ylabel('Fraction of unchanged neighbors', fontsize=18)
ax_zoom.set_xlabel('Time (s)', fontsize=18)
ax_zoom.set_xlim(0,0.25)
ax_zoom.set_ylim(0.6,1.1)
ax_zoom.fill_between(
    df.time[0:25],
    y1 = df.frac_sim_neighs_avg[0:25]-df.frac_sim_neighs_std[0:25],
    y2 = df.frac_sim_neighs_avg[0:25]+df.frac_sim_neighs_std[0:25], alpha=0.3, color='gray')
ax_zoom.plot(df.time[0:25],df.frac_sim_neighs_avg[0:25], '-o', color='black')
plt.show()

Time aggregated graph and hierarchical clustering

In [None]:
# Time-aggregated graphs and hierarchical clustering.
# Consecutive frames are grouped into bins of about n_agg_graphs frames. For each
# bin the 'length'-weighted adjacency matrices are summed, turned into an
# aggregated graph, and the matrix of shortest-path distances between all nodes
# is clustered (single linkage) and shown next to its dendrogram.

# Number of graphs to be aggregated together
n_agg_graphs = 3 # how many time frames (in the selected nodelink files) are used for each aggregation, not second in unit

times = df['time'].values
tstep = np.median(np.diff(df.time.values))
# builtin int instead of np.int (alias removed in NumPy 1.24)
nbins = int(np.round(len(times)/n_agg_graphs))+1

# nbins = int(np.round((times.max()-times.min())/(n_agg_graphs*tstep)))+1
# df['agg_time'] = np.digitize(
#     times,
#     np.linspace(times.min(),times.max(),nbins)
# )

# Bin the frame positions 0..len(times)-1 into nbins-1 equally wide groups
df['agg_index'] = np.digitize(
    range(0,len(times)),
    np.linspace(0,len(times),nbins)
)

for agg_time, df_agg in df.groupby('agg_index'):

    # Sum the weighted adjacency matrices of all graphs in this bin
    adj = df_agg.graph[df_agg.index[0]].get_adjacency(attribute='length')

    for g in df_agg.graph.values[1:]:
        adj += g.get_adjacency(attribute='length')

    # Force a float array so the in-place reweighting below can never be truncated to integers
    adj = np.array(adj.data, dtype=float).reshape(adj.shape)

    # Min and max in the weighted adj matrix
    dmax = adj.max()
    dmin = adj[adj>0].min()

    # Neighbors more often connected have low weight:
    # reflect every positive entry around (dmin+dmax)/2, i.e. w -> dmin+dmax-w
    adj[adj>0] = np.abs( adj[adj>0] - (dmin+dmax) )
    adj = adj / n_agg_graphs

    # Create a graph from the aggregated adj matrix and let igraph attach each
    # matrix entry to its edge as 'length'. The previous code assigned
    # adj[adj.nonzero()] to g_agg.es: for a symmetric matrix that array lists
    # every undirected edge twice, so it did not line up one-to-one with the edges.
    g_agg = igraph.Graph.Weighted_Adjacency(adj.tolist(), mode=igraph.ADJ_UNDIRECTED, attr='length')

    # Calculate shortest paths length
    spl = g_agg.shortest_paths_dijkstra(weights=g_agg.es['length'])
    distances = np.array(spl).reshape(adj.shape)

    # Replace infinities (pairs with no connecting path) with the largest finite distance
    distances[np.isinf(distances)] = distances[~np.isinf(distances)].max()

    # Clustering
    threshold = 0.3
    fig, axs = plt.subplots(1,2,figsize=(12,4))
    linkage = hierarchy.linkage(distances, method="single")
    clusters = hierarchy.fcluster(linkage, threshold, criterion="distance")
    dend = hierarchy.dendrogram(linkage, color_threshold=threshold, ax=axs[1])
    # Reorder rows and columns so that nodes follow the dendrogram leaf order
    order = dend['leaves']
    distances = distances[order,:]
    distances = distances[:,order]

    #After clustering, convert the largest distances (& not indirectly connected nodes) to nan and plot as white in the colormap
    max_index2 = np.where(distances == np.amax(distances))
    distances[max_index2] = np.nan
    # plt.cm.get_cmap was removed in Matplotlib 3.9. The colormap is also passed to
    # imshow explicitly, because recent Matplotlib returns a copy from the lookup
    # and set_bad() on that copy would otherwise not affect the image.
    current_cmap = plt.get_cmap()
    current_cmap.set_bad(color='white')

    sc1 = axs[0].imshow(distances, cmap=current_cmap)
#     cbar = fig.colorbar(sc1,ax=axs[0])
    cbar = fig.colorbar(sc1, ax=axs[0],extend='neither')
    sc1.set_clim(vmin=0,vmax=0.9)

    cbar.set_label(r'Adjusted frame-wise distance ($\mu m$)', fontsize=14)
    axs[0].set_title(f'Aggregate: {df_agg.time.min():.3f} to {df_agg.time.max():.3f} s')
    axs[1].set_xlabel("Node")
#     axs[1].set_ylabel("Dissimilarity")
    plt.show()

Diffusivity as a function of cluster size

In [None]:
# Build df_diff: one row per (frame, connected component) with the mean and std
# of the vertex attribute 'diffusivity' inside the component, the frame time and
# the number of nodes ('nmols') in the component.
# The per-frame tables are collected in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies all previous rows on every iteration.
cluster_frames = []

for index in tqdm(df.index):

    g = df.graph[index]

    ccs = g.clusters()

    # One row per node: component id and the node's diffusivity
    df_cc = pd.DataFrame([{'cluster': m, 'diffusivity': d} for (m,d) in zip(ccs.membership,g.vs['diffusivity'])])
    sizes = df_cc.groupby('cluster').size()
    # Columns become ('diffusivity','mean') and ('diffusivity','std'), indexed by component id
    df_cc = df_cc.groupby('cluster').agg(['mean','std'])
    df_cc['time'] = df.time[index]
    df_cc['nmols'] = sizes  # aligned on the component id

    cluster_frames.append(df_cc)

if cluster_frames:
    df_diff = pd.concat(cluster_frames, axis=0, ignore_index=True)
else:
    # No frames at all: keep the empty frame the loop used to start from
    df_diff = pd.DataFrame([])


In [None]:
# Diffusivity / cluster-size scaling over the time window [timemin, timemax]:
#   fig  - per-cluster mean diffusivity vs. cluster size, colored by time
#   fig2 - mean diffusivity per cluster size, with a power-law (log-log linear) fit
#   fig3 - mean cluster size per time point, with a power-law fit

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9
cmap = plt.get_cmap('jet')
fig, ax = plt.subplots(1,1, figsize=(6,4))
# timemax = max(df.time)
timemax = 0.77
timemin = 0.0
# Row labels of df_diff that fall inside the time window
timeindex = df_diff.time[(df_diff.time >= timemin) & (df_diff.time <= timemax)].index
# Diffusivity is divided by 1e-12 to match the um^2/s unit on the axis label
sc = ax.scatter(df_diff.nmols[timeindex],df_diff[('diffusivity','mean')][timeindex]/(1e-12), s=1,c=df_diff.time[timeindex], cmap=cmap)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)
ax.set_xlim(0.75,300)
ax.set_ylim(0.18,1.3)
# ax.set_ylim(0.25,1.3)
cbar = plt.colorbar(sc,extend='neither')
sc.set_clim(vmin=timemin,vmax=timemax)
cbar.set_label('time (s)', fontsize=14)

# Plot the average molecular diffusion constants vs the cluster size that molecules are within
fig2, ax2 = plt.subplots(1,1, figsize=(6,4))
df_diff_timeindex = pd.DataFrame({"cluster_size": df_diff.nmols[timeindex], "Diff_cluster_size": df_diff[('diffusivity','mean')][timeindex]})
Diff_cluster_size = df_diff_timeindex.groupby('cluster_size').agg(['mean','std'])
Diff_cluster_size = Diff_cluster_size.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(Diff_cluster_size["cluster_size"]),np.log10(Diff_cluster_size[('Diff_cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
Diff_cluster_size_fit = [pow(10,i)/(1e-12) for i in poly1d_fn(np.log10(Diff_cluster_size["cluster_size"]))]
# Draw on ax2 explicitly instead of relying on pyplot's "current axes"
ax2.errorbar(Diff_cluster_size["cluster_size"], Diff_cluster_size[('Diff_cluster_size','mean')]/(1e-12), yerr = Diff_cluster_size[('Diff_cluster_size','std')]/(1e-12), xerr = None)
ax2.plot(Diff_cluster_size["cluster_size"],Diff_cluster_size_fit,'--r')
ax2.text(50, 0.5, f'Exponent $\\alpha$={round(coef[0], 2)}')
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax2.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)

# Plot the average cluster size vs time
fig3, ax3 = plt.subplots(1,1, figsize=(6,4))
cluster_timeindex = pd.DataFrame({"time": df_diff.time[timeindex],"cluster_size": df_diff.nmols[timeindex]})
cluster_time = cluster_timeindex.groupby("time").agg(['mean','std'])
cluster_time = cluster_time.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(cluster_time["time"]),np.log10(cluster_time[('cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
cluster_time_fit = [pow(10,i) for i in poly1d_fn(np.log10(cluster_time["time"]))]
ax3.errorbar(cluster_time["time"], cluster_time[('cluster_size','mean')], yerr = cluster_time[('cluster_size','std')], xerr = None)
ax3.plot(cluster_time['time'],cluster_time_fit,'--r')
ax3.text(0.01, 20, f'Exponent $\\alpha$={round(coef[0], 2)}')
# ax3.set_ylim([1,35])
ax3.set_xscale('log')
ax3.set_yscale('log')
ax3.set_xlabel('time / s', fontsize=18)
ax3.set_ylabel(r'Avg. cluster size (#molecules)', fontsize=18)

plt.show()

In [None]:
# Diffusivity / cluster-size scaling restricted to the early window [timemin, timemax] = [0, 0.2] s:
#   fig  - per-cluster mean diffusivity vs. cluster size, colored by time
#   fig2 - mean diffusivity per cluster size, with a power-law (log-log linear) fit
#   fig3 - mean cluster size per time point, with a power-law fit

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9
cmap = plt.get_cmap('jet')
fig, ax = plt.subplots(1,1, figsize=(6,4))
# timemax = max(df.time)
timemax = 0.2
timemin = 0.0
# Row labels of df_diff that fall inside the time window
timeindex = df_diff.time[(df_diff.time >= timemin) & (df_diff.time <= timemax)].index
# Diffusivity is divided by 1e-12 to match the um^2/s unit on the axis label
sc = ax.scatter(df_diff.nmols[timeindex],df_diff[('diffusivity','mean')][timeindex]/(1e-12), s=1,c=df_diff.time[timeindex], cmap=cmap)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)
ax.set_xlim(0.75,300)
ax.set_ylim(0.18,1.3)
# ax.set_ylim(0.25,1.3)
cbar = plt.colorbar(sc,extend='neither')
sc.set_clim(vmin=timemin,vmax=timemax)
cbar.set_label('time (s)', fontsize=14)

# Plot the average molecular diffusion constants vs the cluster size that molecules are within
fig2, ax2 = plt.subplots(1,1, figsize=(6,4))
df_diff_timeindex = pd.DataFrame({"cluster_size": df_diff.nmols[timeindex], "Diff_cluster_size": df_diff[('diffusivity','mean')][timeindex]})
Diff_cluster_size = df_diff_timeindex.groupby('cluster_size').agg(['mean','std'])
Diff_cluster_size = Diff_cluster_size.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(Diff_cluster_size["cluster_size"]),np.log10(Diff_cluster_size[('Diff_cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
Diff_cluster_size_fit = [pow(10,i)/(1e-12) for i in poly1d_fn(np.log10(Diff_cluster_size["cluster_size"]))]
# Draw on ax2 explicitly instead of relying on pyplot's "current axes"
ax2.errorbar(Diff_cluster_size["cluster_size"], Diff_cluster_size[('Diff_cluster_size','mean')]/(1e-12), yerr = Diff_cluster_size[('Diff_cluster_size','std')]/(1e-12), xerr = None)
ax2.plot(Diff_cluster_size["cluster_size"],Diff_cluster_size_fit,'--r')
ax2.text(250, 0.5, f'Exponent $\\alpha$={round(coef[0], 2)}')
ax2.set_ylim([0.22,0.7])
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax2.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)

# Plot the average cluster size vs time
fig3, ax3 = plt.subplots(1,1, figsize=(6,4))
cluster_timeindex = pd.DataFrame({"time": df_diff.time[timeindex],"cluster_size": df_diff.nmols[timeindex]})
cluster_time = cluster_timeindex.groupby("time").agg(['mean','std'])
cluster_time = cluster_time.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(cluster_time["time"]),np.log10(cluster_time[('cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
cluster_time_fit = [pow(10,i) for i in poly1d_fn(np.log10(cluster_time["time"]))]
ax3.errorbar(cluster_time["time"], cluster_time[('cluster_size','mean')], yerr = cluster_time[('cluster_size','std')], xerr = None)
ax3.plot(cluster_time['time'],cluster_time_fit,'--r')
ax3.text(0.01, 25, f'Exponent $\\alpha$={round(coef[0], 2)}')
ax3.set_ylim([1,20])
ax3.set_xscale('log')
ax3.set_yscale('log')
ax3.set_xlabel('time / s', fontsize=18)
ax3.set_ylabel(r'Avg. cluster size (#molecules)', fontsize=18)

plt.show()

In [None]:
# Diffusivity / cluster-size scaling restricted to the late window [timemin, timemax] = [0.2, 0.77] s:
#   fig  - per-cluster mean diffusivity vs. cluster size, colored by time
#   fig2 - mean diffusivity per cluster size, with a power-law (log-log linear) fit
#   fig3 - mean cluster size per time point, with a power-law fit

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9
cmap = plt.get_cmap('jet')
fig, ax = plt.subplots(1,1, figsize=(6,4))
# timemax = max(df.time)
timemax = 0.77
# timemin = max(df.time)*0
timemin = 0.2
# Row labels of df_diff that fall inside the time window
timeindex = df_diff.time[(df_diff.time >= timemin) & (df_diff.time <= timemax)].index
# Diffusivity is divided by 1e-12 to match the um^2/s unit on the axis label
sc = ax.scatter(df_diff.nmols[timeindex],df_diff[('diffusivity','mean')][timeindex]/(1e-12), s=1,c=df_diff.time[timeindex], cmap=cmap)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)
ax.set_xlim(0.75,300)
ax.set_ylim(0.18,1.3)
# ax.set_ylim(0.25,1.3)
cbar = plt.colorbar(sc,extend='neither')
sc.set_clim(vmin=timemin,vmax=timemax)
cbar.set_label('time (s)', fontsize=14)

# Plot the average molecular diffusion constants vs the cluster size that molecules are within
fig2, ax2 = plt.subplots(1,1, figsize=(6,4))
df_diff_timeindex = pd.DataFrame({"cluster_size": df_diff.nmols[timeindex], "Diff_cluster_size": df_diff[('diffusivity','mean')][timeindex]})
Diff_cluster_size = df_diff_timeindex.groupby('cluster_size').agg(['mean','std'])
Diff_cluster_size = Diff_cluster_size.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(Diff_cluster_size["cluster_size"]),np.log10(Diff_cluster_size[('Diff_cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
Diff_cluster_size_fit = [pow(10,i)/(1e-12) for i in poly1d_fn(np.log10(Diff_cluster_size["cluster_size"]))]
# Draw on ax2 explicitly instead of relying on pyplot's "current axes"
ax2.errorbar(Diff_cluster_size["cluster_size"], Diff_cluster_size[('Diff_cluster_size','mean')]/(1e-12), yerr = Diff_cluster_size[('Diff_cluster_size','std')]/(1e-12), xerr = None)
ax2.plot(Diff_cluster_size["cluster_size"],Diff_cluster_size_fit,'--r')
ax2.text(250, 0.5, f'Exponent $\\alpha$={round(coef[0], 2)}')
ax2.set_ylim([0.22,0.7])
ax2.set_xscale('log')
ax2.set_yscale('log')
ax2.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax2.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)

# Plot the average cluster size vs time
fig3, ax3 = plt.subplots(1,1, figsize=(6,4))
cluster_timeindex = pd.DataFrame({"time": df_diff.time[timeindex],"cluster_size": df_diff.nmols[timeindex]})
cluster_time = cluster_timeindex.groupby("time").agg(['mean','std'])
cluster_time = cluster_time.reset_index() #Reset the dataframe to regular column
# Linear fit in log-log space; the slope coef[0] is the power-law exponent
coef = np.polyfit(np.log10(cluster_time["time"]),np.log10(cluster_time[('cluster_size','mean')]),1)
poly1d_fn = np.poly1d(coef) # poly1d_fn is now a function which takes in x and returns an estimate for y
cluster_time_fit = [pow(10,i) for i in poly1d_fn(np.log10(cluster_time["time"]))]
ax3.errorbar(cluster_time["time"], cluster_time[('cluster_size','mean')], yerr = cluster_time[('cluster_size','std')], xerr = None)
ax3.plot(cluster_time['time'],cluster_time_fit,'--r')
ax3.text(0.25, 35, f'Exponent $\\alpha$={round(coef[0], 2)}')
ax3.set_ylim([2,30])
ax3.set_xscale('log')
ax3.set_yscale('log')
ax3.set_xlabel('time / s', fontsize=18)
ax3.set_ylabel(r'Avg. cluster size (#molecules)', fontsize=18)

plt.show()

### Replot figures for the paper:

* Export the figures as vector graphics (PDF/SVG)
* Replot the graph-represented clusters at t = 0.01 s, 0.25 s, 0.50 s and 0.77 s with a rescaled colorbar
* Overlay the histograms of the cluster size distribution at the same time points (t = 0.01 s, 0.25 s, 0.50 s, 0.77 s)
* Plot the time course of the cluster that is the largest one at the last time point
* Replot cluster diffusivity vs. cluster size, combining all time points: draw the individual points in a single color in the background and overlay the mean and standard deviation for each cluster size

1. Extract selected time points for graph represented clusters

In [None]:
## Change figure output format from 'png' to 'pdf/svg' vectoried figure format
%config InlineBackend.figure_formats = ['pdf','svg']

## Graph theory represented cluter replot
giant_node_replot = [] #At each time points, node index within the largest cluster
ORDER_replot = [] #Record nodes order in the rearranged cluster graphs
CCSLISTSIZE_replot = [] #Record cluster size at all selected timepoints

selected_time = [0.01,0.25,0.50,0.77]
Select_replot_df = df.loc[df['time'] == selected_time[0]] #Create new dataframe for replotting purpose with selected timepoints
for i in selected_time[1:len(selected_time)]:
    Select_replot_df = Select_replot_df.append(df.loc[df['time'] == i])
    
# for index, row in tqdm(df.iterrows(), total=df.shape[0]):
# for index, row in tqdm(df.head(n=2).iterrows(), total=2):
for index,row in tqdm(Select_replot_df.iterrows(), total=Select_replot_df.shape[0]):
    g = row['graph']
    ccs = g.clusters()
    ccslistsize = list(ccs.sizes())
    giant_node_replot.append(ccs[ccslistsize.index(max(ccslistsize))])
    CCSLISTSIZE_replot.append(ccslistsize)
    
    # Calculate shortest paths length between nodes
    spl_total1 = []
    N_connect_total = []
    for v_source in range(0,g.vcount()):
        spl = g.shortest_paths_dijkstra(v_source, g.vs(), weights=g.es['length'])
        spl_total1.append(spl)
        N_connect = [float(len(i)) for i in g.get_shortest_paths(v_source, g.vs())] #Number of nodes connecting v_source and v_target (including v_source and v_target)
        N_connect_total.append(N_connect)
    distances = np.array(spl_total1).reshape((g.vcount(),g.vcount()))
    N_connect_total = np.array(N_connect_total).reshape((g.vcount(),g.vcount()))

    # Replace infinities with a very large distance
    distances[np.isinf(distances)] = distances[~np.isinf(distances)].max()
    
    # Clustering
    threshold = 1
    fig, axs = plt.subplots(1,2,figsize=(35,6))
    sys.setrecursionlimit(10000) #Required for much larger system as trail20_(5-5) which has 1170 type A molecules
    linkage = hierarchy.linkage(distances, method="single")
    clusters = hierarchy.fcluster(linkage, threshold, criterion="distance")
    dend = hierarchy.dendrogram(linkage, color_threshold=threshold, ax=axs[1])
    order = dend['leaves']
    ORDER_replot = np.append(ORDER_replot,order,axis=0)
    distances = distances[order,:]
    distances = distances[:,order]

#   Plot all the analysis results  
    #After clustering, convert the largest distances (& not indirectly connected nodes) to nan and plot as white in the colormap
    max_index = np.where(distances == np.amax(distances)) 
    distances[max_index] = np.nan
    no_connect_index = np.where(N_connect_total == 0)
    N_connect_total[no_connect_index] = np.nan
    current_cmap = plt.cm.get_cmap()
    current_cmap.set_bad(color='white')
    
    [x,y] = np.where(Direct_neighbor == 1) #Extract x,y node position for those are direct neighbors
    
    sc0 = axs[0].imshow(distances)
    cbar0 = fig.colorbar(sc0, ax=axs[0],extend='neither')
    sc0.set_clim(vmin=0,vmax=0.8)
    cbar0.set_label(r'Topological shortest distance ($\mu m$)', fontsize=18)
    axs[0].set_title(f't = {df.time[index]:.3f} s', fontsize=18)

    plt.show()

2. Overlay the histogram of cluster size distribution at the above selected time points

In [None]:
# Overlay the cluster-size distributions of the selected time points as step histograms.
fig_hist, ax_hist = plt.subplots()
for t_sel, sizes in zip(selected_time, CCSLISTSIZE_replot):
    # Cluster sizes are integers: use unit-width bins [s, s+1) so every size gets its own bin.
    # The upper edge must be max+1 (hence max+2 in range); with max+1 the two largest sizes share
    # the last, closed bin and a frame where all clusters have the same size yields a single edge.
    bin_edges = range(min(sizes), max(sizes)+2, 1)
    ax_hist.hist(sizes, bins=bin_edges, histtype='step', density=True, label=str(t_sel)+'s', fill=False)

ax_hist.set_ylabel('Probability', fontsize=15)
ax_hist.set_xlabel('Cluster size', fontsize=15)
ax_hist.legend(loc='center left', bbox_to_anchor=(1, 0.9))
plt.show()

# print(CCSLISTSIZE_replot)

3. Replot cluster diffusivity vs. cluster size, combining all time points: the individual points are drawn in a single color in the background, overlaid with the mean (not the median) and standard deviation for each cluster size

In [None]:
## Combined 1st and 2nd figure of Deff vs. cluster size, without time encoded as colour
# plt.cm.get_cmap was removed in matplotlib 3.9; plt.get_cmap is available on all versions.
cmap = plt.get_cmap('jet') # only needed when the points are coloured by time
fig, ax = plt.subplots(1,1, figsize=(6,4))
timemax = 0.77
timemin = max(df.time)*0
# Index labels of all df_diff rows inside the [timemin, timemax] window
timeindex = df_diff.time[(df_diff.time >= timemin) & (df_diff.time <= timemax)].index
# Background: one grey dot per df_diff row; the 1e-12 division matches the um^2/s axis label
# (presumably the stored diffusivity is in m^2/s - TODO confirm)
sc = ax.scatter(df_diff.nmols[timeindex],df_diff[('diffusivity','mean')][timeindex]/(1e-12), s=1, c='grey')

# Foreground: mean and standard deviation of the diffusivity for every cluster size
df_diff_timeindex = pd.DataFrame({"cluster_size": df_diff.nmols[timeindex], "Diff_cluster_size": df_diff[('diffusivity','mean')][timeindex]})
Diff_cluster_size = df_diff_timeindex.groupby('cluster_size').agg(['mean','std'])
Diff_cluster_size = Diff_cluster_size.reset_index() # turn the 'cluster_size' group key back into a regular column
ax.errorbar(Diff_cluster_size["cluster_size"], Diff_cluster_size[('Diff_cluster_size','mean')]/(1e-12), yerr = Diff_cluster_size[('Diff_cluster_size','std')]/(1e-12), xerr = None)
ax.set_xlim(0.75,300)
ax.set_ylim(0.17,1.35)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Cluster size (#molecules)', fontsize=18)
ax.set_ylabel(r'Avg. diffusivity ($\mu m^2/s$)', fontsize=18)

plt.show()

4. Time course of tracking the largest cluster at the last time point

In [None]:
#Plot the number of nodes within the first tracked cluster (largest cluster at the last time point) over time
timemax = 0.77
timemin = max(df.time)*0
timemax_index = int(timemax/0.01);
timemin_index = int(timemin/0.01);

dt = np.diff(reversed_df.time[-timemax_index:])
dCs = np.diff(Cluster_size[-timemax_index:])
fig,axs = plt.subplots(1,2, figsize=(15,6))
axs[0].plot(reversed_df.time[-timemax_index:],Cluster_size[-timemax_index:])
axs[0].set_ylabel(r'Number of nodes within first tracked cluster', fontsize=18)
axs[0].set_xlabel('Time (s)', fontsize=18)
axs[0].set_ylim(0,220)

axs[1].plot(reversed_df.time[-(timemax_index-1):],[x/y for x, y in zip(dCs, dt)])
axs[1].set_ylabel(r'First derivation of node number', fontsize=18)
axs[1].set_xlabel('Time (s)', fontsize=18)
plt.show()
    
## Change back figure output format to 'png'
%config InlineBackend.figure_formats = ['png']

### Re-analyze cluster diffusivity using center of mass rather than averaged individual molecular diffusivity for the paper:

* Only clusters with a size larger than **'Minimum_size'** are considered for linking and tracking
* All possible tracks are first identified and stored in the 'Trac_cluster_...' variables
* The 'Trac_cluster_...' tracks are then processed by the function 'cluster_selection', which keeps only the track segments in which the change in node number between neighboring frames is smaller than or equal to **'del_cluster'**

In [None]:
## Pre-requisite variables:
# df, L, ORDER, N_CONNECT_TOTAL (first calculated in cell 8,9,10)

timemax = 0.77
Select_df = df.loc[df['time'] <= timemax] # frames up to timemax only

## Initiate important variables
Minimum_size = 10 # clusters must be larger than this to be tracked (mesoscale diffusivity analysis)
# All Link_cluster_* variables share one structure, and the same position always refers to the same link:
# list (one entry per pair of neighbouring frames) of list (one entry per link) of [previous-frame value, current-frame value]
Link_cluster_index = [] # ['PreFrame-Index_Cluster-Index-in-PreFrame','CurrentFrame-Index_Cluster-Index-in-CurrentFrame']
Link_cluster_time = []
Link_cluster_size = []
Link_cluster_x = []
Link_cluster_y = []
Link_cluster_z = []

df_copy = Select_df.copy() # work on a copy so that 'df' itself is never modified
reversed_df = df_copy.loc[::-1] # iterate from the last frame back to the first

## Generate the frame-to-frame linkage used by the subsequent track assembly
for index, row in tqdm(reversed_df.iterrows(), total=reversed_df.shape[0]):
    g = row['graph']
    Coord = g.vs['coordinate']
    n_nodes = g.vcount()
    # ORDER holds the rearranged node order of every frame back to back; slice out this frame (n_nodes entries)
    order = ORDER[index*n_nodes:(index+1)*n_nodes]
    # N_CONNECT_TOTAL holds the path-node-count matrices of every frame stacked row-wise; slice out this frame (n_nodes rows)
    n_connect_total = N_CONNECT_TOTAL[index*n_nodes:(index+1)*n_nodes]

    ## Recover the actual node indices of each cluster (Ni_index) from the block structure of the matrix
    ccs = g.clusters()
    N_cluster = len(ccs) # number of clusters in this frame
    Ni_index = [] # actual node indices per cluster (matrix positions translated through 'order')
    check_temp_cluster = 0 # matrix row at which the next cluster block is read
    # Unconnected pairs are marked either with NaN (if the matrix was already masked for plotting) or with 0
    unconnected_is_nan = np.any(np.isnan(n_connect_total))
    for j in range(0,N_cluster):
        block_row = n_connect_total[check_temp_cluster]
        if unconnected_is_nan:
            members = [i for i, e in enumerate(block_row) if not np.isnan(e)]
        else:
            members = [i for i, e in enumerate(block_row) if e != 0]
        Ni_index.append(order[members])
        check_temp_cluster = check_temp_cluster+len(members)

    # Plain comprehension instead of np.vectorize(len): vectorize turns a list of equally sized clusters
    # into a 2-D array and then calls len() on scalars (TypeError), and it rejects an empty list.
    Len_Ni_index = np.array([len(nodes) for nodes in Ni_index], dtype=int)
    large_clusters = [Ni_index[i] for i in np.where(Len_Ni_index > Minimum_size)[0]] # only track the larger clusters

    ## The first iteration (= last frame) only initialises the 'previous frame' state
    if index == reversed_df.shape[0]-1:
        temp_cluster_pre = large_clusters
        t_pre = reversed_df.time[index]
        Coord_pre = Coord
        continue

    temp_cluster = large_clusters
    t_now = reversed_df.time[index]

    ## Link every tracked cluster of the previous frame to the current cluster sharing the most nodes with it
    Link_cluster_index_temp = []
    Link_cluster_time_temp = []
    Link_cluster_size_temp = []
    Link_cluster_x_temp = []
    Link_cluster_y_temp = []
    Link_cluster_z_temp = []
    for j, j_node in enumerate(temp_cluster_pre):
        # The centre of mass of the previous-frame cluster must be taken from the previous frame's
        # coordinates (Coord_pre), because it is stored together with t_pre. Using the current frame's
        # Coord here paired positions from one time with the time stamp of another.
        com_pre = np.average([Coord_pre[int(i)] for i in j_node], axis=0)
        common_nodes = [len(np.intersect1d(i_node, j_node)) for i_node in temp_cluster]
        # Skip when the current frame has no tracked cluster at all, or none overlapping with this one
        if len(common_nodes) == 0 or max(common_nodes) == 0:
            continue
        max_node_index = common_nodes.index(max(common_nodes)) # index into temp_cluster (current frame)
        Link_index1 = '_'.join([f'{index+1}', f'{j}']) # frames are visited in reverse order, so the previous frame is index+1
        Link_index2 = '_'.join([f'{index}', f'{max_node_index}'])
        Link_cluster_index_temp.append([Link_index1,Link_index2])
        Link_cluster_time_temp.append([t_pre,t_now])
        Link_cluster_size_temp.append([len(j_node),len(temp_cluster[max_node_index])])

        com = np.average([Coord[int(i)] for i in temp_cluster[max_node_index]], axis=0)
        Link_cluster_x_temp.append([com_pre[0],com[0]])
        Link_cluster_y_temp.append([com_pre[1],com[1]])
        Link_cluster_z_temp.append([com_pre[2],com[2]])

    # Append this frame pair's links to the global variables (an empty list is appended when nothing linked)
    Link_cluster_index.append(Link_cluster_index_temp)
    Link_cluster_time.append(Link_cluster_time_temp)
    Link_cluster_size.append(Link_cluster_size_temp)
    Link_cluster_x.append(Link_cluster_x_temp)
    Link_cluster_y.append(Link_cluster_y_temp)
    Link_cluster_z.append(Link_cluster_z_temp)

    # The current frame becomes the 'previous frame' of the next comparison
    temp_cluster_pre = temp_cluster
    t_pre = t_now
    Coord_pre = Coord
        
# print(Link_cluster_index)
# print(Link_cluster_time)
# print(Link_cluster_size)
# print('-----------------------------------------')

## Re-arrange the frame-to-frame links into actual cluster tracks
# All Trac_cluster_* variables share one structure: [[one cluster at successive time points], different clusters],
# and the same position always refers to the same tracked cluster.
Trac_cluster_index = [] # link labels ('Frame_Cluster') along each track
Trac_cluster_size = [] # number of nodes within each tracked cluster
Trac_cluster_t = [] # time at which each track point was recorded
Trac_cluster_x = [] # centre of mass of each tracked cluster
Trac_cluster_y = []
Trac_cluster_z = []

# Parallel (track list, link list) pairs so that all six quantities are always updated together
track_and_link = [
    (Trac_cluster_index, Link_cluster_index),
    (Trac_cluster_size, Link_cluster_size),
    (Trac_cluster_t, Link_cluster_time),
    (Trac_cluster_x, Link_cluster_x),
    (Trac_cluster_y, Link_cluster_y),
    (Trac_cluster_z, Link_cluster_z),
]

# A single loop handles both seeding and extension: for the first frame pair no track exists yet, so every
# link starts a new track. Unlike indexing Link_cluster_index[0] directly, this also works when
# Link_cluster_index is empty (fewer than two frames selected).
for i in range(len(Link_cluster_index)):
    for j in range(len(Link_cluster_index[i])):
        link_start = Link_cluster_index[i][j][0]
        # Every track currently ending at the cluster this link starts from is continued by it
        matching_tracks = [k for k, track in enumerate(Trac_cluster_index) if track[-1] == link_start]
        if matching_tracks:
            for k in matching_tracks:
                for tracks, links in track_and_link:
                    tracks[k].append(links[i][j][1])
        else:
            # No existing track ends here: start a new one. '.copy()' keeps the track independent of the
            # Link_cluster_* entry, which would otherwise be modified by later appends.
            for tracks, links in track_and_link:
                tracks.append(links[i][j].copy())

# print('-----------------------------------------')
# print(Trac_cluster_index)
# print(Trac_cluster_size)
# print(Trac_cluster_t)

In [None]:
from matplotlib.colors import ListedColormap
## Calculate MSD for a particular known trajectory of cluster center of mass
def MSD_boundary_condition(trajectory_x, trajectory_y, trajectory_z, trajectory_t):
    """Time-averaged mean squared displacement (MSD) of a single 3-D trajectory.

    For every lag of k = 1 .. N-1 frames, all pairs of points that are k frames
    apart are used: the squared displacements are averaged to give MSD[k-1] and
    the (absolute) time differences are averaged to give tau[k-1].

    Parameters
    ----------
    trajectory_x, trajectory_y, trajectory_z : sequence of float
        Coordinates of the trajectory, one value per frame.
    trajectory_t : sequence of float
        Time stamp of every frame, same length as the coordinates. The order may
        be ascending or descending; tau is always returned non-negative.

    Returns
    -------
    list
        ``[MSD, tau]``, two numpy arrays of length ``len(trajectory_t) - 1``
        (both empty for trajectories with fewer than two points).

    Notes
    -----
    NOTE(review): despite the name, no periodic-boundary unwrapping is applied
    here; coordinates are used as given.
    """
    x = np.asarray(trajectory_x, dtype=float)
    y = np.asarray(trajectory_y, dtype=float)
    z = np.asarray(trajectory_z, dtype=float)
    t = np.asarray(trajectory_t, dtype=float)

    n_lags = max(len(t) - 1, 0)
    MSD = np.zeros(n_lags)
    tau = np.zeros(n_lags)
    for lag in range(1, n_lags + 1):
        tau[lag - 1] = np.abs(np.mean(t[lag:] - t[:-lag]))
        dx = x[lag:] - x[:-lag]
        dy = y[lag:] - y[:-lag]
        dz = z[lag:] - z[:-lag]
        # Mean of the SQUARED displacement. Averaging sqrt(dx^2+dy^2+dz^2) would give the mean
        # displacement instead, which is not what the MSD = 6*D*tau fit expects.
        MSD[lag - 1] = np.mean(dx*dx + dy*dy + dz*dz)
    return [MSD,tau]

## Self defined criteria of selecting partial cluster trajectory for MSD calculations
def cluster_selection(cluster_size, cluster_x, cluster_y, cluster_z, cluster_t, del_cluster):
    """Split one cluster track into segments without large frame-to-frame size jumps.

    The track is cut between two neighbouring points whenever the absolute change
    in cluster size between them exceeds ``del_cluster``. Every resulting piece is
    returned, including the one after the last cut (which used to be dropped).
    Pieces may be as short as a single point; callers are expected to filter by length.

    Parameters
    ----------
    cluster_size : sequence of int
        Number of nodes of the tracked cluster at each track point.
    cluster_x, cluster_y, cluster_z, cluster_t : sequence
        Centre-of-mass coordinates and time of each track point; same length as
        ``cluster_size`` and sliced identically.
    del_cluster : number
        Largest size change between neighbouring points allowed within a segment.

    Returns
    -------
    tuple of five lists
        ``(sizes, xs, ys, zs, ts)``; each is a list of segments and the same
        position in every list refers to the same segment. When no cut is needed
        each list holds the original, unsliced input as its only segment.
    """
    size_change = np.abs(np.diff(cluster_size))
    idx = np.where(size_change > del_cluster)[0] # a cut lies between positions k and k+1 for every k in idx

    if len(idx) == 0:
        # Nothing to cut: wrap the inputs as single-segment lists for uniform downstream processing
        return [cluster_size], [cluster_x], [cluster_y], [cluster_z], [cluster_t]

    # Segment i spans positions bounds[i]+1 .. bounds[i+1] (inclusive); the sentinels -1 and len-1
    # make the first and the last segment fall out of the same rule as the inner ones.
    bounds = [-1] + [int(k) for k in idx] + [len(cluster_size) - 1]
    series = (cluster_size, cluster_x, cluster_y, cluster_z, cluster_t)
    updates = tuple([] for _ in series)
    for start, stop in zip(bounds[:-1], bounds[1:]):
        for segments, values in zip(updates, series):
            segments.append(values[start+1:stop+1])
    return updates

# Calculate Deff for each tracked cluster segment and plot its trajectory, MSD fit and size history
L_trac_cutoff = 10 # minimum number of points in a segment; also the number of lags used for the fit
del_cluster = 20 # cluster size change between neighboring time points must not exceed 'del_cluster'
TOTAL_MSD = [] # MSD curve of every analysed segment
TOTAL_TAU = [] # matching lag times
Deff = [] # effective diffusivity of every analysed segment
Deff_cluster_size = [] # mean cluster size of every analysed segment
for i in range(len(Trac_cluster_size)):
    [cluster_size_update, cluster_x_update, cluster_y_update, cluster_z_update, cluster_t_update] = cluster_selection(Trac_cluster_size[i], Trac_cluster_x[i], Trac_cluster_y[i], Trac_cluster_z[i], Trac_cluster_t[i],del_cluster)
    for j in range(len(cluster_size_update)):
        if len(cluster_size_update[j]) < L_trac_cutoff:
            continue # segment too short for a meaningful fit
        x = cluster_x_update[j]
        y = cluster_y_update[j]
        z = cluster_z_update[j]
        clus_size = cluster_size_update[j]
        t = cluster_t_update[j]
        [MSD,tau] = MSD_boundary_condition(x,y,z,t)
        TOTAL_MSD.append(MSD)
        TOTAL_TAU.append(tau)

        # Linear fit over the first L_trac_cutoff lags, using MSD = 6*D*tau + offset
        coef = np.polyfit(tau[:L_trac_cutoff],MSD[:L_trac_cutoff],1)
        poly1d_fn = np.poly1d(coef)
        MSD_fit = [i for i in poly1d_fn(tau)]
        Deff.append(coef[0]/6)
        Deff_cluster_size.append(np.mean(clus_size))

        fig = plt.figure(figsize=(15,10))

        # Panel 1: 3-D centre-of-mass trajectory, coloured by cluster size
        ax1 = fig.add_subplot(221, projection='3d')
        scatter1 = ax1.scatter(x, y, z, c=clus_size, marker='o', cmap='viridis')
        cbar1 = plt.colorbar(scatter1)
        cbar1.set_label('Cluster Size')

        # Panel 2: x-y projection, coloured by time. t is a plain list, so convert it explicitly
        # before doing arithmetic instead of relying on numpy-scalar reflected operators.
        ax2 = fig.add_subplot(222)
        t_arr = np.asarray(t, dtype=float)
        colormap = ListedColormap(plt.cm.viridis(1 - (t_arr-t_arr.min())/(t_arr.max()-t_arr.min())))
        scatter2 = ax2.scatter(x, y, c=t, cmap=colormap, marker='.', s=100)
        cbar2 = plt.colorbar(scatter2)
        cbar2.set_label('Time')

        # Panel 3: MSD curve with the linear fit
        ax3 = fig.add_subplot(223)
        ax3.plot(tau, MSD, marker='o')
        ax3.plot(tau,MSD_fit,'--r')

        # Panel 4: cluster size over time
        ax4 = fig.add_subplot(224)
        ax4.plot(t, clus_size, marker='o')

        # Axis labels (raw strings: '\m' is not a valid escape sequence in a normal string literal)
        ax1.set_xlabel(r'Box X ($\mu$m)')
        ax1.set_ylabel(r'Box Y ($\mu$m)')
        ax1.set_zlabel(r'Box Z ($\mu$m)')

        ax2.set_xlabel(r'Box X ($\mu$m)')
        ax2.set_ylabel(r'Box Y ($\mu$m)')

        ax3.set_xlabel(r'$\tau$ (s)')
        ax3.set_ylabel(r'MSD ($\mu$m$^2$)')

        ax4.set_xlabel('time (s)')
        ax4.set_ylabel('Cluster Size')

        plt.tight_layout()
        plt.show()

# Summary: effective diffusivity of every analysed segment against its mean cluster size
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(Deff_cluster_size, Deff)
ax.set_xlabel('Cluster size')
ax.set_ylabel(r'$D_{eff} $ $\mu$m$^2/s$')
plt.show()

    
    
# ## Below is to determine what's proper values for cluster size change between neighboring frames by changing bins value
# dClust = []
# for size in Trac_cluster_size:
#     dClust.append(np.diff(size))
    
# dClust_combined = np.abs(np.concatenate(dClust))
# bin_width = 10
# num_bins = int((dClust_combined.max() - dClust_combined.min()) / bin_width)
# fig = plt.figure()
# hist, bin_edges, _ = plt.hist(dClust_combined, bins=num_bins) #bins = 10-20 could cover most size changes between neighboring frames
# print("Histogram Values (Frequencies):", hist)
# plt.xlabel('Cluster size change between neighboring frame')
# plt.show()

In [None]:
# Save the results and the analysis parameters to a CSV file, one row per analysed track segment:
# Deff, Deff_cluster_size, Filename, Minimum_size, del_cluster, L_trac_cutoff

output = {'Deff': Deff, 'Deff_cluster_size': Deff_cluster_size}
df_output = pd.DataFrame(output)
# The scalar parameters are broadcast to every row so that each row is self-describing
df_output['filename'] = Filename
df_output['Minimum_cluster_size'] = Minimum_size
df_output['Maximum_cluster_rate_change'] = del_cluster
df_output['Minimum_track_length'] = L_trac_cutoff

# Write next to the input data: path_true is '<data root>/<Filename>/<Filename>-NL', so its parent
# directory is the per-simulation folder. Deriving it avoids a second hard-coded absolute path
# that has to be kept in sync with path_true.
Output_folder_path = os.path.dirname(path_true)
csv_filename = os.path.join(Output_folder_path, f"Output_Mesoscale_Diffusivity_{Filename}.csv")
df_output.to_csv(csv_filename, index=False)