# Email network creation and export to Excel

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite

In [2]:
# Load the edge list as a two-column table of whitespace-separated node ids.
# Column names are supplied here, so the first line of the file is read as data.
file = '../data/initial/email-edges.txt'
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
edge_list = pd.read_table(file, sep=r'\s+', names=('Senders', 'Recievers'))

This data set was picked at random and is taken from: Guimera, R, Danon, L, Diaz-Guilera, A, Giralt, F, Arenas, A.
Phys. Rev. E 68, art. no. 065103 (2003). It is used here to illustrate a simple network analysis that retrieves general measures from the network.

In [3]:
# Preview the first five sender/receiver pairs (five is head()'s default).
edge_list.head()

Unnamed: 0,Senders,Recievers
0,1,2
1,1,3
2,1,4
3,1,5
4,1,6


In [4]:
# Number of distinct values in the receivers column (missing values, if any, still counted).
edge_list['Recievers'].nunique(dropna=False)

1133

In [13]:
# Total number of rows in the edge list (one row per sender/receiver entry).
edge_list['Recievers'].size

10903

# Creation of the Graph

In [7]:
# Build the graph straight from the edge-list file. nx.Graph (the default class,
# spelled out here) is undirected, so a pair listed in both directions is one edge.
# NOTE(review): no nodetype is given, so node labels presumably stay as the strings
# read from the file, while edge_list holds pandas-parsed values -- confirm before
# joining results from G with edge_list.
G = nx.read_edgelist(file, create_using=nx.Graph)

In [9]:
# How many nodes ended up in G.
G.number_of_nodes()

1133

In [10]:
# How many edges ended up in G.
G.number_of_edges()

5452

The library we are using (networkx) builds an undirected graph here, so each pair of nodes is stored only once: if the edge for 'Alice sends a message to Bob' already exists, the entry 'Bob sends a message to Alice' does not add a second edge. That explains the difference between the length of the edge list (edge_list) and the number of edges in the Graph (G).

We can also build the graph directly from edge_list to confirm this.

In [26]:
# Creating the Graph2: the same network, built from the pandas edge list
# instead of the raw file, as a cross-check against G.
G2 = nx.Graph()
G2.add_nodes_from(edge_list['Senders'].unique())
# Walk the two columns in lockstep instead of iterrows(), which materialises a
# full Series for every row only to read two scalars from it.
G2.add_edges_from(zip(edge_list['Recievers'], edge_list['Senders']))

In [29]:
# Node count of the graph rebuilt from edge_list.
G2.number_of_nodes()

1133

In [30]:
# Edge count of the graph rebuilt from edge_list.
G2.number_of_edges()

5452

# General Network Stats

In [33]:
# Whole-graph summary measures for G, each bound to its own name and then printed
# as one report.
Number_of_Nodes = G.number_of_nodes()
Number_of_Edges = G.number_of_edges()
Connected_Network = nx.is_connected(G)
Separated_Components = nx.number_connected_components(G)
# nx.density applies the undirected formula 2m / (n * (n - 1)) used previously.
Density = nx.density(G)
Assortativity = nx.degree_assortativity_coefficient(G)
Clustering = nx.average_clustering(G)
# nx.diameter raises on a disconnected graph; report None in that case so the
# rest of the summary is still printed.
Diameter = nx.diameter(G) if Connected_Network else None

# NOTE(review): the report title mentions a "PROYECTED SSO NETWORK" while this
# notebook analyses an email network -- it looks carried over from elsewhere; confirm.
summary_lines = [
    "PROYECTED SSO NETWORK",
    "Connected Network: " + str(Connected_Network),
    "Number of Nodes: " + str(Number_of_Nodes),
    "Number of Edges: " + str(Number_of_Edges),
    "Density: " + str(Density),
    "Separated Components: " + str(Separated_Components),
    "Assortativity: " + str(Assortativity),
    "Average Clustering Coefficient: " + str(Clustering),
    "Diameter : " + str(Diameter),
]
# The trailing newline keeps the blank line the report has always ended with.
print("\n".join(summary_lines) + "\n")

PROYECTED SSO NETWORK
Connected Network: True
Number of Nodes: 1133
Number of Edges: 5452
Density: 0.0085017730219967
Separated Components: 1
Assortativity: 0.0782987692548041
Average Clustering Coefficient: 0.2201760865041161
Diameter : 8



# Stats per node

In [40]:
# Stats 1: Degree List -- For Excel:  df_degrees
# (node, degree) pairs for every node of G, indexed by node label.
degrees = list(G.degree())
df_degrees = pd.DataFrame(degrees, columns=["Senders", "Degree"]).set_index("Senders")

In [43]:
# Stats 2: Neighbour Degree -- For Excel:  df_nei_degrees
# Average degree of each node's neighbours, indexed by node label.
df_nei_degrees = pd.DataFrame(
    list(nx.average_neighbor_degree(G).items()),
    columns=["Senders", "Neighbour Degree"],
).set_index("Senders")

In [44]:
# Stats 3: Clustering -- For Excel:  df_cluster_node
# Per-node clustering coefficient, indexed by node label.
cluster_node = nx.clustering(G)
df_cluster_node = pd.DataFrame(
    list(cluster_node.items()),
    columns=["Senders", "Clustering"],
).set_index("Senders")

In [45]:
# Stats 4: Eccentricity -- For Excel:  dist
# Per-node eccentricity, indexed by node label.
Eccentricity_G = nx.eccentricity(G)
dist = pd.DataFrame(
    list(Eccentricity_G.items()),
    columns=['Senders', 'Eccentricity'],
).set_index('Senders')

In [46]:
# Stats 5.1: Degree Centrality -- For Excel:  df_degree_centrality
# Per-node degree centrality, indexed by node label.
degree_centrality = nx.degree_centrality(G)
df_degree_centrality = pd.DataFrame(
    list(degree_centrality.items()),
    columns=["Senders", "Degree Centrality"],
).set_index("Senders")

In [47]:
# Stats 5.2: Closeness Centrality  -- For Excel:  df_closeness_centrality
# Per-node closeness centrality, indexed by node label.
closeness_centrality = nx.closeness_centrality(G)
df_closeness_centrality = pd.DataFrame(
    list(closeness_centrality.items()),
    columns=["Senders", "Closeness"],
).set_index("Senders")

In [48]:
# Stats 5.3: Betweenness Centrality  -- For Excel:  df_betweeness_centrality
# Per-node shortest-path betweenness centrality, indexed by node label.
betweeness_centrality = nx.betweenness_centrality(G)
df_betweeness_centrality = pd.DataFrame(
    list(betweeness_centrality.items()),
    columns=["Senders", "Betweeness"],
).set_index("Senders")

In [49]:
# Stats 5.4: Random Walk Centrality  -- For Excel:  df_rw_centrality
# Computed with networkx's current-flow betweenness centrality, indexed by node label.
rw_centrality = nx.current_flow_betweenness_centrality(G)
df_rw_centrality = pd.DataFrame(
    list(rw_centrality.items()),
    columns=["Senders", "RandomWalk"],
).set_index("Senders")

In [50]:
# Final Excel Creation
# Place every per-node table side by side; the rows are aligned on the shared
# "Senders" index, giving one column per statistic.
per_node_tables = [
    df_degrees,
    df_nei_degrees,
    df_cluster_node,
    dist,
    df_degree_centrality,
    df_closeness_centrality,
    df_betweeness_centrality,
    df_rw_centrality,
]
df_full_excel = pd.concat(per_node_tables, axis="columns")

In [51]:
# Write the combined per-node statistics to an Excel workbook.
# The destination gets its own name so that `file` keeps pointing at the input
# edge list; re-running the graph-loading cell afterwards still reads the data.
output_file = '../data/output/Email_Network_Stats.xlsx'
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# is no longer available in current pandas releases.
with pd.ExcelWriter(output_file) as writer:
    df_full_excel.to_excel(writer)