In [1]:
import pandas as pd

# Location of the edge-list file (two tab-separated columns of node ids)
file_path = 'Cit-HepPh.txt'

# Load the edges. Lines beginning with '#' are treated as comments and dropped;
# the file carries no header row, so the column names are supplied here.
df = pd.read_csv(
    file_path,
    sep='\t',
    comment='#',
    header=None,
    names=['source', 'target'],
)

# Show the loaded edge list (pandas abbreviates long frames when printed)
print(df)


         source   target
0       9907233  9301253
1       9907233  9504304
2       9907233  9505235
3       9907233  9506257
4       9907233  9606402
...         ...      ...
421573  9912461  9305269
421574  9912461  9609445
421575  9912461  9902379
421576  9912461  9902403
421577  9405397  9303299

[421578 rows x 2 columns]


In [2]:
# Dimensions of the edge list as (rows, columns); one row per source/target pair
n_rows, n_cols = df.shape
(n_rows, n_cols)

(421578, 2)

In [3]:
# Every node id that appears at either end of an edge, de-duplicated
edge_endpoints = pd.concat([df['source'], df['target']])
unique_nodes = edge_endpoints.unique()
num_unique_nodes = len(unique_nodes)

# Each row of the edge list is one edge
num_edges = len(df)

print("Number of unique nodes:", num_unique_nodes)
print("Number of edges:", num_edges)

Number of unique nodes: 34546
Number of edges: 421578


In [4]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a directed graph with one edge per row, pointing from 'source' to 'target'
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    create_using=nx.DiGraph(),
)

In [12]:
# Degree centrality for every node, as a {node: value} mapping
centrality = nx.degree_centrality(G)

# Mean of the per-node values (note: the printed figure is this average,
# not the centrality of any single node)
centrality_total = sum(centrality.values())
avg_centrality = centrality_total / len(centrality)

print("Degree Centrality:", avg_centrality)

Degree Centrality: 0.000706520827862103


In [13]:
# Density of the directed graph as reported by networkx
density = nx.density(G)
print(f"Graph Density: {density}")

Graph Density: 0.00035326041393102855


In [17]:
# Nodes ordered from highest to lowest degree centrality.
# The sort is stable, so nodes with equal centrality keep their original relative order.
centrality_descending = sorted(
    centrality.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
most_connected_nodes = [node_id for node_id, _ in centrality_descending]

# Show the five best-connected nodes
print("Most Connected Nodes:", most_connected_nodes[:5])

Most Connected Nodes: [9803315, 9512380, 9804398, 9407339, 9606399]


In [19]:
# Rank every node by its position in the centrality ordering (1 = most connected).
# Ranks are positional, so nodes with equal centrality still receive distinct ranks.
node_ranks = {node: rank for rank, node in enumerate(most_connected_nodes, start=1)}

# Report how many nodes were ranked instead of dumping the whole mapping:
# printing the dict itself would emit one entry per ranked node.
print("Node Ranks:", len(node_ranks))

Node Ranks: 34546


In [27]:
print("Sorted Node Ranks :")

# Order the (node, rank) pairs by rank, largest rank number first.
# Because rank 1 is the most connected node, this puts the least connected nodes at the front.
sorted_node_ranks = list(node_ranks.items())
sorted_node_ranks.sort(key=lambda pair: pair[1], reverse=True)

# Show the first five entries of that ordering
first_five = sorted_node_ranks[:5]
for node, rank in first_five:
    print(f"Node {node}: Rank {rank}")

Sorted Node Ranks :
Node 9405397: Rank 34546
Node 9309318: Rank 34545
Node 111372: Rank 34544
Node 110373: Rank 34543
Node 9405415: Rank 34542


In [29]:
# Mean degree: sum of every node's degree divided by the number of nodes
degree_total = sum(node_degree for _, node_degree in G.degree())
average_degree = degree_total / G.number_of_nodes()
print("Average Degree:", average_degree)

Average Degree: 24.406761998494762


In [37]:
def _top_degrees(degree_by_node, limit=10):
    """Return the `limit` highest-degree entries as a dict ordered from highest to lowest."""
    ranked = sorted(degree_by_node.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:limit])


# Per-node in-degree and out-degree, as {node: degree} mappings
in_degree = dict(G.in_degree())
out_degree = dict(G.out_degree())

# Keep only the ten nodes with the highest in-degree and the ten with the highest out-degree
sorted_in_degree = _top_degrees(in_degree)
sorted_out_degree = _top_degrees(out_degree)


In [38]:
# Print both top-degree tables with one loop; each entry is (heading, label, table).
# The second heading keeps its leading newline so a blank line separates the two tables.
degree_tables = (
    ("Sorted In-Degree:", "In-Degree", sorted_in_degree),
    ("\nSorted Out-Degree:", "Out-Degree", sorted_out_degree),
)
for heading, label, table in degree_tables:
    print(heading)
    for node, degree in table.items():
        print(f"Node {node}: {label} {degree}")

Sorted In-Degree:
Node 9803315: In-Degree 846
Node 9804398: In-Degree 616
Node 9407339: In-Degree 557
Node 9512380: In-Degree 550
Node 9606399: In-Degree 542
Node 9807344: In-Degree 503
Node 9306320: In-Degree 464
Node 9905221: In-Degree 449
Node 9408384: In-Degree 444
Node 9507378: In-Degree 438

Sorted Out-Degree:
Node 201071: Out-Degree 411
Node 101336: Out-Degree 376
Node 3154: Out-Degree 322
Node 5025: Out-Degree 282
Node 209244: Out-Degree 278
Node 207108: Out-Degree 278
Node 208209: Out-Degree 263
Node 202122: Out-Degree 255
Node 9712301: Out-Degree 242
Node 204031: Out-Degree 229
