In [None]:
'''
Importing Packages
'''
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import networkx as nx

In [None]:
# Dataset: download nodes.csv and edges.csv from Kaggle (https://www.kaggle.com/datasets/mathurinache/twitter-edge-nodes) before running the cells below.

In [None]:
import pandas as pd

# Read the node list. header=None makes pandas treat the first line as data,
# and the column is labelled 'id'.
nodes_df = pd.read_csv(
    '/content/drive/MyDrive/sna-dataset/nodes.csv',
    header=None,
    names=['id'],
)

# Re-key the frame by node id, then export it as {id: {column: value, ...}}.
# NOTE(review): 'id' is the only named column, so the per-node dicts are
# presumably empty -- confirm this is the intended shape.
nodes_by_id = nodes_df.set_index('id')
nodes_dict = nodes_by_id.to_dict(orient='index')



In [None]:
import pandas as pd
import networkx as nx

# Read the edge list. header=None makes pandas treat the first line as data,
# and the two columns are labelled 'source' and 'target'.
edges_df = pd.read_csv(
    '/content/drive/MyDrive/sna-dataset/edges.csv',
    header=None,
    names=['source', 'target'],
)




In [None]:
# Checking for missing values and duplicates.
# A notebook cell only displays its LAST expression, so both checks are
# printed explicitly; otherwise the missing-value counts are silently dropped.
missing_per_column = edges_df.isna().sum()
duplicate_rows = edges_df.duplicated().sum()

print("Missing values per column:")
print(missing_per_column)
print(f"Duplicated rows: {duplicate_rows}")

0

In [None]:
# Stream the edge list in fixed-size chunks and accumulate every edge into a
# single undirected graph.
chunk_size = 1000000
chunks = pd.read_csv('/content/drive/MyDrive/sna-dataset/edges.csv', header=None, names=['source', 'target'], chunksize=chunk_size)

G = nx.Graph()
for chunk in chunks:
    # Do NOT use nx.from_pandas_edgelist(chunk, ..., create_using=G) here:
    # when `create_using` is an existing graph instance, networkx clears it
    # before populating it, so each chunk would erase all previous chunks and
    # only the final chunk would remain in G. Adding edges in place keeps
    # everything that has already been loaded.
    G.add_edges_from(zip(chunk['source'], chunk['target']))


In [None]:
# Report the size of the graph that was just assembled.
node_count = G.number_of_nodes()
edge_count = G.number_of_edges()
print("Number of nodes:", node_count)
print("Number of edges:", edge_count)


Number of nodes: 318282
Number of edges: 331845


In [None]:
import community.community_louvain as cl
# Louvain community detection: `partition` maps each node to a community id.
# The algorithm visits nodes in a randomized order, so without a fixed
# random_state the community ids (and even their number) change between runs,
# which would invalidate the community ids quoted in later cells.
partition = cl.best_partition(G, random_state=0)

# Print the number of communities
num_communities = len(set(partition.values()))
print(f"Number of communities: {num_communities}")


Number of communities: 851


In [None]:
from collections import Counter


# Tally how many nodes were assigned to each community id.
community_sizes = Counter()
community_sizes.update(partition.values())

# Report every community together with its node count.
for label, member_count in community_sizes.items():
    print(f"Community {label}: {member_count} nodes")


Community 0: 1357 nodes
Community 1: 1318 nodes
Community 2: 385 nodes
Community 3: 5855 nodes
Community 4: 1033 nodes
Community 5: 566 nodes
Community 6: 1809 nodes
Community 7: 1635 nodes
Community 8: 2437 nodes
Community 9: 344 nodes
Community 10: 10818 nodes
Community 11: 9227 nodes
Community 12: 1450 nodes
Community 13: 215 nodes
Community 14: 4175 nodes
Community 15: 1600 nodes
Community 16: 2140 nodes
Community 17: 928 nodes
Community 18: 2378 nodes
Community 19: 609 nodes
Community 20: 2670 nodes
Community 21: 24 nodes
Community 22: 2538 nodes
Community 23: 1027 nodes
Community 24: 3877 nodes
Community 25: 895 nodes
Community 26: 873 nodes
Community 27: 2040 nodes
Community 28: 518 nodes
Community 29: 1619 nodes
Community 30: 2440 nodes
Community 31: 415 nodes
Community 32: 3413 nodes
Community 33: 1149 nodes
Community 34: 1122 nodes
Community 35: 1061 nodes
Community 36: 1650 nodes
Community 37: 57636 nodes
Community 38: 4244 nodes
Community 39: 650 nodes
Community 40: 3961 no

In [None]:
# Rank (community id, size) pairs from the largest community to the smallest.
sorted_communities = sorted(community_sizes.items(), key=lambda pair: pair[1], reverse=True)

# Show the ten largest communities; slicing copes with fewer than ten
# communities without an explicit bounds check.
print("Top 10 communities with highest number of nodes:")
for label, member_count in sorted_communities[:10]:
    print(f"Community {label} with {member_count} nodes")



Top 10 communities with highest number of nodes:
Community 37 with 57636 nodes
Community 10 with 10818 nodes
Community 11 with 9227 nodes
Community 71 with 9085 nodes
Community 85 with 6024 nodes
Community 3 with 5855 nodes
Community 59 with 5376 nodes
Community 64 with 4522 nodes
Community 38 with 4244 nodes
Community 14 with 4175 nodes


In [None]:
# Compute density of edges within each community.
# Group the nodes by community in ONE pass over the graph instead of
# rescanning every node once per community.
nodes_by_community = {}
for node in G.nodes():
    nodes_by_community.setdefault(partition[node], []).append(node)

community_densities = []
for community_id in sorted(nodes_by_community):
    subgraph = G.subgraph(nodes_by_community[community_id])
    community_densities.append((community_id, nx.density(subgraph)))

# Sort the communities by density in descending order
sorted_densities = sorted(community_densities, key=lambda x: x[1], reverse=True)

# Print the top 10 communities with highest density.
# Slicing avoids an IndexError when fewer than 10 communities exist.
# NOTE(review): a two-node community joined by one edge has density 1.0, so
# this ranking is likely dominated by tiny communities -- consider filtering
# by a minimum community size.
print("Top 10 communities with highest edge density:")
for community_id, community_density in sorted_densities[:10]:
    print(f"Community {community_id}: density {community_density}")



Top 10 communities with highest edge density:
Community 58: density 1.0
Community 65: density 1.0
Community 75: density 1.0
Community 119: density 1.0
Community 130: density 1.0
Community 174: density 1.0
Community 180: density 1.0
Community 193: density 1.0
Community 196: density 1.0
Community 198: density 1.0


In [None]:
# Compute degree centrality for each node in each community.
# Group the nodes by community in ONE pass over the graph instead of
# rescanning every node once per community.
nodes_by_community = {}
for node in G.nodes():
    nodes_by_community.setdefault(partition[node], []).append(node)

# community id -> {node: degree centrality within that community's subgraph}
community_degree_centrality = {}
for community_id in sorted(nodes_by_community):
    subgraph = G.subgraph(nodes_by_community[community_id])
    degree_centrality = nx.degree_centrality(subgraph)
    community_degree_centrality[community_id] = degree_centrality

    # Printing the full per-node dict for every community floods the notebook
    # ("IOPub data rate exceeded"). Keep the full result in
    # community_degree_centrality and print a one-line summary instead.
    top_node = max(degree_centrality, key=degree_centrality.get)
    print(
        f"Community {community_id} degree centrality: "
        f"{len(degree_centrality)} nodes, "
        f"max {degree_centrality[top_node]:.4f} (node {top_node})"
    )

Community 0 degree centrality: {8470528: 0.0007374631268436578, 8470529: 0.0007374631268436578, 8470530: 0.0007374631268436578, 8470531: 0.0007374631268436578, 8470532: 0.0007374631268436578, 8470533: 0.0007374631268436578, 8470534: 0.0007374631268436578, 8470535: 0.0007374631268436578, 8470536: 0.0007374631268436578, 8470537: 0.0007374631268436578, 8470538: 0.0007374631268436578, 8470539: 0.0007374631268436578, 8470540: 0.0007374631268436578, 8470541: 0.0007374631268436578, 8470542: 0.0007374631268436578, 8470543: 0.0007374631268436578, 8470544: 0.0007374631268436578, 8470545: 0.0007374631268436578, 8470546: 0.0007374631268436578, 8470547: 0.0007374631268436578, 8470548: 0.0007374631268436578, 8470549: 0.0007374631268436578, 8470550: 0.0007374631268436578, 8470551: 0.0007374631268436578, 8470552: 0.0007374631268436578, 8470553: 0.0007374631268436578, 8470554: 0.0007374631268436578, 8470555: 0.0007374631268436578, 8470556: 0.0007374631268436578, 8470557: 0.0007374631268436578, 8470558:

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Community 41 degree centrality: {8708096: 0.0003937007874015748, 8708097: 0.0003937007874015748, 8708098: 0.0003937007874015748, 8708099: 0.0003937007874015748, 8708100: 0.0003937007874015748, 8708101: 0.0003937007874015748, 8708102: 0.0003937007874015748, 8708103: 0.0003937007874015748, 8708104: 0.0003937007874015748, 8708105: 0.0003937007874015748, 8708106: 0.0003937007874015748, 8708107: 0.0003937007874015748, 8708108: 0.0003937007874015748, 8708109: 0.0003937007874015748, 8708110: 0.0003937007874015748, 8708111: 0.0003937007874015748, 8708112: 0.0003937007874015748, 8708113: 0.0003937007874015748, 8708114: 0.0003937007874015748, 8708115: 0.0003937007874015748, 8708116: 0.0003937007874015748, 8708117: 0.0003937007874015748, 8708118: 0.0003937007874015748, 8708119: 0.0003937007874015748, 8708120: 0.0003937007874015748, 8708121: 0.0003937007874015748, 8708122: 0.0003937007874015748, 8708123: 0.0003937007874015748, 8708124: 0.0003937007874015748, 8708125: 0.0003937007874015748, 8708126

In [None]:
# Print the top 10 nodes of each community ranked by degree centrality, i.e.
# by the fraction of the other nodes in the community subgraph that a node is
# connected to.
# Group the nodes by community in ONE pass over the graph instead of
# rescanning every node once per community.
nodes_by_community = {}
for node in G.nodes():
    nodes_by_community.setdefault(partition[node], []).append(node)

for community_id in sorted(nodes_by_community):
    subgraph = G.subgraph(nodes_by_community[community_id])
    degree_centrality = nx.degree_centrality(subgraph)
    # Highest centrality first; communities with fewer than 10 nodes simply
    # yield a shorter list.
    sorted_dc = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)
    top_nodes = [n for n, dc in sorted_dc[:10]]
    print(f"Community {community_id} top nodes by degree centrality: {top_nodes}")

Community 0 top nodes by degree centrality: [72169, 6963, 451391, 118242, 1552915, 451148, 3238427, 8446384, 8717374, 490564]
Community 1 top nodes by degree centrality: [46396, 1025967, 3318817, 1638607, 8620405, 8582005, 8582012, 8582260, 8544285, 8520040]
Community 2 top nodes by degree centrality: [7279677, 8632953, 8461023, 8700995, 8700996, 8700997, 8700998, 8700999, 8701000, 8701001]
Community 3 top nodes by degree centrality: [4573533, 58200, 47350, 284688, 132075, 219703, 4114793, 48595, 9979, 6734436]
Community 4 top nodes by degree centrality: [195167, 1938572, 3322846, 2474654, 6896293, 7128218, 6760530, 8567635, 8538114, 8579222]
Community 5 top nodes by degree centrality: [1016556, 588443, 2672782, 5458219, 291480, 250337, 2486154, 1850536, 8597873, 8597880]
Community 6 top nodes by degree centrality: [47604, 444613, 1223264, 1573841, 3269447, 1778108, 4400861, 1845091, 1148660, 791386]
Community 7 top nodes by degree centrality: [445062, 5046790, 8577482, 8541007, 854193

In [None]:
# Generator over the node sets of the connected components (not consumed in
# this cell).
components = nx.connected_components(G)

# How many separate connected pieces does the graph have?
num_components = nx.number_connected_components(G)

if num_components <= 1:
    print("The graph is well connected.")
else:
    print("The graph is not well connected.")
    print(f"There are {num_components} connected components.")

The graph is not well connected.
There are 653 connected components.


In [None]:
# Approximate betweenness centrality from k=10 sampled nodes, counting path
# endpoints, with a fixed seed so the sample is repeatable.
# NOTE(review): k=10 is a very small sample for a graph of this size, so the
# scores are presumably coarse estimates -- confirm whether a larger k is
# affordable.
betweenness = nx.betweenness_centrality(G, k=10, endpoints=True, seed=0)

# Print the nodes with the highest betweenness centrality
ranked_betweenness = sorted(betweenness.items(), key=lambda item: item[1], reverse=True)
for node, score in ranked_betweenness[:20]:
    print(f"Node {node}: Betweenness centrality = {score}")

Node 6623784: Betweenness centrality = 0.2771342579559482
Node 3403: Betweenness centrality = 0.13183415928323722
Node 25442: Betweenness centrality = 0.11263819648855784
Node 3402: Betweenness centrality = 0.11125294836371849
Node 119446: Betweenness centrality = 0.1080927124764905
Node 2078325: Betweenness centrality = 0.10750600143327188
Node 8646807: Betweenness centrality = 0.1058372816284181
Node 3051028: Betweenness centrality = 0.10151373735418973
Node 6935659: Betweenness centrality = 0.09845977000495991
Node 4014640: Betweenness centrality = 0.09845131587344974
Node 4410045: Betweenness centrality = 0.09819895557136347
Node 8467088: Betweenness centrality = 0.09818526826317912
Node 8581044: Betweenness centrality = 0.0981503765540513
Node 8604205: Betweenness centrality = 0.0981503765540513
Node 8632720: Betweenness centrality = 0.0981503765540513
Node 8646926: Betweenness centrality = 0.0981503765540513
Node 8657258: Betweenness centrality = 0.0981503765540513
Node 8665449: 

In [None]:
# Save the graph in GraphML format in the current working directory.
graphml_path = "my_graph.graphml"
nx.write_graphml(G, graphml_path)