In [2]:
!pip install graphframes

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [3]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
import pandas as pd
import json

In [4]:
spark = SparkSession.builder \
    .appName("AGM Community Detection") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12") \
    .getOrCreate()

In [5]:
edges_path = "/content/drive/MyDrive/git_web_ml/musae_git_edges.csv"
targets_path = "/content/drive/MyDrive/git_web_ml/musae_git_target.csv"
features_path = "/content/drive/MyDrive/git_web_ml/musae_git_features.json"

In [6]:
edges_df = spark.read.csv(edges_path, header=True)
edges_df = edges_df.withColumnRenamed("id_1", "src").withColumnRenamed("id_2", "dst")

In [7]:
nodes_df = spark.read.csv(targets_path, header=True).select("id", "name")

In [8]:
with open(features_path, 'r') as f:
    features = json.load(f)

In [9]:
features_df = pd.DataFrame(list(features.items()), columns=['id', 'features'])

In [10]:
graph = GraphFrame(nodes_df, edges_df)



In [11]:
communities = graph.labelPropagation(maxIter=10)



In [12]:
community_assignments = communities.rdd.map(lambda row: (row.id, row.label)).collectAsMap()

In [13]:
edges_rdd = edges_df.rdd.map(lambda row: (row.src, row.dst))

In [15]:
def calculate_modularity(edges_rdd, community_assignments):
    m = edges_rdd.count()
    communities = set(community_assignments.values())
    modularity = 0.0

    for community in communities:
        nodes_in_community = {node for node, label in community_assignments.items() if label == community}
        internal_edges = edges_rdd.filter(
            lambda edge: edge[0] in nodes_in_community and edge[1] in nodes_in_community
        ).count()
        degree_sum = edges_rdd.filter(
            lambda edge: edge[0] in nodes_in_community or edge[1] in nodes_in_community
        ).count()
        modularity += (internal_edges / m) - (degree_sum / (2 * m))**2

    return modularity

In [16]:
modularity_score = calculate_modularity(edges_rdd, community_assignments)

print(f"Detected Communities: {communities.show(truncate=False)}")
print(f"Modularity Score: {modularity_score}")

+---+-----------------+------------+
|id |name             |label       |
+---+-----------------+------------+
|0  |Eiryyy           |833223655499|
|1  |shawflying       |833223655499|
|2  |JpMCarrilho      |833223655499|
|3  |SuhwanCha        |833223655499|
|4  |sunilangadi2     |833223655499|
|5  |j6montoya        |833223655499|
|6  |sfate            |833223655499|
|7  |amituuush        |833223655499|
|8  |mauroherlein     |833223655499|
|9  |ParadoxZero      |833223655499|
|10 |llazzaro         |833223655499|
|11 |beeva-manueldepaz|833223655499|
|12 |damianmuti       |833223655499|
|13 |apobbati         |833223655499|
|14 |hwlv             |833223655499|
|15 |haroldoramirez   |833223655499|
|16 |jasonblanchard   |833223655499|
|17 |BahiHussein      |833223655499|
|18 |itsmevanessi     |833223655499|
|19 |nwjsmith         |833223655499|
+---+-----------------+------------+
only showing top 20 rows

Detected Communities: None
Modularity Score: 0.7466594985986716
