In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.graphx import Graph, VertexRDD, Edge

# Initialize Spark session
conf = SparkConf().setAppName("GitHubCommunityDetection").setMaster("local[*]")
sc = SparkContext(conf=conf)

spark = SparkSession(sc)

# Load the dataset
edges_path = "musae_git_edges.csv"
vertices_path = "musae_git_target.csv"

# Load edges (source, destination)
edges_df = spark.read.csv(edges_path, header=True)
edges_rdd = edges_df.rdd.map(lambda row: (int(row['source']), int(row['target'])))

# Load vertices (id, attributes)
vertices_df = spark.read.csv(vertices_path, header=True)
vertices_rdd = vertices_df.rdd.map(lambda row: (int(row['id']), {"name": row['name'], "ml_target": int(row['ml_target'])}))

# Create Graph
edges = edges_rdd.map(lambda edge: Edge(edge[0], edge[1]))
vertices = vertices_rdd.map(lambda vertex: (vertex[0], vertex[1]))
graph = Graph(vertices, edges)

# AGM Model for Community Detection
# Placeholder for Affiliation Graph Model (AGM) implementation.
# As GraphX doesn't have direct AGM support, you may need to use an alternative method.
# For simplicity, here we'll use Label Propagation for community detection.

# Detect communities using Label Propagation
def label_propagation(graph):
    return graph.labelPropagation(maxSteps=5)

communities = label_propagation(graph)

# Evaluate Modularity
def calculate_modularity(graph, communities):
    # Modularity calculation logic here
    # Placeholder for modularity implementation
    pass

modularity = calculate_modularity(graph, communities)

# Output Results
print("Detected Communities:")
for community in communities.collect():
    print(community)

print(f"Modularity Score: {modularity}")

# Stop Spark session
sc.stop()


ModuleNotFoundError: No module named 'pyspark.graphx'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import numpy as np

def create_spark_session():
    """Initialize Spark session with GraphFrames package"""
    return (SparkSession.builder
            .appName("GitHubCommunityDetection")
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
            .master("local[*]")
            .getOrCreate())

def load_data(spark, edges_path, vertices_path):
    """Load and prepare the edge and vertex data"""
    # Define schemas
    edge_schema = StructType([
        StructField("src", IntegerType(), False),
        StructField("dst", IntegerType(), False)
    ])
    
    vertex_schema = StructType([
        StructField("id", IntegerType(), False),
        StructField("ml_target", IntegerType(), True)
    ])
    
    # Load edges and vertices
    edges_df = spark.read.csv(edges_path, header=True, schema=edge_schema)
    vertices_df = spark.read.csv(vertices_path, header=True, schema=vertex_schema)
    
    # Ensure vertex column is named 'id' for GraphFrames
    vertices_df = vertices_df.withColumnRenamed("id", "id")
    
    return edges_df, vertices_df

def create_graph(vertices_df, edges_df):
    """Create GraphFrame from vertex and edge DataFrames"""
    return GraphFrame(vertices_df, edges_df)

def detect_communities(graph, max_iter=10):
    """
    Detect communities using Label Propagation Algorithm
    Returns: DataFrame with vertex id and community label
    """
    # Run Label Propagation
    result = graph.labelPropagation(maxIter=max_iter)
    return result

def calculate_modularity(graph, communities):
    """
    Calculate modularity score for the detected communities
    
    Modularity Q = 1/(2m) * sum[(Aij - (ki*kj)/(2m)) * δ(ci,cj)]
    where:
    - m is the number of edges
    - Aij is 1 if vertices i,j are connected, 0 otherwise
    - ki, kj are degrees of vertices i,j
    - ci, cj are communities of vertices i,j
    """
    # Get edge count
    m = graph.edges.count()
    if m == 0:
        return 0.0
    
    # Join communities with edges
    edge_communities = (graph.edges
        .join(communities.select("id", "label").withColumnRenamed("label", "src_comm"), 
              graph.edges.src == communities.id)
        .join(communities.select("id", "label").withColumnRenamed("label", "dst_comm"), 
              graph.edges.dst == communities.id))
    
    # Calculate vertex degrees
    degrees = (graph.degrees
        .withColumnRenamed("id", "vertex_id")
        .withColumnRenamed("degree", "degree"))
    
    # Calculate modularity components
    modularity_components = (edge_communities
        .join(degrees.withColumnRenamed("degree", "src_degree"), 
              edge_communities.src == degrees.vertex_id)
        .join(degrees.withColumnRenamed("degree", "dst_degree"), 
              edge_communities.dst == degrees.vertex_id)
        .select("src_comm", "dst_comm", "src_degree", "dst_degree"))
    
    # Compute final modularity
    def compute_modularity(row):
        expected = (row.src_degree * row.dst_degree) / (2.0 * m)
        actual = 1.0 if row.src_comm == row.dst_comm else 0.0
        return (actual - expected) / (2.0 * m)
    
    modularity = (modularity_components
        .rdd
        .map(compute_modularity)
        .sum())
    
    return modularity

def main():
    # Initialize Spark
    spark = create_spark_session()
    
    # File paths
    edges_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
    vertices_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
    
    try:
        # Load data
        edges_df, vertices_df = load_data(spark, edges_path, vertices_path)
        
        # Create graph
        graph = create_graph(vertices_df, edges_df)
        
        # Detect communities
        communities = detect_communities(graph)
        
        # Calculate modularity
        modularity = calculate_modularity(graph, communities)
        
        # Output results
        print("\nCommunity Detection Results:")
        print("-" * 50)
        
        # Show community distribution
        community_sizes = (communities
            .groupBy("label")
            .count()
            .orderBy("count", ascending=False))
        
        print("\nCommunity Sizes:")
        community_sizes.show()
        
        print(f"\nModularity Score: {modularity:.4f}")
        
        # Optional: Save results
        communities.write.csv("github_communities.csv", header=True, mode="overwrite")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        
    finally:
        # Stop Spark session
        spark.stop()

if __name__ == "__main__":
    main()

24/11/25 15:32:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id_1, id_2
 Schema: src, dst
Expected: src but found: id_1
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv
24/11/25 15:32:05 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv
24/11/25 15:32:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id_1, id_2
 Schema: src, dst
Expected: src but found: id_1
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv


An error occurred: Column id#72 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via `Dataset.as` before joining them, and specify the column using qualified name, e.g. `df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.


In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from tqdm import tqdm
import numpy as np
import time

def create_spark_session():
    """Initialize Spark session with GraphFrames package"""
    print("Initializing Spark session...")
    return (SparkSession.builder
            .appName("GitHubCommunityDetection")
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
            .master("local[*]")
            .getOrCreate())

def load_data(spark, edges_path, vertices_path):
    """Load and prepare the edge and vertex data with progress bars"""
    print("\nLoading data...")
    
    # Define schemas
    edge_schema = StructType([
        StructField("src", IntegerType(), False),
        StructField("dst", IntegerType(), False)
    ])
    
    vertex_schema = StructType([
        StructField("id", IntegerType(), False),
        StructField("ml_target", IntegerType(), True)
    ])
    
    with tqdm(total=2, desc="Loading datasets") as pbar:
        # Load edges
        edges_df = spark.read.csv(edges_path, header=True, schema=edge_schema)
        pbar.update(1)
        
        # Load vertices
        vertices_df = spark.read.csv(vertices_path, header=True, schema=vertex_schema)
        pbar.update(1)
    
    # Ensure vertex column is named 'id' for GraphFrames
    vertices_df = vertices_df.withColumnRenamed("id", "id")
    
    return edges_df, vertices_df

def create_graph(vertices_df, edges_df):
    """Create GraphFrame from vertex and edge DataFrames"""
    with tqdm(total=1, desc="Creating graph") as pbar:
        graph = GraphFrame(vertices_df, edges_df)
        pbar.update(1)
    return graph

def detect_communities(graph, max_iter=10):
    """
    Detect communities using Label Propagation Algorithm with progress bar
    Returns: DataFrame with vertex id and community label
    """
    print("\nDetecting communities...")
    start_time = time.time()
    
    with tqdm(total=max_iter, desc="Label Propagation") as pbar:
        def progress_callback(iteration):
            pbar.update(1)
        
        # Run Label Propagation
        result = graph.labelPropagation(maxIter=max_iter)
        
        # Simulate progress updates since actual iterations aren't exposed
        for _ in range(max_iter):
            time.sleep(0.1)  # Add small delay to show progress
            pbar.update(1)
    
    print(f"Community detection completed in {time.time() - start_time:.2f} seconds")
    return result

def calculate_modularity(graph, communities):
    """
    Calculate modularity score for the detected communities with progress bar
    """
    print("\nCalculating modularity...")
    
    with tqdm(total=5, desc="Modularity calculation") as pbar:
        # Get edge count
        m = graph.edges.count()
        if m == 0:
            return 0.0
        pbar.update(1)
        
        # Join communities with edges
        edge_communities = (graph.edges
            .join(communities.select("id", "label").withColumnRenamed("label", "src_comm"), 
                  graph.edges.src == communities.id)
            .join(communities.select("id", "label").withColumnRenamed("label", "dst_comm"), 
                  graph.edges.dst == communities.id))
        pbar.update(1)
        
        # Calculate vertex degrees
        degrees = (graph.degrees
            .withColumnRenamed("id", "vertex_id")
            .withColumnRenamed("degree", "degree"))
        pbar.update(1)
        
        # Calculate modularity components
        modularity_components = (edge_communities
            .join(degrees.withColumnRenamed("degree", "src_degree"), 
                  edge_communities.src == degrees.vertex_id)
            .join(degrees.withColumnRenamed("degree", "dst_degree"), 
                  edge_communities.dst == degrees.vertex_id)
            .select("src_comm", "dst_comm", "src_degree", "dst_degree"))
        pbar.update(1)
        
        # Compute final modularity
        def compute_modularity(row):
            expected = (row.src_degree * row.dst_degree) / (2.0 * m)
            actual = 1.0 if row.src_comm == row.dst_comm else 0.0
            return (actual - expected) / (2.0 * m)
        
        modularity = (modularity_components
            .rdd
            .map(compute_modularity)
            .sum())
        pbar.update(1)
    
    return modularity

def main():
    # Initialize Spark
    spark = create_spark_session()
    
    # File paths
    edges_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
    vertices_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
    
    try:
        # Load data
        edges_df, vertices_df = load_data(spark, edges_path, vertices_path)
        
        # Create graph
        graph = create_graph(vertices_df, edges_df)
        
        # Detect communities
        communities = detect_communities(graph)
        
        # Calculate modularity
        modularity = calculate_modularity(graph, communities)
        
        # Output results
        print("\nCommunity Detection Results:")
        print("-" * 50)
        
        # Show community distribution
        print("\nCalculating community sizes...")
        with tqdm(total=1, desc="Computing distribution") as pbar:
            community_sizes = (communities
                .groupBy("label")
                .count()
                .orderBy("count", ascending=False))
            pbar.update(1)
        
        print("\nCommunity Sizes:")
        community_sizes.show()
        
        print(f"\nModularity Score: {modularity:.4f}")
        
        # Optional: Save results
        print("\nSaving results...")
        with tqdm(total=1, desc="Saving communities") as pbar:
            communities.write.csv("github_communities.csv", header=True, mode="overwrite")
            pbar.update(1)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        
    finally:
        # Stop Spark session
        print("\nCleaning up...")
        spark.stop()
        print("Done!")

if __name__ == "__main__":
    main()

Initializing Spark session...

Loading data...


Loading datasets: 100%|██████████| 2/2 [00:00<00:00, 67.84it/s]
Creating graph: 100%|██████████| 1/1 [00:00<00:00, 227.43it/s]



Detecting communities...


Label Propagation:   0%|          | 0/10 [00:00<?, ?it/s]24/11/25 15:33:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id_1, id_2
 Schema: src, dst
Expected: src but found: id_1
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv
24/11/25 15:33:13 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv
24/11/25 15:33:14 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id_1, id_2
 Schema: src, dst
Expected: src but found: id_1
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv
Label Propagation: 100%|██████████| 10/10 [00:54<00:00,  5.48s/it]              


Community detection completed in 54.85 seconds

Calculating modularity...


Modularity calculation:  20%|██        | 1/5 [00:00<00:00,  7.35it/s]


An error occurred: Column id#219 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via `Dataset.as` before joining them, and specify the column using qualified name, e.g. `df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.

Cleaning up...
Done!


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from tqdm import tqdm
import time

def create_spark_session():
    """Initialize Spark session with GraphFrames package"""
    print("Initializing Spark session...")
    return (SparkSession.builder
            .appName("GitHubCommunityDetection")
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
            .config("spark.sql.analyzer.failAmbiguousSelfJoin", "false")
            .master("local[*]")
            .getOrCreate())

def load_data(spark, edges_path, vertices_path):
    """Load and prepare the edge and vertex data with progress bars"""
    print("\nLoading data...")
    
    # Define schemas matching the actual CSV headers
    edge_schema = StructType([
        StructField("id_1", IntegerType(), False),
        StructField("id_2", IntegerType(), False)
    ])
    
    vertex_schema = StructType([
        StructField("id", IntegerType(), False),
        StructField("ml_target", IntegerType(), True),
        StructField("name", StringType(), True)  # Add this if present in your CSV
    ])
    
    with tqdm(total=2, desc="Loading datasets") as pbar:
        # Load edges and rename columns to match GraphFrames requirements
        edges_df = (spark.read.csv(edges_path, header=True, schema=edge_schema)
                   .withColumnRenamed("id_1", "src")
                   .withColumnRenamed("id_2", "dst"))
        pbar.update(1)
        
        # Load vertices
        vertices_df = spark.read.csv(vertices_path, header=True, schema=vertex_schema)
        pbar.update(1)
    
    return edges_df, vertices_df

def create_graph(vertices_df, edges_df):
    """Create GraphFrame from vertex and edge DataFrames"""
    with tqdm(total=1, desc="Creating graph") as pbar:
        graph = GraphFrame(vertices_df, edges_df)
        pbar.update(1)
    return graph

def detect_communities(graph, max_iter=10):
    """
    Detect communities using Label Propagation Algorithm with progress bar
    Returns: DataFrame with vertex id and community label
    """
    print("\nDetecting communities...")
    start_time = time.time()
    
    with tqdm(total=max_iter, desc="Label Propagation") as pbar:
        # Run Label Propagation
        result = graph.labelPropagation(maxIter=max_iter)
        
        # Simulate progress updates
        for _ in range(max_iter):
            time.sleep(0.1)
            pbar.update(1)
    
    print(f"Community detection completed in {time.time() - start_time:.2f} seconds")
    return result

def calculate_modularity(graph, communities):
    """
    Calculate modularity score for the detected communities with progress bar
    """
    print("\nCalculating modularity...")
    
    with tqdm(total=5, desc="Modularity calculation") as pbar:
        # Get edge count
        m = graph.edges.count()
        if m == 0:
            return 0.0
        pbar.update(1)
        
        # Join communities with edges using aliases to avoid ambiguity
        communities1 = communities.select("id", "label").alias("comm1")
        communities2 = communities.select("id", "label").alias("comm2")
        
        edge_communities = (graph.edges
            .join(communities1, 
                  graph.edges.src == communities1.id)
            .join(communities2, 
                  graph.edges.dst == communities2.id)
            .select(
                graph.edges.src,
                graph.edges.dst,
                communities1.label.alias("src_comm"),
                communities2.label.alias("dst_comm")
            ))
        pbar.update(1)
        
        # Calculate vertex degrees with alias
        degrees = graph.degrees.alias("deg")
        degrees1 = degrees.select("id", "degree").alias("deg1")
        degrees2 = degrees.select("id", "degree").alias("deg2")
        
        pbar.update(1)
        
        # Calculate modularity components with explicit aliases
        modularity_components = (edge_communities
            .join(degrees1, 
                  edge_communities.src == degrees1.id)
            .join(degrees2, 
                  edge_communities.dst == degrees2.id)
            .select(
                "src_comm",
                "dst_comm",
                degrees1.degree.alias("src_degree"),
                degrees2.degree.alias("dst_degree")
            ))
        pbar.update(1)
        
        # Compute final modularity
        def compute_modularity(row):
            expected = (row.src_degree * row.dst_degree) / (2.0 * m)
            actual = 1.0 if row.src_comm == row.dst_comm else 0.0
            return (actual - expected) / (2.0 * m)
        
        modularity = (modularity_components
            .rdd
            .map(compute_modularity)
            .sum())
        pbar.update(1)
    
    return modularity

def main():
    # Initialize Spark
    spark = create_spark_session()
    
    # File paths
    edges_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
    vertices_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
    
    try:
        # Load data
        edges_df, vertices_df = load_data(spark, edges_path, vertices_path)
        
        # Create graph
        graph = create_graph(vertices_df, edges_df)
        
        # Detect communities
        communities = detect_communities(graph)
        
        # Calculate modularity
        modularity = calculate_modularity(graph, communities)
        
        # Output results
        print("\nCommunity Detection Results:")
        print("-" * 50)
        
        # Show community distribution
        print("\nCalculating community sizes...")
        with tqdm(total=1, desc="Computing distribution") as pbar:
            community_sizes = (communities
                .groupBy("label")
                .count()
                .orderBy("count", ascending=False))
            pbar.update(1)
        
        print("\nCommunity Sizes:")
        community_sizes.show()
        
        print(f"\nModularity Score: {modularity:.4f}")
        
        # Optional: Save results
        print("\nSaving results...")
        with tqdm(total=1, desc="Saving communities") as pbar:
            communities.write.csv("github_communities.csv", header=True, mode="overwrite")
            pbar.update(1)
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise e  # Add this to see full stacktrace
        
    finally:
        # Stop Spark session
        print("\nCleaning up...")
        spark.stop()
        print("Done!")

if __name__ == "__main__":
    main()

Initializing Spark session...

Loading data...


Loading datasets: 100%|██████████| 2/2 [00:00<00:00, 60.27it/s]
Creating graph: 100%|██████████| 1/1 [00:00<00:00, 231.74it/s]



Detecting communities...


Label Propagation:   0%|          | 0/10 [00:00<?, ?it/s]24/11/25 15:35:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, name, ml_target
 Schema: id, ml_target, name
Expected: ml_target but found: name
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv
Label Propagation: 100%|██████████| 10/10 [00:54<00:00,  5.45s/it]              


Community detection completed in 54.49 seconds

Calculating modularity...


Modularity calculation: 100%|██████████| 5/5 [00:00<00:00,  8.14it/s]



Community Detection Results:
--------------------------------------------------

Calculating community sizes...


Computing distribution: 100%|██████████| 1/1 [00:00<00:00, 95.05it/s]



Community Sizes:
+-----+-----+
|label|count|
+-----+-----+
|35773|31477|
|31126| 5757|
| 6301|  105|
| 8095|   12|
|27923|   10|
| 3672|    9|
|29439|    9|
| 5547|    8|
|31577|    6|
|20611|    6|
|26963|    5|
| 4632|    5|
|18942|    5|
| 1483|    5|
| 1689|    4|
|29520|    4|
|10715|    4|
| 4066|    4|
| 9755|    4|
|27803|    4|
+-----+-----+
only showing top 20 rows


Modularity Score: 0.0000

Saving results...


Saving communities:   0%|          | 0/1 [00:00<?, ?it/s]24/11/25 15:36:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, name, ml_target
 Schema: id, ml_target, name
Expected: ml_target but found: name
CSV file: file:///Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv
Saving communities: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]



Cleaning up...
Done!


In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc, count
from graphframes import GraphFrame
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import networkx as nx
import numpy as np

def create_spark_session():
    """Initialize Spark session with GraphFrames package"""
    return (SparkSession.builder
            .appName("GitHubNetworkAnalysis")
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
            .config("spark.sql.analyzer.failAmbiguousSelfJoin", "false")
            .config("spark.driver.memory", "4g")
            .config("spark.executor.memory", "4g")
            .master("local[*]")
            .getOrCreate())

def load_and_prepare_data(spark, edges_path, vertices_path):
    """Load and prepare the graph data"""
    # Define schemas
    edge_schema = StructType([
        StructField("id_1", IntegerType(), False),
        StructField("id_2", IntegerType(), False)
    ])
    
    vertex_schema = StructType([
        StructField("id", IntegerType(), False),
        StructField("name", StringType(), True),
        StructField("ml_target", IntegerType(), True)
    ])
    
    # Load data
    edges_df = (spark.read.csv(edges_path, header=True, schema=edge_schema)
                .withColumnRenamed("id_1", "src")
                .withColumnRenamed("id_2", "dst"))
    
    vertices_df = spark.read.csv(vertices_path, header=True, schema=vertex_schema)
    
    return edges_df, vertices_df

def analyze_graph_metrics(graph):
    """Compute various graph metrics"""
    print("\nComputing graph metrics...")
    
    # Basic statistics
    num_vertices = graph.vertices.count()
    num_edges = graph.edges.count()
    
    # Degree metrics
    in_degrees = graph.inDegrees
    out_degrees = graph.outDegrees
    
    # Get average degrees
    avg_in_degree = in_degrees.select("inDegree").agg({"inDegree": "avg"}).collect()[0][0]
    avg_out_degree = out_degrees.select("outDegree").agg({"outDegree": "avg"}).collect()[0][0]
    
    # Identify hubs (high out-degree) and authorities (high in-degree)
    top_hubs = (out_degrees.orderBy(desc("outDegree"))
                .join(graph.vertices, out_degrees.id == graph.vertices.id)
                .select("id", "outDegree", "name")
                .limit(10))
    
    top_authorities = (in_degrees.orderBy(desc("inDegree"))
                      .join(graph.vertices, in_degrees.id == graph.vertices.id)
                      .select("id", "inDegree", "name")
                      .limit(10))
    
    # Triangle count
    triangle_counts = graph.triangleCount()
    total_triangles = triangle_counts.select("count").agg({"count": "sum"}).collect()[0][0]
    
    return {
        "num_vertices": num_vertices,
        "num_edges": num_edges,
        "avg_in_degree": avg_in_degree,
        "avg_out_degree": avg_out_degree,
        "top_hubs": top_hubs,
        "top_authorities": top_authorities,
        "total_triangles": total_triangles
    }

def detect_communities(graph, max_iter=10):
    """Detect communities using Label Propagation"""
    print("\nDetecting communities...")
    communities = graph.labelPropagation(maxIter=max_iter)
    
    # Analyze community statistics
    community_stats = (communities.groupBy("label")
                      .agg(count("*").alias("size"))
                      .orderBy(desc("size")))
    
    return communities, community_stats

def visualize_network_stats(graph_metrics, community_stats):
    """Create visualizations of network statistics"""
    print("\nCreating visualizations...")
    
    # Convert community stats to pandas for plotting
    community_sizes = community_stats.toPandas()
    
    # Plot community size distribution
    plt.figure(figsize=(12, 6))
    
    # Plot 1: Community size distribution (log scale)
    plt.subplot(1, 2, 1)
    plt.hist(community_sizes['size'], bins=50, log=True)
    plt.title('Community Size Distribution')
    plt.xlabel('Community Size')
    plt.ylabel('Count (log scale)')
    
    # Plot 2: Top communities by size
    plt.subplot(1, 2, 2)
    top_communities = community_sizes.nlargest(10, 'size')
    plt.bar(range(len(top_communities)), top_communities['size'])
    plt.title('Top 10 Largest Communities')
    plt.xlabel('Community Rank')
    plt.ylabel('Size')
    
    plt.tight_layout()
    plt.savefig('community_analysis.png')
    plt.close()

def analyze_user_roles(graph, communities):
    """Analyze user roles within communities"""
    # Combine degree information with communities
    user_metrics = (graph.degrees
                   .join(communities, "id")
                   .join(graph.vertices.select("id", "ml_target"), "id"))
    
    # Identify influential users in each community
    influential_users = (user_metrics
                        .orderBy(desc("degree"))
                        .groupBy("label")
                        .agg({"degree": "max"})
                        .orderBy(desc("max(degree)")))
    
    return influential_users

def main():
    spark = create_spark_session()
    
    edges_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
    vertices_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
    
    try:
        # Load and prepare data
        edges_df, vertices_df = load_and_prepare_data(spark, edges_path, vertices_path)
        graph = GraphFrame(vertices_df, edges_df)
        
        # Analyze graph metrics
        metrics = analyze_graph_metrics(graph)
        
        print("\nNetwork Statistics:")
        print(f"Number of vertices: {metrics['num_vertices']}")
        print(f"Number of edges: {metrics['num_edges']}")
        print(f"Average in-degree: {metrics['avg_in_degree']:.2f}")
        print(f"Average out-degree: {metrics['avg_out_degree']:.2f}")
        print(f"Total triangles: {metrics['total_triangles']}")
        
        print("\nTop Hubs (Users with highest out-degree):")
        metrics['top_hubs'].show(5)
        
        print("\nTop Authorities (Users with highest in-degree):")
        metrics['top_authorities'].show(5)
        
        # Detect and analyze communities
        communities, community_stats = detect_communities(graph)
        
        print("\nCommunity Statistics:")
        print("Top 10 largest communities:")
        community_stats.show(10)
        
        # Analyze user roles
        influential_users = analyze_user_roles(graph, communities)
        
        print("\nMost influential users by community:")
        influential_users.show(10)
        
        # Create visualizations
        visualize_network_stats(metrics, community_stats)
        
        # Save results
        communities.write.csv("github_communities_detailed.csv", header=True, mode="overwrite")
        
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise e
        
    finally:
        spark.stop()

if __name__ == "__main__":
    main()




Computing graph metrics...
An error occurred: [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`].


AnalysisException: [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`].

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class GitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .getOrCreate()
            
        # Define schemas
        self.edges_schema = StructType([
            StructField("id_1", StringType(), False),
            StructField("id_2", StringType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", StringType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare DataFrames for GraphFrames
        self.edges_df = self.edges_df \
            .withColumn("id_1", col("id_1").cast(LongType())) \
            .withColumn("id_2", col("id_2").cast(LongType())) \
            .withColumnRenamed("id_1", "src") \
            .withColumnRenamed("id_2", "dst")
            
        self.vertices_df = self.vertices_df \
            .withColumn("id", col("id").cast(LongType())) \
            .withColumnRenamed("id", "id")

    def detect_communities(self, max_iterations: int = 10) -> None:
        """
        Detect communities using Label Propagation Algorithm and calculate modularity.
        
        Args:
            max_iterations: Maximum number of iterations for LPA
        """
        # Create GraphFrame
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run LPA for community detection
        communities = graph.labelPropagation(maxIter=max_iterations)
        
        # Calculate modularity
        modularity = self._calculate_modularity(graph, communities)
        
        # Get community statistics
        community_stats = communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nNetwork Modularity: {modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """
        Calculate modularity score for the network.
        
        Args:
            graph: GraphFrame object containing the network
            communities: DataFrame with community assignments
            
        Returns:
            float: Modularity score
        """
        # Get total number of edges
        m = graph.edges.count()
        
        # Calculate node degrees
        degrees = graph.degrees
        
        # Join community assignments with edges
        edges_with_communities = graph.edges \
            .join(communities.select("id", "label").withColumnRenamed("label", "src_community"), 
                  col("src") == col("id")) \
            .join(communities.select("id", "label").withColumnRenamed("label", "dst_community"), 
                  col("dst") == col("id"))
            
        # Join with degrees
        edges_with_info = edges_with_communities \
            .join(degrees.withColumnRenamed("id", "src").withColumnRenamed("degree", "src_degree"), "src") \
            .join(degrees.withColumnRenamed("id", "dst").withColumnRenamed("degree", "dst_degree"), "dst")
        
        # Calculate modularity contributions
        def modularity_contribution(src_comm, dst_comm, src_deg, dst_deg) -> float:
            if src_comm == dst_comm:
                return 1.0 - (src_deg * dst_deg) / (2.0 * m)
            return 0.0
        
        # Register UDF
        from pyspark.sql.functions import udf
        mod_udf = udf(modularity_contribution)
        
        # Calculate total modularity
        modularity = edges_with_info \
            .select(mod_udf("src_community", "dst_community", "src_degree", "dst_degree").alias("contribution")) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * m)

    def _save_results(self, communities) -> None:
        """
        Save community detection results.
        
        Args:
            communities: DataFrame with community assignments
        """
        # Join with original vertex data
        results = communities \
            .join(self.vertices_df, "id") \
            .select("id", "name", "ml_target", "label") \
            .withColumnRenamed("label", "community")
            
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("community_results")

def main():
    # Initialize and run community detection
    detector = GitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()



AnalysisException: [AMBIGUOUS_REFERENCE] Reference `id` is ambiguous, could be: [`id`, `id`].

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class GitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .getOrCreate()
            
        # Define schemas for the actual data format
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data with the correct schema
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare DataFrames for GraphFrames - convert column names
        self.edges_df = self.edges_df \
            .withColumnRenamed("id_1", "src") \
            .withColumnRenamed("id_2", "dst")

    def detect_communities(self, max_iterations: int = 10) -> None:
        """
        Detect communities using Label Propagation Algorithm and calculate modularity.
        
        Args:
            max_iterations: Maximum number of iterations for LPA
        """
        # Create GraphFrame
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run LPA for community detection
        communities = graph.labelPropagation(maxIter=max_iterations)
        
        # Calculate modularity
        modularity = self._calculate_modularity(graph, communities)
        
        # Get community statistics
        community_stats = communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nNetwork Modularity: {modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """
        Calculate modularity score for the network.
        
        Args:
            graph: GraphFrame object containing the network
            communities: DataFrame with community assignments
            
        Returns:
            float: Modularity score
        """
        # Get total number of edges
        m = graph.edges.count()
        
        # Calculate node degrees
        degrees = graph.degrees
        
        # Prepare community DataFrames with unique column names
        src_communities = communities.select(
            col("id").alias("src"),
            col("label").alias("src_community")
        )
        
        dst_communities = communities.select(
            col("id").alias("dst"),
            col("label").alias("dst_community")
        )
        
        # Join community assignments with edges using explicit column references
        edges_with_communities = graph.edges \
            .join(src_communities, "src") \
            .join(dst_communities, "dst")
            
        # Prepare degree DataFrames with unique column names
        src_degrees = degrees.select(
            col("id").alias("src"),
            col("degree").alias("src_degree")
        )
        
        dst_degrees = degrees.select(
            col("id").alias("dst"),
            col("degree").alias("dst_degree")
        )
        
        # Join with degrees using explicit column references
        edges_with_info = edges_with_communities \
            .join(src_degrees, "src") \
            .join(dst_degrees, "dst")
        
        # Calculate modularity contributions
        def modularity_contribution(src_comm, dst_comm, src_deg, dst_deg) -> float:
            if src_comm == dst_comm:
                return 1.0 - (src_deg * dst_deg) / (2.0 * m)
            return 0.0
        
        # Register UDF
        from pyspark.sql.functions import udf
        mod_udf = udf(modularity_contribution)
        
        # Calculate total modularity
        modularity = edges_with_info \
            .select(mod_udf("src_community", "dst_community", "src_degree", "dst_degree").alias("contribution")) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * m)

    def _save_results(self, communities) -> None:
        """
        Save community detection results.
        
        Args:
            communities: DataFrame with community assignments
        """
        # Join with original vertex data using explicit column references
        results = communities \
            .join(self.vertices_df.select(
                col("id"),
                col("name"),
                col("ml_target")
            ), "id") \
            .select("id", "name", "ml_target", "label") \
            .withColumnRenamed("label", "community")
            
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("community_results")

def main():
    # Initialize and run community detection
    detector = GitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()

                                                                                


Network Modularity: 0.4287

Top 10 Communities by Size:
+-----+-----+
|label|count|
+-----+-----+
|35773|31477|
|31126| 5757|
| 6301|  105|
| 8095|   12|
|27923|   10|
| 3672|    9|
|29439|    9|
| 5547|    8|
|20611|    6|
|31577|    6|
+-----+-----+
only showing top 10 rows



AnalysisException: [AMBIGUOUS_REFERENCE] Reference `name` is ambiguous, could be: [`name`, `name`].

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class GitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .getOrCreate()
            
        # Define schemas for the actual data format
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data with the correct schema
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare DataFrames for GraphFrames and rename columns to avoid ambiguity
        self.edges_df = self.edges_df \
            .withColumnRenamed("id_1", "src") \
            .withColumnRenamed("id_2", "dst")
            
        # Rename vertices columns to avoid ambiguity
        self.vertices_df = self.vertices_df \
            .withColumnRenamed("name", "user_name")

    def detect_communities(self, max_iterations: int = 10) -> None:
        """
        Detect communities using Label Propagation Algorithm and calculate modularity.
        
        Args:
            max_iterations: Maximum number of iterations for LPA
        """
        # Create GraphFrame
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run LPA for community detection
        communities = graph.labelPropagation(maxIter=max_iterations)
        
        # Calculate modularity
        modularity = self._calculate_modularity(graph, communities)
        
        # Get community statistics
        community_stats = communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nNetwork Modularity: {modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """
        Calculate modularity score for the network.
        
        Args:
            graph: GraphFrame object containing the network
            communities: DataFrame with community assignments
            
        Returns:
            float: Modularity score
        """
        # Get total number of edges
        m = graph.edges.count()
        
        # Calculate node degrees
        degrees = graph.degrees
        
        # Prepare community DataFrames with unique column names
        src_communities = communities.select(
            col("id").alias("src"),
            col("label").alias("src_community")
        )
        
        dst_communities = communities.select(
            col("id").alias("dst"),
            col("label").alias("dst_community")
        )
        
        # Join community assignments with edges using explicit column references
        edges_with_communities = graph.edges \
            .join(src_communities, "src") \
            .join(dst_communities, "dst")
            
        # Prepare degree DataFrames with unique column names
        src_degrees = degrees.select(
            col("id").alias("src"),
            col("degree").alias("src_degree")
        )
        
        dst_degrees = degrees.select(
            col("id").alias("dst"),
            col("degree").alias("dst_degree")
        )
        
        # Join with degrees using explicit column references
        edges_with_info = edges_with_communities \
            .join(src_degrees, "src") \
            .join(dst_degrees, "dst")
        
        # Calculate modularity contributions
        def modularity_contribution(src_comm, dst_comm, src_deg, dst_deg) -> float:
            if src_comm == dst_comm:
                return 1.0 - (src_deg * dst_deg) / (2.0 * m)
            return 0.0
        
        # Register UDF
        from pyspark.sql.functions import udf
        mod_udf = udf(modularity_contribution)
        
        # Calculate total modularity
        modularity = edges_with_info \
            .select(mod_udf("src_community", "dst_community", "src_degree", "dst_degree").alias("contribution")) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * m)

    def _save_results(self, communities) -> None:
        """
        Save community detection results.
        
        Args:
            communities: DataFrame with community assignments
        """
        # Select columns from communities with unique aliases
        communities_with_label = communities.select(
            col("id").alias("user_id"),
            col("label").alias("community_label")
        )
        
        # Join with vertices using explicit column references
        results = communities_with_label.join(
            self.vertices_df,
            communities_with_label.user_id == self.vertices_df.id
        ).select(
            communities_with_label.user_id.alias("id"),
            self.vertices_df.user_name.alias("name"),
            self.vertices_df.ml_target,
            communities_with_label.community_label.alias("community")
        )
        
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("community_results")

def main():
    # Initialize and run community detection
    detector = GitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()

24/11/25 19:13:37 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1872870 ms exceeds timeout 120000 ms
24/11/25 19:13:37 WARN SparkContext: Killing executors is not supported by current scheduler.
24/11/25 19:13:39 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$


Network Modularity: 0.4287

Top 10 Communities by Size:
+-----+-----+
|label|count|
+-----+-----+
|35773|31477|
|31126| 5757|
| 6301|  105|
| 8095|   12|
|27923|   10|
| 3672|    9|
|29439|    9|
| 5547|    8|
|20611|    6|
|31577|    6|
+-----+-----+
only showing top 10 rows



AnalysisException: Column ml_target#928 are ambiguous. It's probably because you joined several Datasets together, and some of these Datasets are the same. This column points to one of the Datasets but Spark is unable to figure out which one. Please alias the Datasets with different names via `Dataset.as` before joining them, and specify the column using qualified name, e.g. `df.as("a").join(df.as("b"), $"a.id" > $"b.id")`. You can also set spark.sql.analyzer.failAmbiguousSelfJoin to false to disable this check.

24/11/25 21:23:50 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:123)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.isExecutorAlive$lzycompute$1(BlockManagerMasterEndpoint.scala:688)
	at org.apache.spark.storage.BlockManagerMasterE

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class GitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .getOrCreate()
            
        # Define schemas for the actual data format
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data with the correct schema
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare DataFrames for GraphFrames and rename columns to avoid ambiguity
        self.edges_df = self.edges_df \
            .withColumnRenamed("id_1", "src") \
            .withColumnRenamed("id_2", "dst")
            
        # Keep original column names for vertices DataFrame
        self.vertices_df = self.vertices_df.alias("vertices")

    def detect_communities(self, max_iterations: int = 10) -> None:
        """
        Detect communities using Label Propagation Algorithm and calculate modularity.
        
        Args:
            max_iterations: Maximum number of iterations for LPA
        """
        # Create GraphFrame
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run LPA for community detection
        communities = graph.labelPropagation(maxIter=max_iterations)
        
        # Calculate modularity
        modularity = self._calculate_modularity(graph, communities)
        
        # Get community statistics
        community_stats = communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nNetwork Modularity: {modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """
        Calculate modularity score for the network.
        
        Args:
            graph: GraphFrame object containing the network
            communities: DataFrame with community assignments
            
        Returns:
            float: Modularity score
        """
        # Get total number of edges
        m = graph.edges.count()
        
        # Calculate node degrees
        degrees = graph.degrees
        
        # Prepare community DataFrames with unique column names
        src_communities = communities.select(
            col("id").alias("src"),
            col("label").alias("src_community")
        )
        
        dst_communities = communities.select(
            col("id").alias("dst"),
            col("label").alias("dst_community")
        )
        
        # Join community assignments with edges using explicit column references
        edges_with_communities = graph.edges \
            .join(src_communities, "src") \
            .join(dst_communities, "dst")
            
        # Prepare degree DataFrames with unique column names
        src_degrees = degrees.select(
            col("id").alias("src"),
            col("degree").alias("src_degree")
        )
        
        dst_degrees = degrees.select(
            col("id").alias("dst"),
            col("degree").alias("dst_degree")
        )
        
        # Join with degrees using explicit column references
        edges_with_info = edges_with_communities \
            .join(src_degrees, "src") \
            .join(dst_degrees, "dst")
        
        # Calculate modularity contributions
        def modularity_contribution(src_comm, dst_comm, src_deg, dst_deg) -> float:
            if src_comm == dst_comm:
                return 1.0 - (src_deg * dst_deg) / (2.0 * m)
            return 0.0
        
        # Register UDF
        from pyspark.sql.functions import udf
        mod_udf = udf(modularity_contribution)
        
        # Calculate total modularity
        modularity = edges_with_info \
            .select(mod_udf("src_community", "dst_community", "src_degree", "dst_degree").alias("contribution")) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * m)

    def _save_results(self, communities) -> None:
        """
        Save community detection results.
        
        Args:
            communities: DataFrame with community assignments
        """
        # First, create a view of the vertices DataFrame
        self.vertices_df.createOrReplaceTempView("vertices_view")
        
        # Create a view of the communities DataFrame with selected columns
        communities_selected = communities.select(
            col("id"),
            col("label").alias("community_label")
        )
        communities_selected.createOrReplaceTempView("communities_view")
        
        # Use SQL to join the data and select columns
        results = self.spark.sql("""
            SELECT 
                c.id,
                v.name,
                v.ml_target,
                c.community_label as community
            FROM communities_view c
            JOIN vertices_view v ON c.id = v.id
        """)
        
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("community_results")

def main():
    # Initialize and run community detection
    detector = GitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()

24/11/27 15:22:00 WARN Utils: Your hostname, Parass-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.49.38 instead (on interface en0)
24/11/27 15:22:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/parasdhiman/.ivy2/cache
The jars for the packages stored in: /Users/parasdhiman/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2b1a27b3-efb0-42c2-a2b9-6d0dc51c223d;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 60ms :: artifacts dl 2ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   

:: loading settings :: url = jar:file:/Users/parasdhiman/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


24/11/27 15:22:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                


Network Modularity: 0.4287

Top 10 Communities by Size:
+-----+-----+
|label|count|
+-----+-----+
|35773|31477|
|31126| 5757|
| 6301|  105|
| 8095|   12|
|27923|   10|
| 3672|    9|
|29439|    9|
| 5547|    8|
|20611|    6|
|31577|    6|
+-----+-----+
only showing top 10 rows



In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, count, lit, collect_list, avg, max as sql_max,
    udf, array
)
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, 
    DoubleType, ArrayType
)
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class HighModularityGitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("High Modularity GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .getOrCreate()
            
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare enhanced edges with weights
        self.edges_df = self._prepare_weighted_edges()
        
    def _prepare_weighted_edges(self):
        """Prepare edges with sophisticated weight calculation"""
        # Create bidirectional edges
        edges_reverse = self.edges_df.select(
            col("id_2").alias("id_1"),
            col("id_1").alias("id_2")
        )
        edges_df = self.edges_df.union(edges_reverse).distinct()
        
        # Calculate neighbors for each node
        neighbors_df = edges_df.groupBy("id_1").agg(
            collect_list("id_2").alias("neighbors")
        )
        
        # Define the Jaccard similarity UDF
        def calculate_jaccard_similarity(neighbors1, neighbors2):
            if not neighbors1 or not neighbors2:
                return 0.0
            set1 = set(neighbors1)
            set2 = set(neighbors2)
            intersection = len(set1.intersection(set2))
            union = len(set1.union(set2))
            return float(intersection) / float(union) if union > 0 else 0.0
        
        jaccard_udf = udf(
            calculate_jaccard_similarity,
            DoubleType()
        )
        
        # Join edges with neighbor information
        weighted_edges = edges_df.alias("edges") \
            .join(
                neighbors_df.alias("neighbors"),
                col("edges.id_1") == col("neighbors.id_1"),
                "left"
            ).join(
                neighbors_df.select(
                    col("id_1").alias("id_2"),
                    col("neighbors").alias("neighbors2")
                ).alias("neighbors2"),
                col("edges.id_2") == col("neighbors2.id_2"),
                "left"
            ).select(
                col("edges.id_1").alias("src"),
                col("edges.id_2").alias("dst"),
                jaccard_udf(
                    col("neighbors.neighbors"),
                    col("neighbors2.neighbors2")
                ).alias("weight")
            )
        
        # Fill any null weights with minimal value
        weighted_edges = weighted_edges.na.fill(0.1, ["weight"])
        
        return weighted_edges

    def detect_communities(self, max_iterations: int = 20) -> None:
        """
        Enhanced community detection with multi-level optimization.
        """
        # Create initial GraphFrame with weighted edges
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run multiple iterations with different initializations
        best_modularity = float('-inf')
        best_communities = None
        
        for i in range(5):
            # Set checkpoint directory for current iteration
            self.spark.sparkContext.setCheckpointDir(f"checkpoint_{i}")
            
            # Initial community detection
            communities = graph.labelPropagation(maxIter=max_iterations)
            
            # Add degree information
            communities = communities.join(
                graph.degrees,
                communities.id == graph.degrees.id
            ).select(
                communities.id,
                communities.label,
                graph.degrees.degree
            )
            
            # Calculate modularity
            modularity = self._calculate_modularity(graph, communities)
            
            if modularity > best_modularity:
                best_modularity = modularity
                best_communities = communities
            
            # Clean up checkpoint directory
            self.spark.sparkContext.setCheckpointDir(None)
        
        # Get community statistics
        community_stats = best_communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nOptimized Network Modularity: {best_modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(best_communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """Enhanced modularity calculation with edge weights"""
        total_weight = graph.edges.agg({"weight": "sum"}).collect()[0][0]
        
        src_communities = communities.select(
            col("id").alias("src"),
            col("label").alias("src_community"),
            col("degree").alias("src_degree")
        )
        
        dst_communities = communities.select(
            col("id").alias("dst"),
            col("label").alias("dst_community"),
            col("degree").alias("dst_degree")
        )
        
        edges_with_info = graph.edges \
            .join(src_communities, "src") \
            .join(dst_communities, "dst")
        
        def weighted_modularity_contribution(src_comm, dst_comm, src_deg, dst_deg, weight):
            if src_comm == dst_comm:
                expected = (src_deg * dst_deg) / (2.0 * total_weight)
                return weight * (1.0 - expected)
            return 0.0
        
        mod_udf = udf(weighted_modularity_contribution, DoubleType())
        
        modularity = edges_with_info \
            .select(
                mod_udf(
                    "src_community",
                    "dst_community",
                    "src_degree",
                    "dst_degree",
                    "weight"
                ).alias("contribution")
            ) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * total_weight)

    def _save_results(self, communities) -> None:
        """Save community detection results"""
        self.vertices_df.createOrReplaceTempView("vertices_view")
        
        communities_selected = communities.select(
            col("id"),
            col("label").alias("community_label")
        )
        communities_selected.createOrReplaceTempView("communities_view")
        
        results = self.spark.sql("""
            SELECT 
                c.id,
                v.name,
                v.ml_target,
                c.community_label as community
            FROM communities_view c
            JOIN vertices_view v ON c.id = v.id
        """)
        
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("high_modularity_results")

def main():
    detector = HighModularityGitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()

                                                                                


Optimized Network Modularity: -1.6876

Top 10 Communities by Size:
+-----+-----+
|label|count|
+-----+-----+
|35773|37420|
| 5547|    9|
|31126|    6|
| 6938|    6|
| 6301|    6|
|24460|    5|
|29161|    5|
|18942|    5|
| 2353|    5|
| 4892|    4|
+-----+-----+
only showing top 10 rows



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, first, struct, explode, array
from pyspark.sql.window import Window
import pyspark.sql.functions as F

class AffiliationGraphModelDetector:
    def __init__(self, edges_path):
        """
        Initialize Spark Session and load graph data
        """
        self.spark = SparkSession.builder \
            .appName("Affiliation Graph Model Community Detection") \
            .master("local[*]") \
            .getOrCreate()
        
        # Read edges
        self.edges_df = self.spark.read.csv(edges_path, header=True) \
            .select(
                col("id_1").cast("long").alias("src"),
                col("id_2").cast("long").alias("dst")
            )
        
        # Prepare nodes
        self.prepare_nodes()
    
    def prepare_nodes(self):
        """
        Prepare unique nodes with initial community assignment
        """
        # Extract unique nodes
        nodes_df = self.edges_df.select("src").union(self.edges_df.select("dst")).distinct()
        
        # Assign initial communities using modulo on src column
        self.nodes_df = nodes_df.withColumn(
            "community", 
            F.expr("src % 10")  # Initial simple community assignment using src column
        )
    
    def community_detection(self, max_iterations=5):
        """
        Iterative community detection based on neighbor communities
        """
        current_nodes = self.nodes_df
        
        for iteration in range(max_iterations):
            print(f"Iteration {iteration + 1}")
            
            # Find neighbor communities
            neighbor_communities = self.edges_df.join(
                current_nodes.withColumnRenamed("src", "node"),
                current_nodes["src"] == self.edges_df["src"]
            ).join(
                current_nodes.withColumnRenamed("dst", "neighbor"),
                current_nodes["dst"] == self.edges_df["dst"]
            ).select(
                self.edges_df["src"], 
                self.edges_df["dst"], 
                col("community").alias("neighbor_community")
            )
            
            # Aggregate neighbor communities
            community_aggregation = neighbor_communities.groupBy("src").agg(
                F.mode("neighbor_community").alias("new_community")
            )
            
            # Update communities
            current_nodes = current_nodes.join(
                community_aggregation, 
                current_nodes["src"] == community_aggregation["src"], 
                "left"
            ).select(
                current_nodes["src"], 
                F.coalesce("new_community", current_nodes["community"]).alias("community")
            )
        
        return current_nodes
    
    def calculate_modularity(self, communities):
        """
        Calculate modularity of detected communities
        """
        # Total number of edges
        total_edges = self.edges_df.count()
        
        # Edges within each community
        community_edges = self.edges_df.join(
            communities.withColumnRenamed("src", "id1"),
            self.edges_df["src"] == communities["id1"]
        ).join(
            communities.withColumnRenamed("dst", "id2"),
            self.edges_df["dst"] == communities["id2"]
        ).filter(
            F.col("community_id1") == F.col("community_id2")
        ).count()
        
        # Community size distribution
        community_sizes = communities.groupBy("community").agg(count("*").alias("size"))
        
        # Expected internal edges
        expected_edges = community_sizes.withColumn(
            "expected", 
            F.col("size") * (F.col("size") - 1) / (2 * total_edges)
        )
        
        # Compute modularity
        modularity = (community_edges / total_edges) - expected_edges.agg(
            F.sum("expected")
        ).collect()[0][0]
        
        return modularity
    
    def run_community_detection(self):
        """
        Run complete community detection process
        """
        # Detect communities
        detected_communities = self.community_detection()
        
        # Calculate modularity
        modularity = self.calculate_modularity(detected_communities)
        
        # Display results
        print("\nCommunity Detection Results:")
        detected_communities.groupBy("community").count().show()
        print(f"Modularity Score: {modularity}")
        
        return detected_communities, modularity
    
    def __del__(self):
        """
        Clean up Spark session
        """
        self.spark.stop()

def main():
    # Path to edges CSV
    edges_path = 'musae_git_edges.csv'
    
    # Initialize and run community detection
    agm_detector = AffiliationGraphModelDetector(edges_path)
    communities, modularity = agm_detector.run_community_detection()

if __name__ == "__main__":
    main()

Iteration 1


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `dst` cannot be resolved. Did you mean one of the following? [`src`, `community`].

In [9]:
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("CommunityDetectionAGM") \
    .getOrCreate()

# Load edges (muse_git_edges.csv) and users' targets (muse_git_target.csv)
edges_df = spark.read.csv("musae_git_edges.csv", header=False, inferSchema=True)
users_df = spark.read.csv("musae_git_target.csv", header=True, inferSchema=True)


# Print the first few rows of the DataFrames to inspect
edges_df.show(5)
users_df.show(5)

from pyspark.graphx import Graph

# Convert DataFrame to RDD for GraphX processing
edges_rdd = edges_df.rdd.map(lambda row: (row[0], row[1]))  # (id_1, id_2)
vertices_rdd = users_df.rdd.map(lambda row: (row[0], row[1]))  # (id, name)

# Construct the graph
graph = Graph(vertices_rdd, edges_rdd)

from pyspark.graphx import LabelPropagation

# Run the Label Propagation algorithm to detect communities
lp_result = graph.labelPropagation(maxIter=5)

# Extract the community labels (each vertex will have a community label assigned)
community_rdd = lp_result.vertices

# Show the first few community labels
community_rdd.take(10)

# Calculate Modularity
modularity_value = graph.modularity(lp_result)
print(f"Modularity: {modularity_value}")

# Save the community results to a CSV file
community_rdd.toDF(["user_id", "community_label"]).write.csv("community_results.csv")



+----+-----+
| _c0|  _c1|
+----+-----+
|id_1| id_2|
|   0|23977|
|   1|34526|
|   1| 2370|
|   1|14683|
+----+-----+
only showing top 5 rows

+---+------------+---------+
| id|        name|ml_target|
+---+------------+---------+
|  0|      Eiryyy|        0|
|  1|  shawflying|        0|
|  2| JpMCarrilho|        1|
|  3|   SuhwanCha|        0|
|  4|sunilangadi2|        1|
+---+------------+---------+
only showing top 5 rows



ModuleNotFoundError: No module named 'pyspark.graphx'

In [None]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("CommunityDetectionAGM") \
    .getOrCreate()

# Load the edge list (muse_git_edges.csv) – this represents user relationships
edges_df = spark.read.csv("musae_git_edges.csv", header=False, inferSchema=True)
edges_df = edges_df.withColumnRenamed("id_1", "src").withColumnRenamed("id_2", "dst")

# Load the user data (muse_git_target.csv) – this contains user details like name and target variable
users_df = spark.read.csv("muasae_git_target.csv", header=True, inferSchema=True)
users_df = users_df.withColumnRenamed("id", "id").withColumnRenamed("name", "name")

# Show basic information about the dataset
print(f"Number of vertices (users): {users_df.count()}")
print(f"Number of edges: {edges_df.count()}")

# Create a GraphFrame: Vertices (users) and Edges (relationships)
g = GraphFrame(users_df, edges_df)

# Show the graph vertices and edges
g.vertices.show(5)  # Display first 5 vertices (users)
g.edges.show(5)     # Display first 5 edges (user relationships)

# Run the Label Propagation algorithm for community detection
result = g.labelPropagation(maxIter=5)

# Show the results (user_id, community_label)
print("Community detection results:")
result.select("id", "label").show()

# Save the community results to a CSV
result.select("id", "label").write.csv("community_results.csv")

# Optionally, evaluate the communities with modularity (a simple modularity estimate)
# We calculate the number of edges within each community (intra-community edges) and the total number of edges.
intra_community_edges = g.edges.join(result, g.edges.src == result.id).filter(col("label") == result.label).count()
total_edges = g.edges.count()

# Compute a simple modularity estimate (edges within community / total edges)
modularity = intra_community_edges / total_edges
print(f"Modularity estimate: {modularity}")

# You can also output the community labels and their associated user information for analysis
result_with_names = result.join(users_df, result.id == users_df.id).select("id", "name", "label")
result_with_names.show(10)  # Display first 10 users with their community labels and names


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/parasdhiman/Desktop/assmt/BDA/assmt4/muasae_git_target.csv.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from graphframes import GraphFrame
import numpy as np
from typing import Iterator, Tuple

class GitHubCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        self.spark = SparkSession.builder \
            .appName("GitHub Community Detection") \
            .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "8g") \
            .config("spark.executor.cores", "4") \
            .getOrCreate()
            
        # Define schemas for the actual data format
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data with the correct schema
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
        
        # Prepare DataFrames for GraphFrames and rename columns to avoid ambiguity
        self.edges_df = self.edges_df \
            .withColumnRenamed("id_1", "src") \
            .withColumnRenamed("id_2", "dst")
            
        # Keep original column names for vertices DataFrame
        self.vertices_df = self.vertices_df.alias("vertices")

    def detect_communities(self, max_iterations: int = 200) -> None:
        """
        Detect communities using Label Propagation Algorithm and calculate modularity.
        
        Args:
            max_iterations: Maximum number of iterations for LPA
        """
        # Create GraphFrame
        graph = GraphFrame(self.vertices_df, self.edges_df)
        
        # Run LPA for community detection
        communities = graph.labelPropagation(maxIter=max_iterations)
        
        # Calculate modularity
        modularity = self._calculate_modularity(graph, communities)
        
        # Get community statistics
        community_stats = communities.groupBy("label").count().orderBy("count", ascending=False)
        
        # Print results
        print(f"\nNetwork Modularity: {modularity:.4f}")
        print("\nTop 10 Communities by Size:")
        community_stats.show(10)
        
        # Save results
        self._save_results(communities)

    def _calculate_modularity(self, graph: GraphFrame, communities) -> float:
        """
        Calculate modularity score for the network.
        
        Args:
            graph: GraphFrame object containing the network
            communities: DataFrame with community assignments
            
        Returns:
            float: Modularity score
        """
        # Get total number of edges
        m = graph.edges.count()
        
        # Calculate node degrees
        degrees = graph.degrees
        
        # Prepare community DataFrames with unique column names
        src_communities = communities.select(
            col("id").alias("src"),
            col("label").alias("src_community")
        )
        
        dst_communities = communities.select(
            col("id").alias("dst"),
            col("label").alias("dst_community")
        )
        
        # Join community assignments with edges using explicit column references
        edges_with_communities = graph.edges \
            .join(src_communities, "src") \
            .join(dst_communities, "dst")
            
        # Prepare degree DataFrames with unique column names
        src_degrees = degrees.select(
            col("id").alias("src"),
            col("degree").alias("src_degree")
        )
        
        dst_degrees = degrees.select(
            col("id").alias("dst"),
            col("degree").alias("dst_degree")
        )
        
        # Join with degrees using explicit column references
        edges_with_info = edges_with_communities \
            .join(src_degrees, "src") \
            .join(dst_degrees, "dst")
        
        # Calculate modularity contributions
        def modularity_contribution(src_comm, dst_comm, src_deg, dst_deg) -> float:
            if src_comm == dst_comm:
                return 1.0 - (src_deg * dst_deg) / (2.0 * m)
            return 0.0
        
        # Register UDF
        from pyspark.sql.functions import udf
        mod_udf = udf(modularity_contribution)
        
        # Calculate total modularity
        modularity = edges_with_info \
            .select(mod_udf("src_community", "dst_community", "src_degree", "dst_degree").alias("contribution")) \
            .agg({"contribution": "sum"}) \
            .collect()[0][0]
            
        return modularity / (2.0 * m)

    def _save_results(self, communities) -> None:
        """
        Save community detection results.
        
        Args:
            communities: DataFrame with community assignments
        """
        # First, create a view of the vertices DataFrame
        self.vertices_df.createOrReplaceTempView("vertices_view")
        
        # Create a view of the communities DataFrame with selected columns
        communities_selected = communities.select(
            col("id"),
            col("label").alias("community_label")
        )
        communities_selected.createOrReplaceTempView("communities_view")
        
        # Use SQL to join the data and select columns
        results = self.spark.sql("""
            SELECT 
                c.id,
                v.name,
                v.ml_target,
                c.community_label as community
            FROM communities_view c
            JOIN vertices_view v ON c.id = v.id
        """)
        
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("community_results")

def main():
    # Initialize and run community detection
    detector = GitHubCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.detect_communities()

if __name__ == "__main__":
    main()

:: loading settings :: url = jar:file:/Users/parasdhiman/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/parasdhiman/.ivy2/cache
The jars for the packages stored in: /Users/parasdhiman/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-63163b53-702c-4a09-8ef4-201fa8f23e03;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 61ms :: artifacts dl 3ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------


Network Modularity: 0.4329

Top 10 Communities by Size:
+-----+-----+
|label|count|
+-----+-----+
|35773|32255|
|31126| 5137|
| 8095|   12|
| 3672|    9|
| 5547|    8|
| 6301|    8|
|20611|    6|
| 4632|    5|
|18942|    5|
|31577|    5|
+-----+-----+
only showing top 10 rows



[Stage 62719:>                                                      (0 + 0) / 1]

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

class AGMCommunityDetection:
    def __init__(self, edges_path: str, vertices_path: str):
        # Configure Spark
        conf = SparkConf().setAppName("AGM Community Detection") \
            .set("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.2-s_2.12")
        self.sc = SparkContext(conf=conf)
        self.spark = SparkSession(self.sc)
        
        # Define schemas
        self.edges_schema = StructType([
            StructField("id_1", IntegerType(), False),
            StructField("id_2", IntegerType(), False)
        ])
        
        self.vertices_schema = StructType([
            StructField("id", IntegerType(), False),
            StructField("name", StringType(), False),
            StructField("ml_target", IntegerType(), False)
        ])
        
        # Read data
        self.edges_df = self.spark.read.csv(edges_path, header=True, schema=self.edges_schema)
        self.vertices_df = self.spark.read.csv(vertices_path, header=True, schema=self.vertices_schema)
    
    def affiliation_graph_model(self, num_communities: int = 5, max_iterations: int = 50):
        """
        Implement Affiliation Graph Model for community detection
        
        Args:
            num_communities: Number of initial communities to create
            max_iterations: Maximum iterations for community refinement
        """
        # Convert DataFrames to RDDs for GraphX processing
        vertices_rdd = self.vertices_df.rdd.map(lambda x: (x.id, x))
        edges_rdd = self.edges_df.rdd.map(lambda x: ((x.id_1, x.id_2), 1.0))
        
        # Initial community assignment (random)
        def initial_community_assignment(vertex):
            import random
            return (vertex[0], random.randint(0, num_communities - 1))
        
        communities_rdd = vertices_rdd.map(initial_community_assignment)
        
        # AGM Community Detection Algorithm
        def agm_community_update(iteration, vertices_with_communities):
            # Aggregation step: determine community based on neighbors
            def community_aggregation(vertex_id, adjacency_list):
                # Collect community labels of neighbors
                neighbor_communities = [communities_rdd.lookup(neighbor)[0] for neighbor in adjacency_list]
                
                # Choose most frequent community among neighbors
                from statistics import mode
                try:
                    return mode(neighbor_communities)
                except:
                    # Fallback to current community if no mode
                    return communities_rdd.lookup(vertex_id)[0]
            
            # Update communities based on neighbor information
            updated_communities = vertices_with_communities.map(
                lambda x: (x[0], community_aggregation(x[0], x[1]))
            )
            
            return updated_communities
        
        # Iterative community refinement
        final_communities = communities_rdd
        for _ in range(max_iterations):
            final_communities = agm_community_update(_, final_communities)
        
        # Convert back to DataFrame
        communities_df = self.spark.createDataFrame(final_communities, ["id", "community"])
        
        # Calculate and print modularity
        modularity = self._calculate_modularity(communities_df)
        print(f"Network Modularity: {modularity}")
        
        # Save results
        self._save_results(communities_df)
    
    def _calculate_modularity(self, communities_df):
        """
        Calculate modularity score for the network
        
        Args:
            communities_df: DataFrame with community assignments
        
        Returns:
            float: Modularity score
        """
        # Prepare community and degree information
        communities = communities_df.alias("communities")
        
        # Join edges with communities
        edges_with_communities = self.edges_df \
            .join(communities.select(col("id").alias("id_1"), col("community").alias("comm1")), "id_1") \
            .join(communities.select(col("id").alias("id_2"), col("community").alias("comm2")), "id_2")
        
        # Modularity calculation logic
        def modularity_contribution(same_community: bool, src_degree: int, dst_degree: int, total_edges: int) -> float:
            expected_edges = (src_degree * dst_degree) / (2 * total_edges)
            return 1 if same_community else 0 - expected_edges
        
        # Register UDF
        modularity_udf = udf(modularity_contribution, FloatType())
        
        # Calculate total modularity
        total_edges = self.edges_df.count()
        
        modularity_result = edges_with_communities \
            .withColumn("modularity_contrib", 
                modularity_udf(
                    col("comm1") == col("comm2"),
                    col("id_1_degree"),  # Assume degree columns are added
                    col("id_2_degree"),
                    total_edges
                )
            ) \
            .agg({"modularity_contrib": "sum"}) \
            .collect()[0][0]
        
        return modularity_result / (2 * total_edges)
    
    def _save_results(self, communities_df):
        """
        Save community detection results
        
        Args:
            communities_df: DataFrame with community assignments
        """
        # Join with original vertices to include names
        results = self.vertices_df \
            .join(communities_df, "id") \
            .select("id", "name", "ml_target", "community")
        
        # Save to CSV
        results.write \
            .mode("overwrite") \
            .option("header", True) \
            .csv("agm_community_results")

def main():
    # Initialize and run community detection
    detector = AGMCommunityDetection(
        edges_path="musae_git_edges.csv",
        vertices_path="musae_git_target.csv"
    )
    detector.affiliation_graph_model(num_communities=5, max_iterations=50)

if __name__ == "__main__":
    main()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=GitHub Community Detection, master=local[*]) created by getOrCreate at /var/folders/wd/p8rd_3q56c3gyt1n_mf2kh8h0000gn/T/ipykernel_91591/1975767835.py:10 

In [1]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame
import pandas as pd
import json

spark = SparkSession.builder \
    .appName("AGM Community Detection") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12") \
    .getOrCreate()
    
edges_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
targets_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
features_path = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_features.json"


edges_df = spark.read.csv(edges_path, header=True)
edges_df = edges_df.withColumnRenamed("id_1", "src").withColumnRenamed("id_2", "dst")

nodes_df = spark.read.csv(targets_path, header=True).select("id", "name")

with open(features_path, 'r') as f:
    features = json.load(f)
    
features_df = pd.DataFrame(list(features.items()), columns=['id', 'features'])

graph = GraphFrame(nodes_df, edges_df)

communities = graph.labelPropagation(maxIter=100)

community_assignments = communities.rdd.map(lambda row: (row.id, row.label)).collectAsMap()

edges_rdd = edges_df.rdd.map(lambda row: (row.src, row.dst))

def calculate_modularity(edges_rdd, community_assignments):
    m = edges_rdd.count()
    communities = set(community_assignments.values())
    modularity = 0.0

    for community in communities:
        nodes_in_community = {node for node, label in community_assignments.items() if label == community}
        internal_edges = edges_rdd.filter(
            lambda edge: edge[0] in nodes_in_community and edge[1] in nodes_in_community
        ).count()
        degree_sum = edges_rdd.filter(
            lambda edge: edge[0] in nodes_in_community or edge[1] in nodes_in_community
        ).count()
        modularity += (internal_edges / m) - (degree_sum / (2 * m))**2

    return modularity

modularity_score = calculate_modularity(edges_rdd, community_assignments)

print(f"Detected Communities: {communities.show(truncate=False)}")
print(f"Modularity Score: {modularity_score}")

:: loading settings :: url = jar:file:/Users/parasdhiman/opt/anaconda3/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/parasdhiman/.ivy2/cache
The jars for the packages stored in: /Users/parasdhiman/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c6c6ef02-4765-4a4f-bbef-eaf77c71bdb4;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.0-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 61ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.0-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	----------------------

+---+-----------------+------------+
|id |name             |label       |
+---+-----------------+------------+
|0  |Eiryyy           |833223655499|
|1  |shawflying       |833223655499|
|2  |JpMCarrilho      |833223655499|
|3  |SuhwanCha        |833223655499|
|4  |sunilangadi2     |833223655499|
|5  |j6montoya        |833223655499|
|6  |sfate            |833223655499|
|7  |amituuush        |833223655499|
|8  |mauroherlein     |833223655499|
|9  |ParadoxZero      |833223655499|
|10 |llazzaro         |833223655499|
|11 |beeva-manueldepaz|833223655499|
|12 |damianmuti       |833223655499|
|13 |apobbati         |833223655499|
|14 |hwlv             |833223655499|
|15 |haroldoramirez   |833223655499|
|16 |jasonblanchard   |833223655499|
|17 |BahiHussein      |833223655499|
|18 |itsmevanessi     |833223655499|
|19 |nwjsmith         |833223655499|
+---+-----------------+------------+
only showing top 20 rows

Detected Communities: None
Modularity Score: 0.7490743493203716


In [None]:
import os
import json
import numpy as np
import networkx as nx
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, struct
from pyspark.sql.types import (
    StructType, StructField, 
    StringType, IntegerType, 
    ArrayType, DoubleType, BooleanType
)
from typing import List, Dict, Any

import logging

class AffiliationGraphModelDetector:
    def __init__(
        self, 
        edges_path: str, 
        nodes_path: str = None, 
        features_path: str = None, 
        num_communities: int = 5, 
        max_iterations: int = 100
    ):
        """
        Initialize Affiliation Graph Model Community Detector
        
        :param edges_path: Path to edges CSV file
        :param nodes_path: Optional path to nodes CSV file
        :param features_path: Optional path to features JSON file
        :param num_communities: Number of communities to detect
        :param max_iterations: Maximum iterations for community detection
        """
        # Configure logging
        logging.basicConfig(level=logging.INFO, 
                            format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)
        
        # Validate input paths
        self._validate_file_paths(edges_path, nodes_path, features_path)
        
        # Store paths and parameters
        self.edges_path = edges_path
        self.nodes_path = nodes_path
        self.features_path = features_path
        self.num_communities = num_communities
        self.max_iterations = max_iterations
        
        # Initialize Spark Session
        self.spark = self._create_spark_session()
        
        # Load data
        self.edges_df = self._load_edges()
        self.nodes_df = self._load_nodes()
        self.features = self._load_features()
        
        # Create NetworkX graph for advanced community detection
        self.graph = self._create_networkx_graph()
    
    def _validate_file_paths(self, *paths):
        """
        Validate that input file paths exist
        
        :param paths: Paths to validate
        """
        for path in paths:
            if path and not os.path.exists(path):
                self.logger.error(f"File not found: {path}")
                raise FileNotFoundError(f"File not found: {path}")
    
    def _create_spark_session(self):
        """
        Create Spark Session with necessary configurations
        
        :return: Configured SparkSession
        """
        return SparkSession.builder \
            .appName("Affiliation Graph Model Community Detection") \
            .config("spark.driver.memory", "4g") \
            .config("spark.executor.memory", "4g") \
            .config("spark.sql.shuffle.partitions", "200") \
            .getOrCreate()
    
    def _load_edges(self):
        """
        Load edges from CSV file
        
        :return: Spark DataFrame of edges
        """
        try:
            edges_df = self.spark.read.csv(self.edges_path, header=True)
            
            # Rename columns if needed
            if 'id_1' in edges_df.columns and 'id_2' in edges_df.columns:
                edges_df = edges_df.withColumnRenamed("id_1", "src") \
                                   .withColumnRenamed("id_2", "dst")
            
            self.logger.info(f"Loaded {edges_df.count()} edges")
            return edges_df
        except Exception as e:
            self.logger.error(f"Error loading edges: {e}")
            raise
    
    def _load_nodes(self):
        """
        Load nodes from CSV file or generate from edges
        
        :return: Spark DataFrame of nodes
        """
        if self.nodes_path:
            try:
                nodes_df = self.spark.read.csv(self.nodes_path, header=True)
                self.logger.info(f"Loaded {nodes_df.count()} nodes")
                return nodes_df
            except Exception as e:
                self.logger.warning(f"Error loading nodes file: {e}. Generating nodes from edges.")
        
        # Generate nodes from unique node IDs in edges
        nodes_df = self.edges_df.select(col("src").alias("id")) \
            .union(self.edges_df.select(col("dst").alias("id"))) \
            .distinct()
        
        self.logger.info(f"Generated {nodes_df.count()} nodes from edges")
        return nodes_df
    
    def _load_features(self):
        """
        Load features from JSON file
        
        :return: Dictionary of node features or None
        """
        if not self.features_path:
            self.logger.info("No features path provided")
            return None
        
        try:
            with open(self.features_path, 'r') as f:
                features = json.load(f)
            self.logger.info(f"Loaded features for {len(features)} nodes")
            return features
        except Exception as e:
            self.logger.error(f"Error loading features: {e}")
            return None
    
    def _create_networkx_graph(self):
        """
        Create NetworkX graph from Spark edges DataFrame
        
        :return: NetworkX Graph
        """
        try:
            # Convert Spark DataFrame to list of edges
            edges = self.edges_df.select("src", "dst").collect()
            
            # Create NetworkX graph
            G = nx.Graph()
            G.add_edges_from([(row.src, row.dst) for row in edges])
            
            self.logger.info(f"Created NetworkX graph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")
            return G
        except Exception as e:
            self.logger.error(f"Error creating NetworkX graph: {e}")
            raise
    
    def detect_communities(self):
        """
        Detect communities using Louvain method
        
        :return: Communities and modularity
        """
        try:
            # Use Louvain method for community detection
            communities = list(nx.community.louvain_communities(self.graph))
            
            # Calculate modularity
            modularity = nx.community.modularity(self.graph, communities)
            
            # Log results
            self.logger.info(f"Detected {len(communities)} communities")
            self.logger.info(f"Modularity Score: {modularity}")
            
            return {
                "communities": communities,
                "modularity": modularity,
                "num_communities": len(communities)
            }
        except Exception as e:
            self.logger.error(f"Community detection failed: {e}")
            raise
    
    def export_results(self, results, output_path='community_results.json'):
        """
        Export community detection results
        
        :param results: Community detection results
        :param output_path: Path to save results
        """
        try:
            # Convert communities to a list of node lists
            serializable_communities = [list(community) for community in results['communities']]
            
            export_data = {
                "num_communities": results['num_communities'],
                "modularity": results['modularity'],
                "communities": serializable_communities
            }
            
            with open(output_path, 'w') as f:
                json.dump(export_data, f, indent=2)
            
            self.logger.info(f"Results exported to {output_path}")
        except Exception as e:
            self.logger.error(f"Error exporting results: {e}")
    
    def analyze_communities(self, results):
        """
        Perform additional analysis on detected communities
        
        :param results: Community detection results
        """
        try:
            communities = results['communities']
            
            # Community size distribution
            community_sizes = [len(community) for community in communities]
            
            print("\nCommunity Analysis:")
            print(f"Total Communities: {len(communities)}")
            print(f"Average Community Size: {np.mean(community_sizes):.2f}")
            print(f"Smallest Community Size: {min(community_sizes)}")
            print(f"Largest Community Size: {max(community_sizes)}")
            
            # Optional: Feature-based analysis if features are available
            if self.features:
                self._analyze_community_features(communities)
        
        except Exception as e:
            self.logger.error(f"Community analysis failed: {e}")
    
    def _analyze_community_features(self, communities):
        """
        Analyze community features if available
        
        :param communities: List of communities
        """
        if not self.features:
            return
        
        # Placeholder for feature-based analysis
        # You can extend this to analyze feature distributions in communities
        pass

def main(
    edges_path, 
    nodes_path=None, 
    features_path=None, 
    num_communities=5, 
    max_iterations=50
):
    """
    Main function to run Affiliation Graph Model Community Detection
    
    :param edges_path: Path to edges CSV file
    :param nodes_path: Optional path to nodes CSV file
    :param features_path: Optional path to features JSON file
    :param num_communities: Number of communities to detect
    :param max_iterations: Maximum iterations for community detection
    """
    # Create detector
    detector = AffiliationGraphModelDetector(
        edges_path=edges_path, 
        nodes_path=nodes_path,
        features_path=features_path,
        num_communities=num_communities,
        max_iterations=max_iterations
    )
    
    # Detect communities
    results = detector.detect_communities()
    
    # Analyze communities
    detector.analyze_communities(results)
    
    # Export results
    detector.export_results(results)
    
    print("\nCommunity Detection Completed Successfully!")

if __name__ == "__main__":
    # REPLACE THESE PATHS WITH YOUR ACTUAL FILE PATHS
    # EDGES_PATH = "/path/to/your/edges.csv"
    # NODES_PATH = "/path/to/your/nodes.csv"  # Optional
    # FEATURES_PATH = "/path/to/your/features.json"  # Optional
    
    EDGES_PATH = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_edges.csv"
    NODES_PATH = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_target.csv"
    FEATURES_PATH = "/Users/parasdhiman/Desktop/assmt/BDA/assmt4/git_web_ml/musae_git_features.json"
    
    # Run the main function
    main(
        edges_path=EDGES_PATH, 
        nodes_path=NODES_PATH,
        features_path=FEATURES_PATH
    )

2024-11-28 18:19:42,230 - INFO - Loaded 289003 edges
2024-11-28 18:19:42,381 - INFO - Loaded 37700 nodes
2024-11-28 18:19:42,569 - INFO - Loaded features for 37700 nodes
2024-11-28 18:19:44,733 - INFO - Created NetworkX graph with 37700 nodes and 289003 edges
2024-11-28 18:19:49,861 - INFO - Detected 26 communities
2024-11-28 18:19:49,862 - INFO - Modularity Score: 0.45395386093221535
2024-11-28 18:19:49,884 - INFO - Results exported to community_results.json



Community Analysis:
Total Communities: 26
Average Community Size: 1450.00
Smallest Community Size: 3
Largest Community Size: 7420

Community Detection Completed Successfully!


In [1]:
from pyspark.sql import SparkSession
import random
from collections import defaultdict

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("AGM Community Detection") \
    .getOrCreate()

# Load data
edges_path = "musae_git_edges.csv"
nodes_path = "musae_git_target.csv"

# Read edges and nodes data
edges_df = spark.read.csv(edges_path, header=True).selectExpr("id_1 as src", "id_2 as dst")
nodes_df = spark.read.csv(nodes_path, header=True).select("id")

# Collect nodes and edges as Python objects
nodes = [row.id for row in nodes_df.collect()]
edges = [(row.src, row.dst) for row in edges_df.collect()]

# Initialize parameters
num_nodes = len(nodes)
num_communities = 5  # Set the number of communities
iterations = 10  # Number of AGM iterations

# Initialize community assignments randomly
node_to_communities = {node: random.randint(0, num_communities - 1) for node in nodes}

# Function to calculate the probability of an edge existing based on community affiliations
def edge_probability(node1, node2, community_affiliations):
    return 1.0 if community_affiliations[node1] == community_affiliations[node2] else 0.1

# AGM iterations
for _ in range(iterations):
    # Probabilistically update community affiliations for each node
    for node in nodes:
        # Count how many edges connect this node to each community
        community_scores = defaultdict(int)
        for neighbor in [edge[1] for edge in edges if edge[0] == node] + \
                         [edge[0] for edge in edges if edge[1] == node]:
            community = node_to_communities[neighbor]
            community_scores[community] += 1
        
        # Assign the node to the community that maximizes edge likelihood
        node_to_communities[node] = max(community_scores, key=community_scores.get)

# Assign final communities
community_assignments = spark.createDataFrame(
    [(node, community) for node, community in node_to_communities.items()],
    ["node", "community"]
)

# Evaluate modularity
def calculate_modularity(edges, community_assignments):
    # Prepare data
    community_dict = {row.node: row.community for row in community_assignments.collect()}
    total_edges = len(edges)
    
    # Compute modularity
    modularity = 0
    for (src, dst) in edges:
        src_community = community_dict[src]
        dst_community = community_dict[dst]
        
        src_degree = sum(1 for edge in edges if edge[0] == src or edge[1] == src)
        dst_degree = sum(1 for edge in edges if edge[0] == dst or edge[1] == dst)
        
        modularity += int(src_community == dst_community) - (src_degree * dst_degree) / (2 * total_edges)
    
    return modularity / (2 * total_edges)

modularity_score = calculate_modularity(edges, community_assignments)

# Show results
print("Community Assignments:")
community_assignments.show()
print(f"Modularity Score: {modularity_score}")

24/11/28 14:53:12 WARN Utils: Your hostname, Parass-MacBook-Air-2.local resolves to a loopback address: 127.0.0.1; using 192.168.49.38 instead (on interface en0)
24/11/28 14:53:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/28 14:53:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


KeyboardInterrupt: 