In [None]:
import git
import networkx as nx
import matplotlib.pyplot as plt
import tempfile
import shutil
import re
from collections import defaultdict
from datetime import datetime, timedelta
from github import Github
from google.colab import userdata
import numpy as np
import networkx as nx
import community as community_louvain  # For community detection (pip install python-louvain)
from networkx.algorithms import community as nx_community
from google.colab import drive
import os

# Session-wide flag: False until generateGraphSet() has mounted Google Drive,
# after which it is set to True so later calls skip the mount step.
mounted = False

# Default look-back window in days (roughly 1.5 years), measured from the newest commit.
_DEFAULT_WINDOW_DAYS = 547
# Canvas size in inches shared by every generated figure.
_FIGURE_SIZE = (60, 60)
# Fixed seed so the spring layout is reproducible and identical across all figures.
_LAYOUT_SEED = 42


def _parse_repo_full_name(repo_url):
    """Return the ``owner/name`` slug encoded in a Git clone URL.

    Accepts HTTPS URLs with or without a ``.git`` suffix or a trailing slash,
    and scp-style SSH URLs (``git@host:owner/name.git``).

    Raises:
        ValueError: if no non-empty owner and repository name can be extracted.
    """
    segments = repo_url.strip().rstrip('/').split('/')
    if len(segments) < 2:
        raise ValueError(f"Cannot derive 'owner/name' from repository URL: {repo_url!r}")

    # For scp-style URLs the owner segment still carries the "user@host:" prefix.
    owner = segments[-2].rsplit(':', 1)[-1]
    name = segments[-1]
    # Strip only a trailing ".git"; a plain replace() would also mangle names
    # such as "user.github.io".
    if name.endswith('.git'):
        name = name[:-len('.git')]

    if not owner or not name:
        raise ValueError(f"Cannot derive 'owner/name' from repository URL: {repo_url!r}")
    return f"{owner}/{name}"


def _normalize_username(username):
    """Collapse an author name to lowercase ASCII alphanumerics for duplicate grouping.

    Names without any ASCII alphanumerics (for example names written entirely
    in a non-Latin script) would all collapse to the empty string and be
    merged into one contributor, so those fall back to the lowercased raw name.
    """
    collapsed = re.sub(r"[^a-zA-Z0-9]", "", username).lower()
    return collapsed or username.strip().lower()


def _is_bot(username, account_types):
    """Return True when *username* should be treated as a bot account.

    Args:
        username: author name taken from a commit.
        account_types: mapping of GitHub login -> account type as reported by
            the contributors listing.
    """
    lowered = username.lower()
    # Names ending in "[bot]" are treated as bots even when the account is
    # missing from the contributors listing.
    if lowered.endswith("[bot]"):
        return True
    account_type = account_types.get(username)
    if account_type is None:
        return False
    return account_type == "Bot" or "bot" in lowered


def _author_name(commit):
    """Return the commit's author name, or None when author or name is missing."""
    author = commit.author
    return author.name if author else None


def _build_contribution_graph(repo, account_types, window_days):
    """Build the co-contribution graph for the recent history of *repo*.

    Nodes are contributors (one per normalized author name); an edge joins two
    contributors when both have commits touching the same file inside the
    window that ends at the newest commit and spans *window_days* days.

    Args:
        repo: cloned repository object exposing ``iter_commits``.
        account_types: mapping of GitHub login -> account type, used for bot filtering.
        window_days: size of the look-back window in days.

    Returns:
        The populated undirected ``networkx`` graph.
    """
    most_recent_commit = next(repo.iter_commits())
    cutoff_date = most_recent_commit.committed_datetime - timedelta(days=window_days)

    # Pass 1: collect the in-window commits of non-bot authors and pick one
    # representative display name per normalized name. The first name seen
    # (the one on the newest commit) is used so node labels are deterministic.
    unique_contributors = {}
    recent_commits = []
    for commit in repo.iter_commits():
        if commit.committed_datetime < cutoff_date:
            break  # Only the recent window is of interest.
        author_name = _author_name(commit)
        if not author_name or _is_bot(author_name, account_types):
            continue
        normalized = _normalize_username(author_name)
        unique_contributors.setdefault(normalized, author_name)
        recent_commits.append((commit, normalized))

    graph = nx.Graph()
    graph.add_nodes_from(unique_contributors.values())

    # Pass 2: connect each commit author with everyone else who touched the
    # same files. The author set of a file does not depend on which commit we
    # are looking at, so it is computed once per file and cached instead of
    # re-running the per-file history query for every (commit, file) pair.
    authors_by_file = {}
    for commit, normalized in recent_commits:
        author_node = unique_contributors[normalized]
        for path in commit.stats.files:
            file_authors = authors_by_file.get(path)
            if file_authors is None:
                file_authors = set()
                for touching in repo.iter_commits(paths=path, since=cutoff_date):
                    other_name = _author_name(touching)
                    if not other_name or _is_bot(other_name, account_types):
                        continue
                    other_node = unique_contributors.get(_normalize_username(other_name))
                    if other_node:
                        file_authors.add(other_node)
                authors_by_file[path] = file_authors

            for other_node in file_authors:
                if other_node != author_node:
                    graph.add_edge(author_node, other_node)

    return graph


def _save_current_figure(repo_dir, filename, description):
    """Save the current matplotlib figure as a PNG in *repo_dir*, then close it.

    Closing matters: each figure uses a very large canvas and matplotlib keeps
    figures alive until they are closed explicitly.
    """
    fig = plt.gcf()
    fig_path = os.path.join(repo_dir, filename)
    try:
        fig.savefig(fig_path, format="png", bbox_inches="tight")
    finally:
        plt.close(fig)
    print(f"{description} saved to {fig_path}")


def _plot_highlighted(graph, pos, highlighted, color, label, title):
    """Draw *graph* in sky blue and overdraw the *highlighted* nodes in *color*."""
    plt.figure(figsize=_FIGURE_SIZE)
    nx.draw_networkx(graph, pos, with_labels=True, node_color="skyblue", edge_color="lightgray")
    nx.draw_networkx_nodes(graph, pos, nodelist=list(highlighted), node_color=color, label=label)
    plt.title(title)


def generateGraphSet(repo_url, *, window_days=_DEFAULT_WINDOW_DAYS, top_n=5):
    """Generate and save the contribution-network figures for one repository.

    The repository is cloned into a temporary directory, a co-contribution
    graph is built from the commits of the last *window_days* days, and five
    PNG figures are written to
    ``/content/drive/MyDrive/RepoGraphs/<owner>/<name>/``:
    ``NetworkGraph.png``, ``KeyCollab.png``, ``KnowledgeSilo.png``,
    ``Collab_Bottleneck.png`` and ``PMKT.png``.

    Args:
        repo_url: clone URL of the repository, e.g.
            ``"https://github.com/wso2/apk.git"``.
        window_days: look-back window in days, counted back from the newest
            commit. Defaults to 547 (about 1.5 years).
        top_n: how many top-ranked nodes to highlight in the key-collaborator
            and bottleneck figures. Defaults to 5.

    Raises:
        ValueError: if ``owner/name`` cannot be derived from *repo_url*. This
            is checked before Drive is mounted or anything is cloned.

    Side effects:
        Mounts Google Drive on the first call (tracked through the module-level
        ``mounted`` flag), reads the ``GithubToken`` Colab secret, queries the
        GitHub API, and prints the path of every saved figure.
    """
    global mounted

    # Validate first so a malformed URL fails fast, before any I/O happens.
    repo_name = _parse_repo_full_name(repo_url)

    if not mounted:
        drive.mount('/content/drive')
        mounted = True

    base_save_dir = '/content/drive/MyDrive/RepoGraphs'
    repo_dir = os.path.join(base_save_dir, repo_name)
    os.makedirs(repo_dir, exist_ok=True)  # Also creates base_save_dir when missing.

    # Fetch every contributor's account type in one listing so bot checks do
    # not need one API request per author.
    github_token = userdata.get('GithubToken')  # Provided via a Colab secret.
    github_repo = Github(github_token).get_repo(repo_name)
    account_types = {
        contributor.login: contributor.type
        for contributor in github_repo.get_contributors()
    }

    # The clone is only needed while the graph is built; the context manager
    # removes it even when graph construction fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        repo = git.Repo.clone_from(repo_url, temp_dir)
        try:
            G = _build_contribution_graph(repo, account_types, window_days)
        finally:
            repo.close()

    # One seeded layout shared by every figure keeps node positions comparable
    # between the plots and reproducible between runs.
    pos = nx.spring_layout(G, seed=_LAYOUT_SEED)

    # Figure 1: whole network, node size proportional to degree centrality.
    centrality = nx.degree_centrality(G)
    plt.figure(figsize=_FIGURE_SIZE)
    nx.draw_networkx(
        G,
        pos=pos,
        with_labels=False,
        node_size=[centrality[node] * 1000 for node in G.nodes()],
        node_color="skyblue",
        edge_color="lightgray",
    )
    # Labels are drawn slightly above their nodes, in red.
    label_pos = {node: (x, y + 0.02) for node, (x, y) in pos.items()}
    nx.draw_networkx_labels(G, pos=label_pos, font_color="red")
    plt.title("Contribution Network")
    _save_current_figure(repo_dir, "NetworkGraph.png", "Network Graph Plot")

    # Figure 2: key collaborators = highest degree centrality.
    top_central_nodes = sorted(centrality, key=centrality.get, reverse=True)[:top_n]
    _plot_highlighted(
        G, pos, top_central_nodes, "red", "Key Collaborators",
        "Key Collaborators (in red)",
    )
    _save_current_figure(repo_dir, "KeyCollab.png", "Key Collab Plot")

    # Figure 3: knowledge silos = modularity communities. Without edges every
    # node is its own community; this is handled here because the modularity
    # computation can divide by the (zero) edge count on some NetworkX versions.
    if G.number_of_edges() == 0:
        silo_communities = [{node} for node in G.nodes()]
    else:
        silo_communities = list(nx_community.greedy_modularity_communities(G))
    silo_groups = {
        node: index
        for index, members in enumerate(silo_communities)
        for node in members
    }

    plt.figure(figsize=_FIGURE_SIZE)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(silo_communities)))
    for index, members in enumerate(silo_communities):
        nx.draw_networkx_nodes(
            G, pos, nodelist=list(members),
            node_color=[colors[index]], label=f"Silo {index+1}",
        )
    nx.draw_networkx_edges(G, pos, alpha=0.5, edge_color="lightgray")
    nx.draw_networkx_labels(G, pos, font_size=12, font_color="black")
    plt.title("Knowledge Silos (Community Clusters)")
    _save_current_figure(repo_dir, "KnowledgeSilo.png", "Knowledge Silos Plot")

    # Figure 4: collaboration bottlenecks = highest betweenness centrality.
    betweenness = nx.betweenness_centrality(G)
    top_bottlenecks = sorted(betweenness, key=betweenness.get, reverse=True)[:top_n]
    _plot_highlighted(
        G, pos, top_bottlenecks, "orange", "Collaboration Bottlenecks",
        "Collaboration Bottlenecks (in orange)",
    )
    _save_current_figure(repo_dir, "Collab_Bottleneck.png", "Collab Bottleneck Plot")

    # Figure 5: mentorship / knowledge-transfer candidates = nodes whose
    # neighbours belong to more than one community.
    intercommunity_nodes = set()
    for node in G.nodes():
        neighbor_communities = {
            silo_groups[neighbor]
            for neighbor in G.neighbors(node)
            if neighbor in silo_groups
        }
        if len(neighbor_communities) > 1:
            intercommunity_nodes.add(node)

    _plot_highlighted(
        G, pos, intercommunity_nodes, "orange", "Mentorship Candidates",
        "Potential Mentorship or Knowledge Transfer Candidates (in orange)",
    )
    _save_current_figure(repo_dir, "PMKT.png", "Mentorship Plot")


In [None]:
# Repositories to analyse. Every entry is a clone URL that is handed to
# generateGraphSet() in turn.
repo_url = [
    "https://github.com/wso2/apk.git",
]
# Further candidates, currently disabled (move into the list above to enable):
#   "https://github.com/khoj-ai/khoj.git",
#   "https://github.com/All-Hands-AI/OpenHands.git",
#   "https://github.com/Stirling-Tools/Stirling-PDF.git",
#   "https://github.com/puppeteer/puppeteer.git",
#   "https://github.com/abi/screenshot-to-code.git",
#   "https://github.com/Cinnamon/kotaemon.git",
#   "https://github.com/bluesky-social/social-app.git",
#   "https://github.com/trufflesecurity/trufflehog.git",
#   "https://github.com/yamadashy/repomix.git",
#   "https://github.com/localsend/localsend.git",

# Produce the full figure set for each configured repository.
for url in repo_url:
    generateGraphSet(url)