In [1]:
import pandas as pd
import networkx as nx
import numpy as np

In [2]:
# Read the commit table from the feather file and preview its first rows.
commits_path = '../data/commits.feather'
commits = pd.read_feather(commits_path)
commits.head(5)

Unnamed: 0,commit_id,project_id,project_name,parent_id
0,321132855,26730207,meta-ivi,321132856.0
1,333294039,27617768,ExcelLaunchPad,333294036.0
2,302312189,24513310,yii2-webception,302312191.0
3,124854783,3452681,mammoth.js,124843456.0
4,423307958,32130467,phase-0,423307476.0


In [3]:
# Draw five commit rows (fixed seed) and keep only their project_name column.
# The draw is over rows, so the same project name can be selected more than once.
project_names_sample = commits[['project_name']].sample(n=5, random_state=2019)

# Keep every commit that belongs to one of the sampled projects.
sampled_names = project_names_sample['project_name'].values
in_sampled_project = commits['project_name'].isin(sampled_names)
commits_sample = commits.loc[in_sampled_project]
commits_sample.head()

Unnamed: 0,commit_id,project_id,project_name,parent_id
312,57035161,3526154,oneclick,57035155.0
313,162455517,3526154,oneclick,162450416.0
3666,61587113,3526154,oneclick,61539488.0
3667,269200796,3526154,oneclick,269200795.0
5391,39513853,3526154,oneclick,39513850.0


In [4]:
# Build one directed commit graph per sampled project, keyed by project name.
# Every edge runs parent -> child (source = parent_id, target = commit_id).
graphs = {}

# The sample is drawn over commit rows, so a project name can appear more than
# once; build each project's graph a single time.
for project in project_names_sample['project_name'].unique():
    project_commits = commits_sample.loc[commits_sample['project_name'] == project]

    # from_pandas_edgelist reads the 'source' and 'target' columns by default.
    edges = project_commits[['commit_id', 'parent_id']].rename(
        columns={'commit_id': 'target', 'parent_id': 'source'}
    )

    # parent_id is displayed as a float column, which is how pandas stores
    # integer ids alongside missing values. A missing parent would otherwise
    # be inserted as a NaN node, and because NaN != NaN each such row could
    # add its own spurious node and edge. Drop those rows from the edge list
    # and restore integer ids so parent nodes carry the same type as commit ids.
    has_parent = edges['source'].notna()
    edges = edges.loc[has_parent].astype({'source': 'int64'})

    graph = nx.from_pandas_edgelist(edges, create_using=nx.DiGraph)
    # Commits without a recorded parent stay in the graph as nodes; they simply
    # contribute no incoming edge.
    graph.add_nodes_from(project_commits.loc[~has_parent, 'commit_id'].tolist())
    graph.name = project
    graphs[project] = graph

In [5]:
# Select one project's graph for the feature computations that follow.
# `graphs` is keyed by the sampled project names, so this name must be one of
# them; otherwise the lookup raises KeyError.
repo_name = 'javarosa'
repo = graphs[repo_name]

### Features

- Number of Nodes
- Number of Edges
- Density
- Average Clustering Coefficient
- Transitivity
- Is weakly connected
- Number of weakly connected components
- Number of attracting components

In [6]:
# Graph-level features of `repo`, one scalar (or boolean) per name.

# Size
num_nodes = nx.number_of_nodes(repo)
num_edges = nx.number_of_edges(repo)

# Density and clustering
density = nx.density(repo)
avg_clustering = nx.average_clustering(repo)
transitivity = nx.transitivity(repo)

# Connectivity (direction ignored for the weak-connectivity measures)
weakly_connected = nx.is_weakly_connected(repo)
num_weakly = nx.number_weakly_connected_components(repo)
num_attracting = nx.number_attracting_components(repo)

In [7]:
# Per-node measures for `repo`. networkx returns a dict (node -> score) for
# each call; every name below holds the `.values()` view of that dict, so the
# node keys are not kept.

# Upper bound on iterations passed to both centrality calls that accept one.
CENTRALITY_MAX_ITER = int(1e6)

degree_centrality = nx.algorithms.centrality.degree_centrality(repo).values()
in_degree_centrality = nx.algorithms.centrality.in_degree_centrality(repo).values()
out_degree_centrality = nx.algorithms.centrality.out_degree_centrality(repo).values()
eigen_centrality = nx.algorithms.centrality.eigenvector_centrality(
    repo, max_iter=CENTRALITY_MAX_ITER
).values()
katz_centrality = nx.algorithms.centrality.katz_centrality(
    repo, max_iter=CENTRALITY_MAX_ITER
).values()

# `clustering` yields per-node clustering coefficients, not triangle counts,
# so the result is exposed under an accurate name.
clustering = nx.algorithms.cluster.clustering(repo).values()
# Alias kept for any later cell that still reads the old, misleading name.
num_triangles = clustering

In [8]:
# Materialise the degree views of `repo` as lists of (node, degree) pairs.
degree = list(repo.degree())
in_degree = list(repo.in_degree())
out_degree = list(repo.out_degree())

- Degree Centrality **dict**
- In-Degree Centrality **dict**
- Out-Degree Centrality **dict**
- Eigenvector Centrality **dict**
- Katz Centrality **dict**
- Current-flow Closeness Centrality **dict**
- Betweenness Centrality **dict**
- Clustering **dict**

- Average degree (overall)
- Standard deviation of degree (overall)
- Average in-degree
- Standard deviation of in-degree
- Average out-degree
- Standard deviation of out-degree