In [1]:
import pickle
import itertools
import networkx as nx
import math
import seaborn as sns
from matplotlib import pyplot as plt
from collections import OrderedDict
%matplotlib inline
from networkx.algorithms import bipartite
from networkx.drawing.nx_agraph import graphviz_layout
import hypernetx as hnx
import powerlaw
import pandas as pd
import numpy as np
import copy
import operator
import collections
import json



# Table of Contents

### [Section 1](#section1): Contribution Distributions Type 
### [Section 2](#section2): HyperNetwork Analysis
### [Section 3](#section3): Bipartite and Projection Analysis

<a id = "section1"></a>
# Section 1:  Identify contribution distribution for each of the top 14 libraries and cpython. 

In [2]:
# Read in data from the pickle file; see read_diles_convert_to_dict_simplified for
# the process that builds pax_net.
# NOTE: pickle.load can execute arbitrary code -- only load pickles from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("./Dependency Graphs/pax_net3.pkl", "rb") as pkl_file:
    pax_net = pickle.load(pkl_file)


# List of top repos by Paretian binning -- see PyPI Analytics
repos = ["click", "cpython", "django","matplotlib","numpy","odoo","pandas","pytest","pytest-cov","pyyaml","requests",
         "scipy","setuptools","six","sphinx"]


In [3]:
# Containers used to analyze the data; every repo in `repos` gets an empty entry.

# Contribution numbers per library, used to build the distribution histograms.
#   key:   library
#   value: list of contribution numbers
lib_counts_by_contrib = {repo: [] for repo in repos}
lib_by_pulls = {repo: [] for repo in repos}
lib_by_commits = {repo: [] for repo in repos}

# Each contributor's contribution to each library, by name.
#   key:   library
#   value: dictionary of
#            key2:   github username
#            value2: number of contributions
lib_producers = {repo: {} for repo in repos}
lib_producers_split = {repo: {} for repo in repos}


In [4]:
# Critical to this measurement is how a contribution is defined.
# A contribution is a successful pull (e.g. a pull request) or a successful commit.
# These are merely proxies, as each user has their own technique for commits.

for user, record in pax_net.items():
    for lib in record["contributor"]:
        # A library missing from either mapping counts as zero.
        pulls = record["successful_pulls"].get(lib, 0)
        coms = record["commits"].get(lib, 0)
        total = coms + pulls

        lib_counts_by_contrib[lib].append(total)
        lib_by_pulls[lib].append(pulls)
        lib_by_commits[lib].append(coms)
        lib_producers[lib][user] = total
        lib_producers_split[lib][user] = [coms, pulls]

lib_by_commits.keys()
    

dict_keys(['click', 'cpython', 'django', 'matplotlib', 'numpy', 'odoo', 'pandas', 'pytest', 'pytest-cov', 'pyyaml', 'requests', 'scipy', 'setuptools', 'six', 'sphinx'])

In [5]:
# Assess bin size: count the cpython contributors whose contribution total lies
# within 48 of the smallest total.
cpython_counts = lib_counts_by_contrib['cpython']
# The minimum does not change during the scan, so compute it once instead of
# once per contributor.
min_c = min(cpython_counts)
county = sum(1 for contrib in cpython_counts if contrib < min_c + 48)
county, min_c

(886, 1)

# Histograms of distributions

In [None]:
# Histogram (70 bins) of the per-contributor contribution totals for cpython.
sns.set(rc={"figure.figsize": (12, 12)})
hist_ax = sns.histplot(lib_counts_by_contrib["cpython"], bins=70)
hist_ax.set_title("Distribution of cPython Contributions", fontsize=20, fontweight="bold")

In [None]:
# cpython contribution totals, largest first.
sorted_py = np.sort(lib_counts_by_contrib['cpython'])[::-1]
# Leading run of contributors with more than 100 contributions; the scan stops
# at the first contributor at or below that threshold.
below_bend = list(itertools.takewhile(lambda n_contribs: n_contribs > 100, sorted_py))
# Share of all contributions made by that leading group.
percent_head = sum(below_bend)/sum(sorted_py)
percent_head, len(below_bend)/len(sorted_py), len(below_bend), len(sorted_py)


In [None]:

# Set the figure size before the plot is created so that it applies to this figure.
sns.set(rc={'figure.figsize':(12,12)})

# yvals runs evenly from 0 to 1 along the ranking held in sorted_py
# (sorted_py is the descending array built from the cpython contribution totals).
yvals = np.arange(len(sorted_py))/float(len(sorted_py)-1)
sns.lineplot(x=sorted_py, y=yvals)
plt.title('Distribution of cPython Contributions', fontsize = 24, fontweight ='bold')
plt.tick_params(axis='x', labelsize=16)
plt.tick_params(axis='y', labelsize=16)
plt.ylabel('Percent of Contributors', fontsize=18)
plt.xlabel('Number of Contributions', fontsize=18)
style = dict(size=20, color='blue')
# NOTE(review): the figures in this annotation are hard-coded; re-check them against
# the head-share cell whenever the input data changes.
plt.text(450, 0.6, "5% (47) of contributors account for 83% of contributions", **style)

In [None]:
# Histogram (70 bins) of the per-contributor successful-pull counts for cpython.
sns.set(rc={"figure.figsize": (12, 12)})
pulls_ax = sns.histplot(lib_by_pulls["cpython"], bins=70)
pulls_ax.set_title("Distribution of Python Pulls", fontsize=20, fontweight="bold")

In [None]:
# Histogram (70 bins) of the per-contributor commit counts for cpython.
sns.set(rc={"figure.figsize": (12, 12)})
commits_ax = sns.histplot(lib_by_commits["cpython"], bins=70)
commits_ax.set_title("Distribution of Python Commits", fontsize=20, fontweight="bold")

In [None]:
# Plot one contribution histogram per library (cpython excluded).
#
# Because the scale of activity varies so much between libraries, the panels do not
# share a y axis; instead they are ordered from least to greatest total number of
# contributions.
count_libs = [(lib, sum(counts)) for lib, counts in lib_counts_by_contrib.items()
              if lib != 'cpython']
sorted_libs = sorted(count_libs, key=lambda tup: tup[1])
sorted_count = OrderedDict((lib, lib_counts_by_contrib[lib]) for lib, total in sorted_libs)

sns.set()  # apply the seaborn theme once rather than once per panel
fig, ax = plt.subplots(7,2, figsize=(12,10))

libs = list(sorted_count.keys())
commits = list(sorted_count.values())
# Pair panels with libraries in row-major order. zip stops at the shorter sequence,
# so a library count other than 14 no longer raises IndexError.
for panel, lib_name, lib_values in zip(ax.flat, libs, commits):
    panel.set_title(lib_name, fontsize = 15, fontweight='bold')
    sns.histplot(lib_values, ax=panel, bins=70)
fig.tight_layout()
fig.suptitle('Contribution Distribution for Top 14 Libraries', position=(.5,1.05), fontsize=20, fontweight='bold')
    


In [None]:
# Plot one successful-pull histogram per library (cpython excluded).
#
# Because the scale of activity varies so much between libraries, the panels do not
# share a y axis; instead they are ordered from least to greatest total number of pulls.
count_libs = [(lib, sum(counts)) for lib, counts in lib_by_pulls.items()
              if lib != 'cpython']
sorted_libs = sorted(count_libs, key=lambda tup: tup[1])
sorted_count = OrderedDict((lib, lib_by_pulls[lib]) for lib, total in sorted_libs)

sns.set()  # apply the seaborn theme once rather than once per panel
fig, ax = plt.subplots(7,2, figsize=(12,10))

libs = list(sorted_count.keys())
commits = list(sorted_count.values())
# Pair panels with libraries in row-major order. zip stops at the shorter sequence,
# so a library count other than 14 no longer raises IndexError.
for panel, lib_name, lib_values in zip(ax.flat, libs, commits):
    panel.set_title(lib_name, fontsize = 15, fontweight='bold')
    sns.histplot(lib_values, ax=panel, bins=70)
fig.tight_layout()
fig.suptitle('Pull Distribution for Top 14 Libraries', position=(.5,1.05), fontsize=20, fontweight='bold')
    


In [None]:
# Plot one commit histogram per library (cpython excluded).
#
# Because the scale of activity varies so much between libraries, the panels do not
# share a y axis; instead they are ordered from least to greatest total number of commits.
count_libs = [(lib, sum(counts)) for lib, counts in lib_by_commits.items()
              if lib != 'cpython']
sorted_libs = sorted(count_libs, key=lambda tup: tup[1])
sorted_count = OrderedDict((lib, lib_by_commits[lib]) for lib, total in sorted_libs)

sns.set()  # apply the seaborn theme once rather than once per panel
fig, ax = plt.subplots(7,2, figsize=(12,10))

libs = list(sorted_count.keys())
commits = list(sorted_count.values())
# Pair panels with libraries in row-major order. zip stops at the shorter sequence,
# so a library count other than 14 no longer raises IndexError.
for panel, lib_name, lib_values in zip(ax.flat, libs, commits):
    panel.set_title(lib_name, fontsize = 15, fontweight='bold')
    sns.histplot(lib_values, ax=panel, bins=70)
fig.tight_layout()
fig.suptitle('Commit Distribution for Top 14 Libraries', position=(.5,1.05), fontsize=20, fontweight='bold')
    


# Identify Distribution of the Top 15 

In [None]:

def get_fit(distro):
    """Find the heavy-tailed distribution that best fits ``distro``.

    Fits the data with ``powerlaw.Fit`` and compares every ordered pair of the
    fit's supported distributions with ``distribution_compare``. A distribution
    is reported as best when its comparison value R is positive against every
    other candidate. The p-values are recorded in the table but are not used
    for the selection.

    Parameters
    ----------
    distro : sequence of numbers
        The observations to fit (passed straight to ``powerlaw.Fit``).

    Returns
    -------
    best : str
        Name of the winning distribution, or "" when no candidate beats all others.
    dist_results : pandas.DataFrame
        One row per ordered pair, with columns "Distribution 1",
        "Distribution 2", "R" and "p".
    """
    results = powerlaw.Fit(distro)
    type_heavy = list(results.supported_distributions.keys())

    dist_results = {"Distribution 1": [], "Distribution 2": [], "R": [], "p": []}
    # Number of pairwise comparisons each candidate wins (R > 0).
    wins = {name: 0 for name in type_heavy}
    for ty in type_heavy:
        for other in type_heavy:
            if ty == other:
                continue
            R, p = results.distribution_compare(ty, other)
            dist_results["Distribution 1"].append(ty)
            dist_results["Distribution 2"].append(other)
            dist_results["R"].append(R)
            dist_results["p"].append(p)
            if R > 0:
                wins[ty] += 1

    # A candidate must beat every other one. The required count is derived from
    # the number of candidates rather than hard-coded, so the selection keeps
    # working if the set of supported distributions changes.
    required_wins = len(type_heavy) - 1
    best = ""
    if required_wins > 0:
        for name in type_heavy:
            if wins[name] == required_wins:
                best = name

    # Put results in a table for reference
    dist_results = pd.DataFrame.from_dict(dist_results)
    return best, dist_results

In [None]:
# Name of the best-fitting distribution for every library's contribution counts.
results = {}
for lib_name in lib_counts_by_contrib:
    best_fit, dist_results = get_fit(lib_counts_by_contrib[lib_name])
    results[lib_name] = best_fit

print(results)

In [None]:
# Library, best-fit distribution and total contribution count, one line per library.
for lib_name, best_name in results.items():
    total_contribs = sum(lib_counts_by_contrib[lib_name])
    print(lib_name, best_name, total_contribs)

In [None]:
# Number of contributors recorded for each library.
for lib_name in lib_counts_by_contrib:
    print(lib_name, len(lib_counts_by_contrib[lib_name]))

<a id = "section2"></a>

# Section 2: HyperNetwork Analysis

Because contributors and libraries form two disjoint sets, we first examine the network as a hypergraph, and then through bipartite graphs and their projections.


In [None]:
# Build the two incidence dictionaries used for the hypergraphs.

# hyper_contrib
#   key:   name of library
#   value: list of contributors
hyper_contrib = {lib: list(contribs.keys()) for lib, contribs in lib_producers.items()}

# H_star
#   key:   name of contributor
#   value: list of libraries
H_star = {}
for lib, contributors in hyper_contrib.items():
    for person in contributors:
        H_star.setdefault(person, []).append(lib)

In [None]:
# Build a hypergraph from each incidence dictionary and save both to file.
# hyper_contrib maps library -> contributors; H_star maps contributor -> libraries.
hyper_contrib2 = hnx.Hypergraph(hyper_contrib)
H_star_contrib = hnx.Hypergraph(H_star)

# NOTE(review): assumes the data/ directory already exists -- confirm.
with open("data/h_star_contrib.pkl", 'wb') as pkl_object:
    pickle.dump(H_star_contrib, pkl_object)
with open("data/hyper_contrib.pkl", 'wb') as pkl_object:
    pickle.dump(hyper_contrib2, pkl_object)

In [None]:
# Reload the two pickled hypergraphs from data/.
# NOTE: only unpickle files from a trusted source.
with open("data/h_star_contrib.pkl", 'rb') as pkl_object:
    H_star_hyper = pickle.load(pkl_object)
with open("data/hyper_contrib.pkl", 'rb') as pkl_object:
    hyper = pickle.load(pkl_object)

In [None]:
# Descriptive statistics report for the hypergraph loaded from hyper_contrib.pkl.
hyper_stats = hnx.reports.descriptive_stats.info(hyper)
hyper_stats

In [None]:
# s-betweenness centrality of the hypergraph, using the function's default arguments.
s_central = hnx.algorithms.s_centrality_measures.s_betweenness_centrality(hyper)
s_central

In [None]:
# Order the centrality scores from lowest to highest and keep that order in a dict.
b_sorted = sorted(s_central.items(), key=lambda item: item[1])
b_sorted_dict = collections.OrderedDict(b_sorted)

In [None]:
# Bar chart of the 14 entries with the highest s-centrality, highest first.
# pyplot and seaborn are already imported in the notebook's first cell, so they
# are not re-imported here.
print('Building graph...')

sns.set(style="darkgrid",rc={'figure.figsize':(10,10)}, font_scale = 2 )

# b_sorted_dict is ordered from lowest to highest, so take the last 14 and reverse.
x_plot = list(b_sorted_dict.keys())[-14:]
y_plot = list(b_sorted_dict.values())[-14:]
x_plot.reverse()
y_plot.reverse()

fig, ax = plt.subplots()
sns.barplot(x=x_plot, y=y_plot, ax=ax)
ax.set_title('Python Libraries by S-Centrality (Top 14)', fontweight = 'bold')
ax.set_xlabel('Library', fontweight='bold')
ax.set_ylabel('S Centrality', fontweight='bold')
# Rotate after the bars are drawn so the rotation applies to the real tick labels.
ax.tick_params(axis='x', labelrotation=90)
# Lay the figure out last, once every label is in place.
fig.tight_layout()

fig.savefig('hypercentralities_graph.png', dpi=300, bbox_inches='tight', pad_inches=0)

In [None]:
# Draw the hypergraph without node labels.
hnx.drawing.draw(hyper, with_node_labels = False)

In [None]:
# Collapsed-nodes version.
# hyper_contrib is the plain dict the hypergraph was built from and has no
# collapse_nodes method; collapse the Hypergraph object `hyper` instead.
hnx.drawing.draw(hyper.collapse_nodes(), with_node_labels=False)

<a id = "section3"></a>
##  Section 3: Bipartite and Projection 

In [6]:
# Get the top network contributors, defined using the Paretian binning strategy from
# 'Geospatial Analysis Requires a Different Way of Thinking' by Bin Jiang.
#
# Key: repository name
# Value: list of top contributors

def sep_bins(head_dict, history):
    """Split contributors around the mean, recursing into the head.

    Every entry of ``head_dict`` whose value is below the mean goes to the
    tail and the rest go to the head. The (head, tail) pair is appended to
    ``history``. While the head holds fewer than half as many entries as the
    tail, the head is split again; those deeper splits are only recorded in
    ``history``.

    Parameters
    ----------
    head_dict : dict
        Maps a name to a numeric weight.
    history : list
        Receives one (head, tail) tuple per split level; mutated in place.

    Returns
    -------
    head, tail : dict
        The split of ``head_dict`` itself (the first level, not the deepest).
    history : list
        The same list object that was passed in.
    """
    if not head_dict:
        # Nothing to split: record an empty level instead of averaging an empty list.
        history.append(({}, {}))
        return {}, {}, history

    avg = np.mean(list(head_dict.values()))
    head = {}
    tail = {}
    for k, v in head_dict.items():
        if v < avg:
            tail[k] = v
        else:
            head[k] = v

    history.append((head, tail))

    head_weight = sum(head.values())
    tail_weight = sum(tail.values())
    # An empty tail (e.g. all values equal, or a single entry) used to raise
    # ZeroDivisionError; treat it as "no further split possible".
    ratio_head_to_tail = len(head)/len(tail) if tail else float("inf")
    ratio_weight = tail_weight/head_weight if head_weight else float("nan")
    print("Ratio Head to tail: {} and Ratio Weight: {}".format(ratio_head_to_tail, ratio_weight))
    if ratio_head_to_tail < 0.5: #Note after 50% ratio of head to tail ratio weight starts to go down.
        sep_bins(head, history)
    return head, tail, history
print("\n\n")
# pratian:    one {library: first-level head} dict per library
# tails:      library -> full history of (head, tail) splits
# head_total: number of first-level head contributors summed over all libraries
pratian, tails, head_total = [], {}, 0

for lib_name, contribs in lib_producers.items():
    head, tail, history = sep_bins(contribs, [])
    pratian.append({lib_name: head})
    tails[lib_name] = history
    head_total = head_total + len(head)
    print (len(head), len(tail))
head_total




Ratio Head to tail: 0.1346153846153846 and Ratio Weight: 0.3881278538812785
Ratio Head to tail: 0.4 and Ratio Weight: 0.46
Ratio Head to tail: 1.0 and Ratio Weight: 0.6666666666666666
7 52
Ratio Head to tail: 0.09965635738831616 and Ratio Weight: 0.1062087859577586
Ratio Head to tail: 0.3384615384615385 and Ratio Weight: 0.2990812779341169
Ratio Head to tail: 0.375 and Ratio Weight: 0.7663742690058479
Ratio Head to tail: 1.0 and Ratio Weight: 0.37805983680870353
87 873
Ratio Head to tail: 0.07982261640798226 and Ratio Weight: 0.18463256506498743
Ratio Head to tail: 0.23076923076923078 and Ratio Weight: 0.2825542072806843
Ratio Head to tail: 0.35 and Ratio Weight: 0.47644501879699247
Ratio Head to tail: 0.4 and Ratio Weight: 0.9057427515952088
Ratio Head to tail: 1.0 and Ratio Weight: 0.48610880053235733
144 1804
Ratio Head to tail: 0.09937888198757763 and Ratio Weight: 0.12084302031739613
Ratio Head to tail: 0.2972972972972973 and Ratio Weight: 0.35808909328025257
Ratio Head to tail

768

In [None]:
# Persist the per-library split history (library -> list of (head, tail) pairs) as JSON.
with open("tails.json", "w") as file:
    json.dump(tails, file)

## 3A. Bipartite and Projection Graphs of Tail 1 


In [7]:

def make_bi(tails, up_tail):
    '''
    Build the library/contributor bipartite graph for one split level.

    tails:   dict mapping each library to its list of (head, tail) dicts,
             i.e. the tails dictionary created above
    up_tail: negative integer saying how far up the tail to go (-1 is the
             last level); a library with fewer levels falls back to its
             first level

    Returns the networkx graph, the list of library nodes (bipartite=0) and
    the list of contributor nodes (bipartite=1).
    '''
    bip_bin = nx.Graph()

    for lib, bins in tails.items():
        bip_bin.add_node(lib, bipartite=0)
        # Clamp the index so that short histories use their first entry.
        level = bins[max(up_tail, -len(bins))]
        for members in level:  # the head dict, then the tail dict
            for contributor in members.keys():
                bip_bin.add_node(contributor, bipartite=1)
                bip_bin.add_edge(lib, contributor)

    print("There are {} nodes in the bipartite graph".format(len(bip_bin.nodes)))

    # Split the nodes into the two sides for plotting.
    top, bottom = [], []
    for node in bip_bin.nodes():
        side = bip_bin.nodes[node]['bipartite']
        if side == 0:
            top.append(node)
        elif side == 1:
            bottom.append(node)

    return bip_bin, top, bottom

def make_proj(tails, up_tail):
    '''
    Build the contributor projection graph for one split level.

    For every library, all contributors found at the chosen level (head and
    tail together) are linked pairwise.

    tails:   dict mapping each library to its list of (head, tail) dicts,
             i.e. the tails dictionary created above
    up_tail: negative integer saying how far up the tail to go (-1 is the
             last level); a library with fewer levels falls back to its
             first level

    Returns the networkx graph object.
    '''
    proj_bin = nx.Graph()

    for bins in tails.values():
        # Clamp the index so that short histories use their first entry.
        level = bins[max(up_tail, -len(bins))]
        nodes = [name for members in level for name in members.keys()]
        proj_bin.add_nodes_from(nodes)
        proj_bin.add_edges_from(list(itertools.combinations(nodes, 2)))

    print("There are {} nodes in the projection graph".format(len(proj_bin.nodes)))

    return proj_bin


In [None]:
# Bipartite graph and contributor projection for the last split level of every library.
bip_1bin, top, bottom = make_bi(tails, -1)
proj_1bin = make_proj(tails, -1)

In [None]:
# Number of connected components in the tail-1 projection.
proj1_comps = list(nx.connected_components(proj_1bin))
len(proj1_comps)

In [None]:
# Degree assortativity coefficient of the tail-1 projection.
nx.degree_assortativity_coefficient(proj_1bin)

In [None]:
# Local efficiency of the tail-1 projection.
nx.local_efficiency(proj_1bin)

In [None]:
# Global efficiency of the tail-1 projection.
nx.global_efficiency(proj_1bin)

In [None]:
# Draw the tail-1 bipartite graph as two columns: contributors at x=1.20 and
# libraries at x=1.70.
# The multiplier spreads the library column over the same height as the contributors.
multiplier = len(bottom)/len(top)

sns.set(rc={'figure.figsize':(12,12)})
pos = {n: (1.20, i*50) for i, n in enumerate(bottom)}
pos.update({n: (1.70, i*multiplier*50) for i, n in enumerate(top)})

nx.draw(bip_1bin, pos=pos, with_labels=False)

plt.title("Bipartite of Tail 1", fontsize=20, fontweight="bold")
style = dict(size=15, color='blue', fontweight="bold")
# Column captions, placed at hard-coded data coordinates.
plt.text(1.18,3300, "Contributors", **style)
plt.text(1.68,3300, "Libraries", **style)


In [None]:
# Draw the tail-1 contributor projection with networkx's default layout.
nx.draw(proj_1bin)

## 3B. Bipartite and Projection Graphs of Tail 2 

In [None]:
# Bipartite graph and projection for the second-to-last split level
# (libraries with fewer levels fall back to their first level).
bip_2bin, top, bottom = make_bi(tails, -2)
proj_2bin = make_proj(tails, -2)

In [None]:
# Number of connected components in the tail-2 projection.
proj2_comps = list(nx.connected_components(proj_2bin))
len(proj2_comps)

In [None]:
# Degree assortativity coefficient of the tail-2 projection.
nx.degree_assortativity_coefficient(proj_2bin)

In [None]:
# Local efficiency of the tail-2 projection.
nx.local_efficiency(proj_2bin)

In [None]:
# Global efficiency of the tail-2 projection.
nx.global_efficiency(proj_2bin)

In [None]:
# Draw the tail-2 bipartite graph as two columns: contributors at x=1.20 and
# libraries at x=1.70.
node_side = {node: bip_2bin.nodes[node]['bipartite'] for node in bip_2bin.nodes()}
top = [node for node, side in node_side.items() if side == 0]       # libraries
bottom = [node for node, side in node_side.items() if side == 1]    # contributors
# The multiplier spreads the library column over the same height as the contributors.
multiplier = len(bottom)/len(top)

sns.set(rc={'figure.figsize':(12,12)})
pos = {n: (1.20, i*50) for i, n in enumerate(bottom)}
pos.update({n: (1.70, i*multiplier*50) for i, n in enumerate(top)})

nx.draw(bip_2bin, pos=pos, with_labels=False)

plt.title("Bipartites of Tail 2", fontsize=20, fontweight="bold")
# Text style kept for the column captions, which are currently not drawn.
style = dict(size=15, color='blue', fontweight="bold")


In [None]:
# Rebuild the tail-2 projection with the shared helper so that the graph drawn
# below is the same one analysed in the cells above. The previous hand-rolled
# loop disagreed with make_proj: it fell back to the last level for libraries
# with exactly two levels, and it linked head and tail contributors only within
# their own group rather than across the whole level.
proj_2bin = make_proj(tails, -2)

len(proj_2bin.nodes)

In [None]:
# Draw the tail-2 contributor projection with networkx's default layout.
nx.draw(proj_2bin)

## 3C. Bipartite and Projection Graphs of Tail 3

In [None]:
# Bipartite graph and projection for the third-to-last split level
# (libraries with fewer levels fall back to their first level).
bip_3bin, top, bottom = make_bi(tails, -3)
proj_3bin = make_proj(tails, -3)


In [None]:
# Number of connected components in the tail-3 projection.
proj3_comps = list(nx.connected_components(proj_3bin))
len(proj3_comps)

In [None]:
# Degree assortativity coefficient of the tail-3 projection.
nx.degree_assortativity_coefficient(proj_3bin)

In [None]:
# Local efficiency of the tail-3 projection.
nx.local_efficiency(proj_3bin)

In [None]:
# Global efficiency of the tail-3 projection.
nx.global_efficiency(proj_3bin)

In [None]:
# Draw the tail-3 bipartite graph as two columns: contributors at x=1.20 and
# libraries at x=1.70.
# The multiplier spreads the library column over the same height as the contributors.
multiplier = len(bottom)/len(top)

sns.set(rc={'figure.figsize':(12,12)})
pos = {n: (1.20, i*50) for i, n in enumerate(bottom)}
pos.update({n: (1.70, i*multiplier*50) for i, n in enumerate(top)})

nx.draw(bip_3bin, pos=pos, with_labels=False)

plt.title("Bipartites of Tail 3", fontsize=20, fontweight="bold")
# Text style kept for the column captions, which are currently not drawn.
style = dict(size=15, color='blue', fontweight="bold")


In [None]:
# Draw the tail-3 contributor projection with networkx's default layout.
nx.draw(proj_3bin)

## 3D. Bipartite and Projection Graphs of Tail 4

In [None]:
# Bipartite graph and projection for the fourth-to-last split level
# (libraries with fewer levels fall back to their first level).
bip_4bin, top, bottom = make_bi(tails, -4)
proj_4bin = make_proj(tails, -4)

In [None]:
# Number of connected components in the tail-4 projection.
proj4_comps = list(nx.connected_components(proj_4bin))
len(proj4_comps)

In [None]:
# Degree assortativity coefficient of the tail-4 projection.
nx.degree_assortativity_coefficient(proj_4bin)

In [None]:
# Local efficiency of the tail-4 projection.
nx.local_efficiency(proj_4bin)

In [None]:
# Global efficiency of the tail-4 projection.
nx.global_efficiency(proj_4bin)

In [None]:
# Draw the tail-4 contributor projection with networkx's default layout.
nx.draw(proj_4bin)

## 3E. Bipartite and Projection Graphs of Tail 5

In [None]:
# Bipartite graph and projection for the fifth-to-last split level
# (libraries with fewer levels fall back to their first level).
bip_5bin, top, bottom = make_bi(tails, -5)
proj_5bin = make_proj(tails, -5)

In [None]:
# Number of connected components in the tail-5 projection.
proj5_comps = list(nx.connected_components(proj_5bin))
len(proj5_comps)

In [None]:
# Degree assortativity coefficient of the tail-5 projection.
nx.degree_assortativity_coefficient(proj_5bin)

In [None]:
# Local efficiency of the tail-5 projection.
nx.local_efficiency(proj_5bin)

In [None]:
# Global efficiency of the tail-5 projection.
nx.global_efficiency(proj_5bin)

In [None]:
# Collect summary metrics of the contributor projection for tail levels 1 to 5.
# Each list receives one entry per level, in that order.
num_nodes, num_edges, num_components = [], [], []
assortativity, density = [], []
local_e, global_e = [], []

for run in range(1, 6):
    print("You are on run {}".format(run))
    proj_bin = make_proj(tails, -run)
    num_nodes.append(len(proj_bin.nodes))
    num_edges.append(len(proj_bin.edges))
    components = list(nx.connected_components(proj_bin))
    num_components.append(len(components))
    assortativity.append(nx.degree_assortativity_coefficient(proj_bin))
    density.append(nx.density(proj_bin))
    local_e.append(nx.local_efficiency(proj_bin))
    print("local complete")
    global_e.append(nx.global_efficiency(proj_bin))
    
    
    
### Collect metrics for all network

In [None]:
# Stand-alone metric collection for the tail-5 projection only.
# Only the node count, edge count and local efficiency are computed here; the
# remaining metrics are commented out and their lists stay empty.
num_nodes5 = []
num_edges5 = []
num_components5 = []
assortativity5 = []
density5 = []
local_e5 = []
global_e5 = []


#print("You are on run {}".format(i))
proj_bin = make_proj(tails, -5)
num_nodes5.append(len(proj_bin.nodes))
num_edges5.append(len(proj_bin.edges))
#num_components.append(len(list(nx.connected_components(proj_bin))))
#assortativity.append(nx.degree_assortativity_coefficient(proj_bin))
#density.append(nx.density(proj_bin))
local_e5.append(nx.local_efficiency(proj_bin))
print("local complete")
#global_e.append(nx.global_efficiency(proj_bin))
    
    
    
### Collect metrics for all network

There are 7383 nodes in the projection graph


In [None]:
# Show the local efficiency computed for the tail-5 projection.
local_e5

In [None]:
# Transitivity, per-node triangle counts and average clustering of the
# contributor projection for tail levels 1 to 5.
transitivity, triangles, clustering = [], [], []
coms_sum, coms_exp = [], []

for run in range(1, 6):
    print("You are on run {}".format(run))
    proj_bin = make_proj(tails, -run)
    transitivity.append(nx.transitivity(proj_bin))
    print("trans complete")
    triangles.append(nx.triangles(proj_bin))
    print("triangles complete")
    clustering.append(nx.average_clustering(proj_bin))
    print("clustering complete")

In [None]:
# Show the transitivity and average-clustering lists side by side.
transitivity,clustering,

In [None]:
# Communicability of the contributor projection for tail levels 1 to 5.
# It is only computed when the projection is a single connected component;
# otherwise the string "disconnected" is stored in its place.
coms_sum = []
coms_exp = []

for run in range(1, 6):
    print("You are on run {}".format(run))
    proj_bin = make_proj(tails, -run)
    components = list(nx.connected_components(proj_bin))
    if len(components) != 1:
        coms_sum.append("disconnected")
        coms_exp.append("disconnected")
        continue
    print("coms strting")
    coms_sum.append(nx.communicability(proj_bin))
    print("coms mid")
    coms_exp.append(nx.communicability_exp(proj_bin))
    print("coms ending")

In [None]:
# Gather the projection-graph metric lists into one dictionary and save it as JSON.
# json is already imported in the notebook's first cell, so it is not re-imported here.
project_dict = {"number of nodes" : num_nodes, "number of edges": num_edges, "number of components": num_components, "assortativity" : assortativity, "density": density,
                "local efficiency":local_e, "global efficiency" : global_e}


with open("projection_graph.json", "w") as file:
    json.dump(project_dict, file)

In [None]:
# Show the collected projection-graph metrics.
project_dict

In [None]:
# Append the tail-5 metrics to the shared metric lists (local efficiency is not appended).
# NOTE(review): the metric loop over range(1,6) already covers level 5, so on a clean
# Restart & Run All this adds a second level-5 entry to every list except local_e.
# Presumably this cell was meant for a session where that loop stopped early -- confirm.
proj_bin = make_proj(tails, -5)
num_nodes.append(len(nx.nodes(proj_bin)))
num_edges.append(len(proj_bin.edges))
num_components.append(len(list(nx.connected_components(proj_bin))))
assortativity.append(nx.degree_assortativity_coefficient(proj_bin))
density.append(nx.density(proj_bin))
global_e.append(nx.global_efficiency(proj_bin))

In [None]:
# Rebuild the metrics dictionary from the metric lists and display it.
project_dict = {"number of nodes" : num_nodes, "number of edges": num_edges, "number of components": num_components, "assortativity" : assortativity, "density": density,
                "local efficiency":local_e, "global efficiency" : global_e}
project_dict

In [None]:
# Edges-per-node and nodes-per-edge ratios for the first five entries.
node_counts = project_dict["number of nodes"]
edge_counts = project_dict["number of edges"]
for idx in range(5):
    n_nodes, n_edges = node_counts[idx], edge_counts[idx]
    print(n_edges/n_nodes)
    print(n_nodes/n_edges)

In [None]:
sns.set_theme(style="darkgrid")

# Number of edges against number of nodes across the tail levels.
# x and y are passed by keyword: in recent seaborn releases the first positional
# argument of relplot is `data`, so positional x/y raise a TypeError.
sns.relplot(x=project_dict["number of nodes"], y=project_dict["number of edges"], kind='line')

In [None]:
# Density against global efficiency across the tail levels.
# x and y are passed by keyword: in recent seaborn releases the first positional
# argument of relplot is `data`, so positional x/y raise a TypeError.
sns.relplot(x=project_dict["global efficiency"], y=project_dict["density"])

### NO LONGER USED ------Projection by magnitude

In [None]:
# Create the list of number-one contributors, one [username, count, library]
# entry per library. Ties go to the contributor seen first; a library with no
# positive count yields ["", 0, library].
most_list = []
for lib_name, contribs in lib_producers.items():
    top_name, top_count = max(contribs.items(), key=operator.itemgetter(1), default=("", 0))
    if not top_count > 0:
        top_name, top_count = "", 0
    most_list.append([top_name, top_count, lib_name])
most_list

In [None]:
# Dictionary of head contributors per library.
#   key:   library
#   value: list of contributors
pratian_contribs = {}
for entry in pratian:
    lib_name = next(iter(entry.keys()))  # each entry holds a single library
    pratian_contribs[lib_name] = list(entry[lib_name].keys())
pratian_contribs.keys()

In [None]:
# Create the list of number-one contributors, one [username, count, library]
# entry per library (same computation as the earlier most_list cell).
most_list = []
for lib_name, contribs in lib_producers.items():
    leader = ["", 0, lib_name]
    for user, n_contribs in contribs.items():
        # Strict comparison: on ties the contributor seen first is kept.
        if n_contribs > leader[1]:
            leader[:2] = [user, n_contribs]
    most_list.append(leader)
most_list

In [None]:
# NOTE(review): top_contribs is not defined in any cell visible in this notebook;
# on a fresh kernel this raises NameError -- confirm where it should come from.
top_contribs['cpython']

In [None]:
# Bipartite graph linking each library (bipartite=0) to its top contributors
# (bipartite=1). The first element of every top_contribs entry is used as the node.
bip_most = nx.Graph()

for lib_name, contributors in top_contribs.items():
    bip_most.add_node(lib_name, bipartite=0)
    for entry in contributors:
        person = entry[0]
        bip_most.add_node(person, bipartite=1)
        bip_most.add_edge(lib_name, person)

len(bip_most.nodes)

In [None]:
# Draw the top-contributor bipartite graph as two columns: contributors at
# x=1.20 and libraries at x=1.70.
node_side = {node: bip_most.nodes[node]['bipartite'] for node in bip_most.nodes()}
top = [node for node, side in node_side.items() if side == 0]       # libraries
bottom = [node for node, side in node_side.items() if side == 1]    # contributors
# The multiplier spreads the library column over the same height as the contributors.
multiplier = len(bottom)/len(top)

sns.set(rc={'figure.figsize':(12,12)})
pos = {n: (1.20, i*50) for i, n in enumerate(bottom)}
pos.update({n: (1.70, i*multiplier*50) for i, n in enumerate(top)})

nx.draw(bip_most, pos=pos, with_labels=False)

plt.title("Major Contributors to the Top Networks", fontsize=20, fontweight="bold")
style = dict(size=15, color='blue', fontweight="bold")
# Column captions, placed at hard-coded data coordinates.
plt.text(1.18,2100, "Contributors", **style)
plt.text(1.68,2100, "Libraries", **style)


In [None]:
# Contributor projection of the top contributors: everyone listed for the same
# library is linked pairwise.
proj_most = nx.Graph()

for contributors in top_contribs.values():
    # First element of every entry, i.e. the same value used as the node elsewhere.
    nodes = list(zip(*contributors))[0]
    proj_most.add_nodes_from(nodes)
    proj_most.add_edges_from(list(itertools.combinations(nodes, 2)))

proj_most.nodes

In [None]:
# Draw the top-contributor projection with node labels.
nx.draw(proj_most, with_labels=True)