In [1]:
from collections import defaultdict, Counter
import json
import networkx as nx
import numpy as np
import os
import pandas as pd
import sys

from utils.validate_weights import validate_weights
from utils.serialize_graph import convert_graph_to_serializable

# Load the unweighted graph

In [22]:
# Read the node-link JSON export and rebuild the NetworkX graph from it.
graph_path = "./graph/unweighted_graph_pruned_with_metadata.json"
with open(graph_path, "r") as graph_file:
    graph_data = json.load(graph_file)

# The edge list is looked up under the "links" key of the loaded document.
G = nx.node_link_graph(graph_data, edges="links")

# Node identifiers of the graph (the attribute dumps printed later show repo URLs).
repo_urls = list(G.nodes)

print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())

Nodes: 255
Edges: 444


In [24]:
# Baseline check on the freshly loaded graph. validate_weights is a project helper
# (utils.validate_weights); presumably it returns the summed outgoing edge weight
# for each level-1 node — confirm against the helper's source.
validate_weights(G)

{'https://github.com/prysmaticlabs/prysm': 0.0,
 'https://github.com/ethereum/go-ethereum': 0.0,
 'https://github.com/sigp/lighthouse': 0.0,
 'https://github.com/consensys/teku': 0.0,
 'https://github.com/status-im/nimbus-eth2': 0.0,
 'https://github.com/chainsafe/lodestar': 0.0,
 'https://github.com/ethereumjs/ethereumjs-monorepo': 0.0,
 'https://github.com/grandinetech/grandine': 0.0,
 'https://github.com/erigontech/erigon': 0.0,
 'https://github.com/paradigmxyz/reth': 0.0,
 'https://github.com/ethereum/solidity': 0.0,
 'https://github.com/ethereum/remix-project': 0.0,
 'https://github.com/vyperlang/vyper': 0.0,
 'https://github.com/ethereum/web3.py': 0.0,
 'https://github.com/ethereum/py-evm': 0.0,
 'https://github.com/eth-infinitism/account-abstraction': 0.0,
 'https://github.com/safe-global/safe-smart-account': 0.0,
 'https://github.com/web3/web3.js': 0.0}

In [23]:
# Tally how often each 'relation' label occurs on the edges; edges that carry
# no 'relation' attribute are left out of the count.
relations = [
    edge_attrs['relation']
    for *_endpoints, edge_attrs in G.edges(data=True)
    if 'relation' in edge_attrs
]
pkg_relation_counts = Counter(relations)
pkg_relation_counts

Counter()

# Grab some repo-level and historical funding data

- Subscribe to the OSO Production dataset on BigQuery (see docs [here](https://docs.opensource.observer/docs/get-started/bigquery))
- Enter the following query into your [console](https://console.cloud.google.com/bigquery) to get a fresh copy of the repo-level and funding stats
- Save it as a CSV file to `datasets/oso/repo_and_funding_stats.csv`

In [5]:
def stringify_array(arr):
    """Render strings as a comma-separated list of single-quoted SQL literals.

    The result is meant to be placed inside an ``IN (...)`` clause of the
    BigQuery query built in this cell.

    Args:
        arr: Iterable of ``str`` values (e.g. repo URLs).

    Returns:
        A string such as ``'a','b'``. Backslashes and single quotes inside an
        element are backslash-escaped so that a value can neither terminate
        its literal early nor inject SQL. An empty input yields ``''`` (one
        empty literal), which keeps ``IN ('')`` syntactically valid.
    """
    literals = []
    for item in arr:
        # Escape the backslash first so the escapes added for quotes are not doubled.
        escaped = item.replace("\\", "\\\\").replace("'", "\\'")
        literals.append(f"'{escaped}'")
    return ",".join(literals) if literals else "''"

# SQL text to paste into the BigQuery console. This cell only prints it; nothing
# here executes the query. The f-string currently contains no placeholders.
# The query combines three CTEs:
#   repos        - repository metadata (stars, forks, language, license, dates)
#   gitcoin      - Gitcoin funding totals, distinct donors and rounds per project
#   retrofunding - Optimism funding from pools whose name contains 'retro', per project
# and left-joins the two funding CTEs onto repos by project id, defaulting
# missing funding values to 0.
query = f"""
    -- COPY THIS INTO YOUR BIGQUERY CONSOLE

    with repos as (
      select
        project_id as oso_project_id,
        artifact_url as url,
        is_fork,
        star_count,
        fork_count,
        license_name,
        language,
        created_at,
        updated_at
      from `oso_production.repositories_v0`
    ),

    gitcoin as (
      select
        oso_project_id,
        sum(amount_in_usd) as gitcoin_grants_usd,
        count(distinct donor_address) as unique_donors,
        count(distinct round_number) as num_rounds
      from `oso_production.gitcoin_funding_events_by_project_v0`
      where oso_project_id is not null and donor_address is not null
      group by oso_project_id
    ),

    retrofunding as (
      select
        p.project_id,
        sum(f.amount) as retro_funding_usd,
        count(distinct f.grant_pool_name) as num_retro_funding_rounds
      from `static_data_sources.oss_funding_v1` f -- see: https://github.com/opensource-observer/oss-funding
      join `oso_production.projects_v1` p
        on f.to_project_name = p.project_name
      where
        f.grant_pool_name like '%retro%'
        and f.from_funder_name = 'optimism'
      group by p.project_id
    )

    select
      repos.url,
      repos.star_count,
      repos.fork_count,
      repos.is_fork,
      repos.language,
      repos.license_name,
      repos.created_at,
      repos.updated_at,
      coalesce(gitcoin.gitcoin_grants_usd, 0) as gitcoin_grants_usd,
      coalesce(gitcoin.unique_donors, 0) as gitcoin_unique_donors,
      coalesce(gitcoin.num_rounds, 0) as gitcoin_num_rounds,
      coalesce(retrofunding.retro_funding_usd, 0) as retro_funding_usd,
      coalesce(retrofunding.num_retro_funding_rounds, 0) as num_retro_funding_rounds,
      repos.oso_project_id
    from repos
    left join gitcoin
      on repos.oso_project_id = gitcoin.oso_project_id
    left join retrofunding
      on repos.oso_project_id = retrofunding.project_id
    
    -- Add the list of relevant repo_urls as a where clause
"""
# Optional filter that would restrict the result to the graph's nodes. It is
# disabled, so the printed query has no where clause on url.
#query += f"    where url in ({stringify_array(repo_urls)})"
print(query)


    -- COPY THIS INTO YOUR BIGQUERY CONSOLE

    with repos as (
      select
        project_id as oso_project_id,
        artifact_url as url,
        is_fork,
        star_count,
        fork_count,
        license_name,
        language,
        created_at,
        updated_at
      from `oso_production.repositories_v0`
    ),

    gitcoin as (
      select
        oso_project_id,
        sum(amount_in_usd) as gitcoin_grants_usd,
        count(distinct donor_address) as unique_donors,
        count(distinct round_number) as num_rounds
      from `oso_production.gitcoin_funding_events_by_project_v0`
      where oso_project_id is not null and donor_address is not null
      group by oso_project_id
    ),

    retrofunding as (
      select
        p.project_id,
        sum(f.amount) as retro_funding_usd,
        count(distinct f.grant_pool_name) as num_retro_funding_rounds
      from `static_data_sources.oss_funding_v1` f -- see: https://github.com/opensource-observer/oss-funding
   

In [25]:
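# Load the repo/funding stats CSV and replace each repo's funding figures with the
# mean over all repos that share its oso_project_id.
# NOTE: this cell is currently commented out. The weighting cell further down reads
# `gitcoin_grants_usd`, `retro_funding_usd` and `fork_count`, which come from this CSV.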
# metrics_df = pd.read_csv('./datasets/oso/repo_and_funding_stats.csv', index_col=0)

# funding_averages = (
#     metrics_df
#     .groupby('oso_project_id')
#     [['gitcoin_grants_usd', 'retro_funding_usd']]
#     .transform('mean')
# )
# metrics_df['gitcoin_grants_usd'] = funding_averages['gitcoin_grants_usd']
# metrics_df['retro_funding_usd'] = funding_averages['retro_funding_usd']

# # print("Metric totals:")
# # for c in metrics_df.columns:
# #     if metrics_df[c].dtype != 'O':
# #         print(f"- {c}: {metrics_df[c].sum():,.0f}")

# metrics_df

In [7]:
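# Attach every metrics_df column to the matching graph node as an attribute
# (0 for nodes that are missing from the CSV), then print one sample node.
# NOTE: this cell is currently commented out, so the output shown below it is
# from an earlier run.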

# for node in G.nodes():
#     if node in metrics_df.index:
#         for col in metrics_df.columns:
#             G.nodes[node][col] = metrics_df.at[node, col]
#     else:
#         for col in metrics_df.columns:
#             G.nodes[node][col] = 0

# sample_node = list(G.nodes())[4]
# attrs = G.nodes[sample_node]
# print("Node attributes for", sample_node)
# for k,v in attrs.items():
#     print(f"-{k}: {v}")

Node attributes for https://github.com/chainsafe/lodestar
-level: 1
-star_count: 1218
-fork_count: 306
-is_fork: False
-language: TypeScript
-license_name: Apache License 2.0
-created_at: 2018-06-22 14:41:47.000000 UTC
-updated_at: 2024-12-14 21:35:37.000000 UTC
-gitcoin_grants_usd: 5588.859293819911
-gitcoin_unique_donors: 803
-gitcoin_num_rounds: 6
-retro_funding_usd: 1163079.6939814093
-num_retro_funding_rounds: 3
-oso_project_id: imOWe-ffDazpMiTjIJWOp-siTuTAeO6jJ9v-DrUyvXM=


# AI Agent to weight the graph

In [29]:
from deepfunding import run_comparison

# Trial run of the agent-based comparison: only the first level-1 (seed) node
# is looked at, and only its first two dependencies are compared. Turn the
# pair selection below into a loop over all pairs to weight the whole graph.
seed_node = next(
    (node for node, attrs in G.nodes(data=True) if attrs.get('level') == 1),
    None,
)

if seed_node is not None:
    print(f"Weighting all pairs of dependencies for Seed node: {seed_node}")
    dependencies = list(G.successors(seed_node))

    if len(dependencies) > 1:
        dep1, dep2 = dependencies[0], dependencies[1]
        print(f"Comparing {dep1} and {dep2}")
        # run_comparison is handed the node attribute dicts, so store each
        # node's identifier on its dict under 'url' first.
        for dep in (dep1, dep2):
            G.nodes[dep]['url'] = dep
        print(G.nodes[dep1])
        print(G.nodes[dep2])
        run_comparison(G.nodes[dep1], G.nodes[dep2])


Weighting all pairs of dependencies for Seed node: https://github.com/prysmaticlabs/prysm
Comparing https://github.com/ethereum/go-ethereum and https://github.com/ipfs/go-cid
{'level': 1, 'language': 'Go', 'status': 'indexed', 'isFork': False, 'createdAt': '2013-12-26', 'updatedAt': '2024-12-29', 'starCount': 47988, 'forkCount': 20343, 'numPackages': 1, 'numDependentsInOso': 238, 'listOfFunders': ['Gitcoin', 'Optimism'], 'totalFundingUsd': 2657310.811802441, 'totalFundingUsdSince2023': 2496187.621307001, 'osoDependencyRank': 0.38474672737620946, 'numReposInSameLanguage': 325, 'osoDependencyRankForLanguage': 0.9629629629629629, 'url': 'https://github.com/ethereum/go-ethereum'}
{'level': 2, 'language': 'Go', 'status': 'indexed', 'isFork': False, 'createdAt': '2016-08-23', 'updatedAt': '2024-12-12', 'starCount': 157, 'forkCount': 47, 'numPackages': 1, 'numDependentsInOso': 165, 'listOfFunders': ['Optimism', 'Gitcoin'], 'totalFundingUsd': 791769.407770362, 'totalFundingUsdSince2023': 79176

Error during comparison: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT


Raw event: {'validator': {'messages': [HumanMessage(content='{"validation": "The analysis provided by the Project Analyzer is comprehensive and well-supported with quantitative metrics, particularly in user feedback and community engagement. However, the analysis could further elaborate on how the metrics directly influence the weight distribution. The Funding Strategist\'s analysis is also strong, providing relevant insights into funding and community support, though it lacks specific metrics to substantiate claims about the number of stars and forks. The Community Advocate adds valuable qualitative insights but does not introduce new metrics to strengthen the justification of the weight assignment. Therefore, the Project Analyzer and Funding Strategist need to ensure they provide more explicit connections between metrics and weight decisions. The Community Advocate should focus on incorporating more quantitative data to support the qualitative aspects discussed.", "revision_needed": 

GraphRecursionError: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT

# Apply a basic weighting algorithm

In [8]:
# Basic weighting: for every level-1 (seed) node, score each dependency from its
# Gitcoin funding, retro funding and fork count, then scale the scores so that
# the seed's outgoing edge weights sum to at most `total_weight_cap` with no
# single edge above `max_edge_weight`.
total_weight_cap = 0.8  # total weight a seed hands out across all dependency edges
max_edge_weight = 0.2   # ceiling for any single seed -> dependency edge

# Seeds written in one of these languages give weight only to dependencies in
# the same language; seeds in any other language score all dependencies.
strict_languages = ['JavaScript', 'TypeScript', 'Python', 'Rust', 'Go']

# The scores rely on node attributes that the (currently commented-out) metrics
# cells attach. Without them every score is 0, so say so instead of silently
# producing an all-zero graph.
absent_attrs = [
    attr
    for attr in ('gitcoin_grants_usd', 'retro_funding_usd', 'fork_count')
    if not nx.get_node_attributes(G, attr)
]
if absent_attrs:
    print(f"WARNING: no node carries {absent_attrs}; the matching score terms will be 0.")

max_gitcoin = max(nx.get_node_attributes(G, 'gitcoin_grants_usd').values(), default=1)
max_retro = max(nx.get_node_attributes(G, 'retro_funding_usd').values(), default=1)
max_forks = max(nx.get_node_attributes(G, 'fork_count').values(), default=1)


def safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    `default=1` on the maxima above only covers a missing attribute; a real
    maximum of 0 (or a fork maximum of 1, whose log is 0) would otherwise
    divide by zero.
    """
    return numerator / denominator if denominator else 0.0


def raw_dependency_weight(dep_attrs, gitcoin_max, retro_max, forks_max):
    """Score one dependency from its node attributes.

    The score is 0.4 * (gitcoin / max) + 0.4 * (retro / max)
    + 0.2 * (log(forks) / log(max forks)); missing attributes count as 0.
    """
    gitcoin = safe_ratio(dep_attrs.get('gitcoin_grants_usd', 0), gitcoin_max)
    retro = safe_ratio(dep_attrs.get('retro_funding_usd', 0), retro_max)
    forks = dep_attrs.get('fork_count', 0)
    if forks:  # log(0) is undefined, so a zero fork count contributes 0
        forks = safe_ratio(np.log(forks), np.log(forks_max))
    return 0.4 * gitcoin + 0.4 * retro + 0.2 * forks


def cap_and_redistribute(weights, total, cap):
    """Cap each weight at `cap` and hand the cut-off mass to the uncapped ones.

    The leftover (`total` minus the current sum) is split among the weights
    that are still below `cap`, proportionally to their size. Because that can
    push further weights over the cap, the step is repeated until the leftover
    is used up or nothing can grow any more. Zero weights stay zero.

    Returns a new list in the same order as `weights`.
    """
    capped = [min(w, cap) for w in weights]
    # Each pass either uses up the leftover or caps at least one more weight,
    # so len(capped) + 1 passes are always enough.
    for _ in range(len(capped) + 1):
        remaining = total - sum(capped)
        growable_total = sum(w for w in capped if w < cap)
        if remaining <= 1e-12 or growable_total <= 0:
            break
        capped = [
            min(w + (w / growable_total) * remaining, cap) if w < cap else w
            for w in capped
        ]
    return capped


for seed_node, node_data in G.nodes(data=True):
    if node_data.get('level') != 1:
        continue

    dependencies = list(G.successors(seed_node))
    seed_language = node_data.get('language', None)
    same_language_only = seed_language in strict_languages

    # Step 1: raw score per dependency (0 when the language check rules it out)
    raw_weights = []
    for dep in dependencies:
        dep_attrs = G.nodes[dep]
        if same_language_only and dep_attrs.get('language', None) != seed_language:
            raw_weights.append(0.0)
        else:
            raw_weights.append(
                raw_dependency_weight(dep_attrs, max_gitcoin, max_retro, max_forks)
            )

    total_raw_weight = sum(raw_weights)

    # Step 2: scale the scores so they sum to total_weight_cap
    if total_raw_weight > 0:
        normalized_weights = [(w / total_raw_weight) * total_weight_cap for w in raw_weights]
    else:
        normalized_weights = [0.0] * len(raw_weights)

    # Steps 3-4: enforce the per-edge cap and redistribute what was cut off
    final_weights = cap_and_redistribute(normalized_weights, total_weight_cap, max_edge_weight)

    # Step 5: write the weights onto the edges (every dep is a successor of
    # seed_node, so the edge is known to exist)
    for dep, weight in zip(dependencies, final_weights):
        G[seed_node][dep]['weight'] = weight

In [9]:
# For every node with weighted outgoing edges, print one minus the sum of
# those weights, followed by its ten heaviest dependencies.
for seed_node in G.nodes():
    weighted_deps = [
        (dep, edge_attrs['weight'])
        for dep, edge_attrs in G[seed_node].items()
        if 'weight' in edge_attrs
    ]
    if not weighted_deps:
        continue

    ranked = sorted(weighted_deps, key=lambda pair: pair[1], reverse=True)
    distributed = sum(weight for _, weight in ranked)
    print(f"\nSeed Node: {seed_node} | Weight: {1 - distributed:.4f}")
    for dep, weight in ranked[:10]:
        print(f"- Dependent: {dep} | Weight: {weight:.4f}")


Seed Node: https://github.com/prysmaticlabs/prysm | Weight: 0.2000
- Dependent: https://github.com/ethereum/go-ethereum | Weight: 0.0664
- Dependent: https://github.com/libp2p/go-libp2p | Weight: 0.0375
- Dependent: https://github.com/libp2p/go-libp2p-pubsub | Weight: 0.0344
- Dependent: https://github.com/libp2p/go-reuseport | Weight: 0.0335
- Dependent: https://github.com/prysmaticlabs/go-bitfield | Weight: 0.0314
- Dependent: https://github.com/libp2p/go-yamux | Weight: 0.0309
- Dependent: https://github.com/prysmaticlabs/fastssz | Weight: 0.0309
- Dependent: https://github.com/libp2p/go-netroute | Weight: 0.0306
- Dependent: https://github.com/libp2p/go-mplex | Weight: 0.0306
- Dependent: https://github.com/libp2p/go-buffer-pool | Weight: 0.0302

Seed Node: https://github.com/sigp/lighthouse | Weight: 0.2000
- Dependent: https://github.com/libp2p/rust-libp2p | Weight: 0.0266
- Dependent: https://github.com/alloy-rs/alloy | Weight: 0.0097
- Dependent: https://github.com/tokio-rs/to

# Export the graph to JSON

In [10]:
# Run the project's validation helper again, this time on the weighted graph.
print("\nLevel 1 Nodes (Sources) and Their Summed Edge Weights:")
source_weights = validate_weights(G)
# Wrap the helper's result in a Series for a compact display as the cell output.
pd.Series(source_weights)


Level 1 Nodes (Sources) and Their Summed Edge Weights:


https://github.com/prysmaticlabs/prysm                   0.8
https://github.com/sigp/lighthouse                       0.8
https://github.com/consensys/teku                        0.8
https://github.com/status-im/nimbus-eth2                 0.8
https://github.com/chainsafe/lodestar                    0.8
https://github.com/grandinetech/grandine                 0.8
https://github.com/ethereum/go-ethereum                  0.8
https://github.com/erigontech/erigon                     0.8
https://github.com/paradigmxyz/reth                      0.8
https://github.com/ethereum/solidity                     0.0
https://github.com/ethereum/remix-project                0.8
https://github.com/vyperlang/vyper                       0.0
https://github.com/ethereum/web3.py                      0.8
https://github.com/ethereum/py-evm                       0.2
https://github.com/eth-infinitism/account-abstraction    0.8
https://github.com/safe-global/safe-smart-account        0.8
https://github.com/web3/

In [11]:
# Write the weighted graph out as node-link JSON.
# convert_graph_to_serializable is a project helper (utils.serialize_graph);
# presumably it converts attribute values that json cannot encode — confirm there.
G_serializable = convert_graph_to_serializable(G)
# Name the edge key explicitly so the export always uses the "links" layout that
# this notebook's loader reads (edges="links"), whatever the NetworkX default is.
graph_json = nx.node_link_data(G_serializable, edges="links")
# NOTE(review): the input graph is read from ./graph/ while this writes to
# ../../graph/ — confirm which working directory the notebook is meant to run from.
output_path = "../../graph/weighting_examples/oso_forks_and_funding_weighted_graph.json"
with open(output_path, "w") as f:
    json.dump(graph_json, f, indent=2)