In [46]:
import os
import sys
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import requests
import torch
from github import Github
from sklearn.metrics.pairwise import cosine_similarity
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv

In [None]:
GITHUB_API_URL = "https://api.github.com"

# SECURITY: never hardcode a personal access token in a notebook. The token is
# read from the environment instead; any token that was ever committed with
# this file must be treated as leaked and revoked on GitHub.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    raise RuntimeError(
        "Set the GITHUB_TOKEN environment variable to a GitHub personal "
        "access token before running this notebook."
    )

# Headers for direct REST calls against GITHUB_API_URL.
headers = {
    "Authorization": f"token {GITHUB_TOKEN}",
    "Accept": "application/vnd.github.v3+json"
}

# Seed repository of the interest graph.
USER = 'kyleskom'
REPO = 'NBA-Machine-Learning-Sports-Betting'

client = Github(GITHUB_TOKEN, per_page=100)
user = client.get_user(USER)
repo = user.get_repo(REPO)

# Materialise the paginated result once so later cells can iterate it repeatedly.
stargazers = list(repo.get_stargazers())
print("Number of stargazers", len(stargazers))

In [3]:
# Build the initial interest graph: one node for the seed repository and one
# node per stargazer, each stargazer linked to the repository by a "gazes"
# edge. The "(repo)" / "(user)" suffixes keep a user and a repo with the same
# name from sharing a node key.
g = nx.DiGraph()
seed_repo_node = repo.name + '(repo)'
g.add_node(seed_repo_node, type='repo', lang=repo.language, owner=user.login)

for stargazer in stargazers:
    stargazer_node = stargazer.login + '(user)'
    g.add_node(stargazer_node, type='user')
    g.add_edge(stargazer_node, seed_repo_node, type='gazes')

In [None]:

# For every stargazer, add a "follows" edge from each of their followers who is
# already a node in the graph. A failure for one stargazer is reported on
# stderr and that stargazer is skipped.
for processed, stargazer in enumerate(stargazers, start=1):

    try:
        for follower in stargazer.get_followers():
            follower_node = follower.login + '(user)'
            if follower_node in g:
                g.add_edge(follower_node, stargazer.login + '(user)', type='follows')
    except Exception as err:  # e.g. ssl.SSLError
        print("Encountered an error fetching followers for", stargazer.login,
              "Skipping.", file=sys.stderr)
        print(err, file=sys.stderr)

    print("Processed", processed, " stargazers. Num nodes/edges in graph",
          g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit remaining", client.rate_limiting)

In [7]:
# Tally how many "follows" edges point at each node, then keep the nodes
# that are followed more than once (most followed first).
followed_nodes = (
    target
    for _, target, attrs in g.edges(data=True)
    if attrs['type'] == 'follows'
)
c = Counter(followed_nodes)
popular_users = [entry for entry in c.most_common() if entry[1] > 1]
print("Number of popular users", len(popular_users))
print("Top 10 popular users:", popular_users[:10])

Number of popular users 13
Top 10 popular users: [('llSourcell(user)', 59), ('nkgilley(user)', 9), ('eddwebster(user)', 8), ('gingfacekillah(user)', 7), ('nealmick(user)', 6), ('cryptocoinserver(user)', 2), ('markjoeljimenez(user)', 2), ('JaySCarter(user)', 2), ('robbiejdunne(user)', 2), ('ParthS28(user)', 2)]


In [8]:
# Work on a copy with the seed repository removed: every stargazer points at
# it, so it is a supernode that would dominate the centrality measures.
h = g.copy()
h.remove_node('NBA-Machine-Learning-Sports-Betting(repo)')

# Each ranking is a list of (node, score) pairs, highest score first.
dc = sorted(nx.degree_centrality(h).items(), key=lambda kv: kv[1], reverse=True)
print("Degree Centrality", dc[:10], "", sep="\n")

bc = sorted(nx.betweenness_centrality(h).items(), key=lambda kv: kv[1], reverse=True)
print("Betweenness Centrality", bc[:10], "", sep="\n")

print("Closeness Centrality")
cc = sorted(nx.closeness_centrality(h).items(), key=lambda kv: kv[1], reverse=True)
print(cc[:10])

Degree Centrality
[('llSourcell(user)', 0.05427782888684453), ('eddwebster(user)', 0.010119595216191352), ('nkgilley(user)', 0.00827966881324747), ('gingfacekillah(user)', 0.006439742410303588), ('nealmick(user)', 0.005519779208831647), ('fly51fly(user)', 0.004599816007359705), ('eddwebsterlcfc(user)', 0.0036798528058877645), ('JaySCarter(user)', 0.0027598896044158236), ('robbiejdunne(user)', 0.0027598896044158236), ('ParthS28(user)', 0.0027598896044158236)]

Betweenness Centrality
[('eddwebster(user)', 1.5248008864175819e-05), ('KyleWilliamz(user)', 8.471116035653233e-07), ('Oldhuntor(user)', 8.471116035653233e-07), ('HarperGrieve(user)', 0.0), ('cryptocoinserver(user)', 0.0), ('tomdicato(user)', 0.0), ('afzongho(user)', 0.0), ('munozjoseph-sportsBI(user)', 0.0), ('bmckelvey11(user)', 0.0), ('rgl77(user)', 0.0)]

Closeness Centrality
[('llSourcell(user)', 0.05429291025080308), ('nkgilley(user)', 0.00827966881324747), ('eddwebster(user)', 0.007359705611775529), ('gingfacekillah(user)',

In [None]:
# Upper bound on how many starred repositories are taken per stargazer.
MAX_REPOS = 50

# Add each stargazer's starred repositories as repo nodes with a "gazes" edge
# from the stargazer.
for i, sg in enumerate(stargazers):
    print(sg.login)
    try:
        for starred in sg.get_starred()[:MAX_REPOS]:  # Slice to avoid supernodes
            starred_node = starred.name + '(repo)'
            g.add_node(starred_node, type='repo', lang=starred.language,
                       owner=starred.owner.login)
            g.add_edge(sg.login + '(user)', starred_node, type='gazes')
    except Exception as e:  # e.g. ssl.SSLError
        # Report the cause instead of discarding it, on stderr like the
        # followers loop, so a skipped user can be diagnosed afterwards.
        print("Encountered an error fetching starred repos for", sg.login,
              "Skipping.", file=sys.stderr)
        print(e, file=sys.stderr)

    print("Processed", i+1, "stargazers' starred repos")
    print("Num nodes/edges in graph", g.number_of_nodes(), "/", g.number_of_edges())
    print("Rate limit", client.rate_limiting)

In [12]:
# Collect every repository node currently in the graph.
repos = [node for node, attrs in g.nodes(data=True) if attrs['type'] == 'repo']

# Rank repositories by in-degree (number of edges pointing at them).
print("Popular repositories")
repo_in_degrees = [
    (node, degree)
    for node, degree in g.in_degree()
    if g.nodes[node]['type'] == 'repo'
]
repo_in_degrees.sort(key=itemgetter(1), reverse=True)
print(repo_in_degrees[:10])


Popular repositories
[('NBA-Machine-Learning-Sports-Betting(repo)', 1088), ('ChatGPT_Sports_Betting_Bot(repo)', 83), ('sports-betting(repo)', 71), ('public-apis(repo)', 60), ('AutoGPT(repo)', 56), ('NBA_Betting(repo)', 46), ('nba_api(repo)', 45), ('build-your-own-x(repo)', 44), ('ProphitBet-Soccer-Bets-Predictor(repo)', 41), ('Sports-betting(repo)', 40)]

Supernode candidates
[]
Graph has 19095 nodes
Graph has 26777 edges


In [13]:
# Add one node per programming language and connect it to the graph:
#   user --programs--> language   (for every user gazing at a repo in that language)
#   language --implements--> repo
# Repos without a language value end up under the node named "(lang)".
#
# The loop variables are deliberately NOT called `repo` / `stargazers`: those
# names hold the PyGithub repository object and the stargazer list used by
# earlier cells, and overwriting them breaks re-running those cells.
repos = [n for n in g.nodes() if g.nodes[n]['type'] == 'repo']
for repo_node in repos:
    lang_node = (g.nodes[repo_node]['lang'] or "") + "(lang)"
    # Materialise the gazers first; the graph is modified inside the loop below.
    gazer_nodes = [u for (u, _, d) in g.in_edges(repo_node, data=True)
                   if d['type'] == 'gazes']
    if not gazer_nodes:
        continue
    # Loop-invariant for this repo, so done once instead of once per gazer.
    g.add_node(lang_node, type='lang')
    g.add_edge(lang_node, repo_node, type='implements')
    for gazer_node in gazer_nodes:
        g.add_edge(gazer_node, lang_node, type='programs')

In [19]:

def _users_linked_to(lang_node):
    """Return the user nodes that have an edge pointing at `lang_node`."""
    return [src for src, _ in g.in_edges(lang_node) if g.nodes[src]['type'] == 'user']


# All language nodes, then the ten with the highest in-degree.
lang_nodes = [node for node in g.nodes() if g.nodes[node]['type'] == 'lang']
print(lang_nodes)
print("Most popular languages")
lang_popularity = [(node, g.in_degree(node)) for node in lang_nodes]
lang_popularity.sort(key=itemgetter(1), reverse=True)
print(lang_popularity[:10])

# Users linked to a handful of specific languages.
python_programmers = _users_linked_to('Python(lang)')
print("Number of Python programmers:", len(python_programmers))
print()
javascript_programmers = _users_linked_to('JavaScript(lang)')
print("Number of JavaScript programmers:", len(javascript_programmers))
print()
typescript_programmers = _users_linked_to('TypeScript(lang)')
print("Number of TypeScript programmers:", len(typescript_programmers))
print()
html_programmers = _users_linked_to('HTML(lang)')
print("Number of HTML programmers:", len(html_programmers))

# Sizes of the pairwise overlaps between the language groups.
print(len(set(python_programmers) & set(javascript_programmers)))
print(len(set(javascript_programmers) & set(html_programmers)))
print(len(set(typescript_programmers) & set(html_programmers)))

['Python(lang)', 'Java(lang)', 'Jupyter Notebook(lang)', 'Rust(lang)', 'C(lang)', 'TypeScript(lang)', 'C#(lang)', '(lang)', 'C++(lang)', 'R(lang)', 'JavaScript(lang)', 'SCSS(lang)', 'HTML(lang)', 'Ruby(lang)', 'MATLAB(lang)', 'SAS(lang)', 'Clojure(lang)', 'NCL(lang)', 'PHP(lang)', 'Dart(lang)', 'Lua(lang)', 'Go(lang)', 'MDX(lang)', 'Vue(lang)', 'Elixir(lang)', 'Shell(lang)', 'CSS(lang)', 'Solidity(lang)', 'VBScript(lang)', 'Objective-C(lang)', 'Swift(lang)', 'Cuda(lang)', 'Vim Script(lang)', 'TeX(lang)', 'Smarty(lang)', 'Matlab(lang)', 'Pug(lang)', 'Markdown(lang)', 'Dockerfile(lang)', 'Jinja(lang)', 'CoffeeScript(lang)', 'Brightscript(lang)', 'BrighterScript(lang)', 'Batchfile(lang)', 'Stata(lang)', 'Astro(lang)', 'Scala(lang)', 'Zig(lang)', 'Julia(lang)', 'HCL(lang)', 'Perl(lang)', 'Makefile(lang)', 'YAML(lang)', 'Sieve(lang)', 'Fortran(lang)', 'PostScript(lang)', 'PowerShell(lang)', 'Crystal(lang)', 'Svelte(lang)', 'Mojo(lang)', 'OpenEdge ABL(lang)', 'Kotlin(lang)', 'Smali(lang)', '

In [16]:

# Undirected, attribute-free-edge view of the interest graph: same nodes (with
# their attributes), and a single edge wherever the directed graph has an edge
# in either direction. nx.Graph stores at most one edge per node pair, so
# adding every directed edge collapses (u, v) and (v, u) into one.
UG = nx.Graph()
UG.add_nodes_from(g.nodes(data=True))
UG.add_edges_from(g.edges())

# Greedy modularity maximisation; each community is a collection of node names.
communities = list(nx.algorithms.community.greedy_modularity_communities(UG))
print("\nCommunity Detection:")
for number, members in enumerate(communities, start=1):
    print(f"Community {number}: {', '.join(members)}")

def analyze_degree_distribution(G, communities):
    """Print the ten highest-degree nodes of each community.

    For every community, the degree is measured inside the subgraph of `G`
    induced by that community's nodes.

    Args:
        G: graph object providing `subgraph(nodes)`.
        communities: iterable of node collections, one per community.
    """
    for number, members in enumerate(communities, start=1):
        ranked = list(G.subgraph(members).degree())
        ranked.sort(key=itemgetter(1), reverse=True)
        print(f"Top 10 nodes by degree distribution in Community {number}: {ranked[:10]}")
# Report the highest-degree members of every detected community.
analyze_degree_distribution(UG, communities)
def analyze_centrality(G, communities):
    """Print the five most central nodes of each community.

    Degree, betweenness and closeness centrality are computed on the subgraph
    of `G` induced by each community, and the top five nodes per measure are
    printed.

    Args:
        G: graph object providing `subgraph(nodes)`.
        communities: iterable of node collections, one per community.
    """
    def ranked(scores):
        # (node, score) pairs, highest score first.
        return sorted(scores.items(), key=itemgetter(1), reverse=True)

    for number, members in enumerate(communities, start=1):
        induced = G.subgraph(members)
        # Compute all three rankings before printing anything.
        by_degree = ranked(nx.degree_centrality(induced))
        by_betweenness = ranked(nx.betweenness_centrality(induced))
        by_closeness = ranked(nx.closeness_centrality(induced))
        print(f"Key nodes in Community {number} (sorted by degree centrality): {by_degree[:5]}")
        print(f"Key nodes in Community {number} (sorted by betweenness centrality): {by_betweenness[:5]}")
        print(f"Key nodes in Community {number} (sorted by closeness centrality): {by_closeness[:5]}")

# Report the most central members of every detected community.
analyze_centrality(UG, communities)


Community Detection:
Community 1: FastSAM(repo), lechiman(user), PyGazeAnalyser(repo), ollama-telegram(repo), AutoPrompt(repo), matplotlib(repo), MLSTM-FCN(repo), pytorch_geometric(repo), contrastors(repo), d2l-en(repo), PPPwn(repo), Stock-News-Scrapping-With-Python(repo), TTNet-Real-time-Analysis-System-for-Table-Tennis-Pytorch(repo), Doxa(repo), age-of-empires-II-api(repo), Netclone_Netflix-UI-Clone(repo), imcp23(repo), PressScraper(repo), zdd_adventure2023(repo), article_openai_bot(repo), Transfer-Learning-Library(repo), AudioLDM2(repo), AutoMates(repo), Py-Boost(repo), bark-voice-cloning-HuBERT-quantizer(repo), DFSLineupOptimizer(repo), langchain-chatbot(repo), Auto_Track_generation_and_upload(repo), ERKER2Phenopackets(repo), dolly(repo), OpenCV-Object-Face-Tracking(repo), mTAN(repo), CS231n-2017-Summary(repo), mmpose(repo), ijepa(repo), griptape(repo), Modmail(repo), optuna(repo), Webull-Trading-Bot(repo), azure-sql-database-samples(repo), Tiktok-uploader(repo), msrx(repo), autom

In [22]:
# Work on a copy of the interest graph; `G` is the graph used by the
# similarity and embedding cells below.
G = g.copy()
def get_user_neighbors(G, user):
    """Return the nodes reported by `G.neighbors(user)` as a set."""
    adjacent = set()
    adjacent.update(G.neighbors(user))
    return adjacent
# Jaccard similarity of two sets
def jaccard_similarity(set1, set2):
    """Return |set1 ∩ set2| / |set1 ∪ set2|, or 0.0 when both sets are empty."""
    combined = set1 | set2
    if not combined:
        # Avoid dividing by zero for two empty sets.
        return 0.0
    shared = set1 & set2
    return len(shared) / len(combined)
def top_jaccard_similarities(G, target_user, top_n=3):
    """Rank the other user nodes by Jaccard similarity to `target_user`.

    Two users are compared through their neighbor sets as returned by
    `get_user_neighbors`.

    Args:
        G: graph whose nodes carry a 'type' attribute.
        target_user: node name of the user to compare against.
        top_n: number of (node, similarity) pairs to return.

    Returns:
        Up to `top_n` (node, similarity) tuples, most similar first.
    """
    reference = get_user_neighbors(G, target_user)
    scored = [
        (candidate, jaccard_similarity(reference, get_user_neighbors(G, candidate)))
        for candidate in G.nodes
        if G.nodes[candidate].get('type') == 'user' and candidate != target_user
    ]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]
# User whose most similar peers (by shared neighbors) are reported.
target_user = 'cadekeenan(user)'
top_similar_users = top_jaccard_similarities(G, target_user)

print(f"Top 3 users most similar to {target_user}:")
# The loop variable is deliberately not named `user`: that name holds the
# PyGithub user object created in the configuration cell, and overwriting it
# with a string breaks re-running the graph-building cell (`user.login`).
for similar_user, similarity in top_similar_users:
    print(f"{similar_user}: Jaccard Similarity = {similarity}")

Top 3 users most similar to cadekeenan(user):
playpicasso(user): Jaccard Similarity = 0.12389380530973451
yungdz(user): Jaccard Similarity = 0.11494252873563218
michaelemmanuel16(user): Jaccard Similarity = 0.11392405063291139


In [41]:
# Map every node name to its integer index (graph iteration order).
node_mapping = {node: i for i, node in enumerate(G.nodes())}

# The only node feature is a one-hot encoding of the node type.
TYPE_ONE_HOT = {
    'user': [1, 0, 0],
    'lang': [0, 1, 0],
    'repo': [0, 0, 1],
}

node_features = []
node_types = []
for node, node_attrs in G.nodes(data=True):
    node_type = node_attrs['type']
    if node_type not in TYPE_ONE_HOT:
        # Skipping the node would silently shift every later feature row
        # relative to node_mapping, so fail loudly instead.
        raise ValueError(f"Node {node!r} has unsupported type {node_type!r}")
    node_types.append(node_type)
    node_features.append(TYPE_ONE_HOT[node_type])

node_features = torch.tensor(node_features, dtype=torch.float)

edge_index = []
edge_type = []
for u, v, edge_attrs in G.edges(data=True):
    edge_index.append([node_mapping[u], node_mapping[v]])
    edge_type.append(edge_attrs['type'])

# Shape (2, num_edges); reshape(-1, 2) keeps that shape valid for an empty edge list.
edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()

# Create the PyG data object
data = Data(x=node_features, edge_index=edge_index)

In [42]:
class GraphSAGE(torch.nn.Module):
    """Two SAGEConv layers with a ReLU in between.

    Args:
        in_channels: size of the input node features.
        hidden_channels: size of the intermediate representation.
        out_channels: size of the output per node.
    """

    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return the second layer's output for `data.x` and `data.edge_index`."""
        edges = data.edge_index
        hidden = self.conv1(data.x, edges).relu()
        return self.conv2(hidden, edges)
in_channels = data.num_node_features
hidden_channels = 16
# Output width equals input width: training compares the model output with
# the input features.
out_channels = in_channels

# Seed torch's RNG before the model is created so weight initialisation, and
# therefore the training run, is reproducible across kernel restarts.
SEED = 42
torch.manual_seed(SEED)

# Initialize the model
model = GraphSAGE(in_channels, hidden_channels, out_channels)

# Define optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

# Train the model
def train(data, epochs=100):
    """Run `epochs` full-graph optimisation steps on the module-level `model`.

    Uses the module-level `optimizer` and `criterion`, and prints the loss
    every tenth epoch (starting with epoch 0).

    Args:
        data: object passed to `model`; its `x` attribute is the target.
        epochs: number of optimisation steps.
    """
    model.train()
    for step in range(epochs):
        optimizer.zero_grad()
        reconstruction = model(data)
        # Self-supervised task: the target is the input feature matrix itself.
        loss = criterion(reconstruction, data.x)
        loss.backward()
        optimizer.step()
        if not step % 10:
            print(f'Epoch {step}, Loss: {loss.item()}')

# Fit the model on the graph with the default number of epochs.
train(data)
# Switch to evaluation mode and compute the model output for every node
# without tracking gradients; the output rows are used as node embeddings.
model.eval()
with torch.no_grad():
    embeddings = model(data)
print(embeddings)

Epoch 0, Loss: 0.49496185779571533
Epoch 10, Loss: 0.03259247541427612
Epoch 20, Loss: 0.012529782019555569
Epoch 30, Loss: 0.011746331118047237
Epoch 40, Loss: 0.002692240523174405
Epoch 50, Loss: 0.0013530156575143337
Epoch 60, Loss: 0.0009395055822096765
Epoch 70, Loss: 0.0005343969096429646
Epoch 80, Loss: 0.00040521862683817744
Epoch 90, Loss: 0.00036999466829001904
tensor([[ 4.0501e-02,  1.7501e-02,  9.3501e-01],
        [ 1.0056e+00, -4.8570e-05, -2.1098e-02],
        [ 5.5822e-01,  1.0551e-01,  4.9140e-01],
        ...,
        [ 1.4984e-03,  9.8198e-01,  3.4729e-03],
        [ 1.4984e-03,  9.8198e-01,  3.4729e-03],
        [ 1.4984e-03,  9.8198e-01,  3.4729e-03]])


In [43]:

def calculate_cosine_similarity(embeddings, node1, node2):
    """Return the cosine similarity between two rows of `embeddings`.

    Args:
        embeddings: tensor indexed by node index.
        node1: index of the first node.
        node2: index of the second node.
    """
    first, second = (
        embeddings[index].numpy().reshape(1, -1) for index in (node1, node2)
    )
    return cosine_similarity(first, second)[0][0]

def top_cosine_similarities(embeddings, target_user, top_n=10):
    """Rank repository nodes by cosine similarity to a user's embedding.

    Reads the module-level graph `G` and `node_mapping`; row `i` of
    `embeddings` is taken to belong to the i-th node of `G.nodes`.

    Args:
        embeddings: tensor with one row per node.
        target_user: node name of the user; must be a key of `node_mapping`.
        top_n: number of (repo node, similarity) pairs to return.

    Returns:
        Up to `top_n` (repo node name, similarity) tuples, most similar first;
        an empty list when the graph has no repo nodes.

    Raises:
        KeyError: if `target_user` is not in `node_mapping`.
    """
    user_embedding = embeddings[node_mapping[target_user]].numpy().reshape(1, -1)

    # One pass over the graph collecting index and name together, instead of
    # rebuilding list(G.nodes) for every repository.
    repo_entries = [
        (index, node)
        for index, node in enumerate(G.nodes)
        if G.nodes[node]['type'] == 'repo'
    ]
    if not repo_entries:
        return []

    # A single batched call scores the user against all repositories at once.
    repo_indices = [index for index, _ in repo_entries]
    scores = cosine_similarity(user_embedding, embeddings[repo_indices].numpy())[0]

    similarities = [(node, score) for (_, node), score in zip(repo_entries, scores)]
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[:top_n]

# User for whom the most similar repositories (by embedding) are listed.
recommendation_user = 'michaelemmanuel16(user)'
top_similar_nodes = top_cosine_similarities(embeddings, recommendation_user)
# Name the user actually queried rather than the placeholder "user1".
print(f'Top 10 nodes most similar to node {recommendation_user}:')
for node, similarity in top_similar_nodes:
    print(f'Node {node}: Cosine Similarity = {similarity}')

Cosine similarity between node 0 and node 1: 0.02230467088520527
Top 10 nodes most similar to node user1:
Node NBA-Machine-Learning-Sports-Betting(repo): Cosine Similarity = 0.02230467088520527
Node ChatGPT_Sports_Betting_Bot(repo): Cosine Similarity = 0.02212148904800415
Node sports-betting(repo): Cosine Similarity = 0.021768363192677498
Node ProphitBet-Soccer-Bets-Predictor(repo): Cosine Similarity = 0.02039150521159172
Node Sports-betting(repo): Cosine Similarity = 0.02031581476330757
Node public-apis(repo): Cosine Similarity = 0.02026885189116001
Node ollama(repo): Cosine Similarity = 0.020064987242221832
Node AutoGPT(repo): Cosine Similarity = 0.020042194053530693
Node WagerBrain(repo): Cosine Similarity = 0.019874680787324905
Node NBA_Betting(repo): Cosine Similarity = 0.01930730976164341
