In [27]:
import networkx as nx
from more_itertools import flatten
from entity.taxonomy import Taxonomy

# NOTE(review): `all_terms` is not read by any cell visible here; kept in case
# a later cell relies on it.
all_terms = []

# Load the two interim taxonomies that are merged below.
cso_taxonomy = Taxonomy.load('../data/interim/new_gitranking_cso.json')
wiki_taxonomy = Taxonomy.load('../data/interim/new_gitranking_wikidata.json')

combined_taxonomy = Taxonomy('combined-filtered')
combined_taxonomy.pairs = cso_taxonomy.pairs + wiki_taxonomy.pairs

# Each pair is indexed as [source, target, origin] by the cells below, so only
# the first two positions are terms. Flattening the whole pair would add the
# origin labels ('cso', 'wikidata') to the term list and could mask a genuine
# term with the same name when comparing against other term sets.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# reproducible across kernel restarts (a plain set is not).
combined_taxonomy.terms = list(dict.fromkeys(
    term for pair in combined_taxonomy.pairs for term in pair[:2]
))

In [28]:
# Number of entries in the combined term list built from the merged pairs.
len(combined_taxonomy.terms)

In [29]:
import pandas as pd

# Read the LLM responses and keep only the rows whose response is exactly '1'.
# `response` is forced to string: without it pandas infers the dtype, and an
# all-numeric column would be parsed as integers, making the comparison with
# the string '1' silently match nothing.
llm_ad = pd.read_csv('../data/interim/llm_responses_is_ad.csv', dtype={'response': str})
llm_ad = llm_ad[llm_ad['response'] == '1']


In [30]:
# Display the rows that were kept (response == '1').
llm_ad

In [31]:
# Split the combined pairs by whether both endpoints appear in llm_ad['term']:
#   remaining_pairs - both the source and the target term are listed
#   filtered_pairs  - at least one of the two terms is missing (pair dropped)
# The term column is turned into a set once, so each membership test is a hash
# lookup instead of a scan of the whole column for every term.
accepted_terms = set(llm_ad['term'])

filtered_pairs = []
remaining_pairs = []
for pair in combined_taxonomy.pairs:
    if pair[0] in accepted_terms and pair[1] in accepted_terms:
        remaining_pairs.append(pair)
    else:
        filtered_pairs.append(pair)

print(len(filtered_pairs), len(combined_taxonomy.pairs))

In [32]:
# Display the pairs that were dropped because a term was not in llm_ad['term'].
filtered_pairs


In [33]:
# Count the dropped pairs per origin label (third element of each pair).
wiki_filtered = sum(entry[2] == 'wikidata' for entry in filtered_pairs)
print(wiki_filtered)
cso_filtered = sum(entry[2] == 'cso' for entry in filtered_pairs)
print(cso_filtered)

In [34]:
# Display the pairs whose two terms are both present in llm_ad['term'].
remaining_pairs

In [35]:
# Count the kept pairs per origin label (third element of each pair).
wiki_remaining = sum(entry[2] == 'wikidata' for entry in remaining_pairs)
print(wiki_remaining)
cso_remaining = sum(entry[2] == 'cso' for entry in remaining_pairs)
print(cso_remaining)

In [44]:
# Directed graph over the kept pairs: one edge pair[0] -> pair[1] per pair,
# with the pair's origin label stored as the 'origin' edge attribute.
g = nx.DiGraph()
edges = [
    (pair[0], pair[1], dict(origin=pair[2]))
    for pair in remaining_pairs
]
g.add_edges_from(edges)

In [45]:
# Check whether the graph built from the kept pairs contains no directed cycle.
nx.is_directed_acyclic_graph(g)

In [47]:
# Nodes without any outgoing edge, listed in graph iteration order.
no_parent = [candidate for candidate in g if g.out_degree(candidate) == 0]

In [48]:
# How many nodes have no outgoing edge.
len(no_parent)

In [49]:
# Print every node without an outgoing edge, one per line.
for node in no_parent:
    print(node)

In [50]:
from entity.gitranking import GitRanking

# Build the GitRanking object from the raw JSONL file; its `terms` attribute
# is compared against the combined taxonomy in the following cells.
gitranking = GitRanking('../data/raw/gitranking.jsonl')

In [51]:
# GitRanking terms that do not occur among the combined taxonomy's terms.
missing_gitranking = set(gitranking.terms) - set(combined_taxonomy.terms)

In [52]:
# Number of GitRanking terms absent from the combined taxonomy.
len(missing_gitranking)

In [53]:
# Add the GitRanking-only terms to the combined term list.
# Only terms that are still absent are appended, so re-running this cell does
# not introduce duplicates; sorting makes the appended order reproducible
# (iterating a set directly yields an arbitrary order).
combined_taxonomy.terms.extend(
    sorted(missing_gitranking.difference(combined_taxonomy.terms), key=str)
)

In [54]:
# Register every GitRanking-only term as a pair with an empty target and the
# 'gitranking' origin label. Pairs already present are skipped so that
# re-running this cell does not duplicate rows in the exported CSV; the sorted
# iteration keeps the appended order reproducible.
existing_pairs = {tuple(pair) for pair in combined_taxonomy.pairs}
for term in sorted(missing_gitranking, key=str):
    if (term, '', 'gitranking') not in existing_pairs:
        combined_taxonomy.pairs.append([term, '', 'gitranking'])

In [68]:
# Colour palette; only the first four entries are assigned to an origin below.
hex_colors = [
    '#FF0000', '#00FF00', '#0000FF',
    '#FFFF00', '#00FFFF', '#FF00FF',
    '#800000', '#008000', '#000080',
]

# Origin label -> colour, in palette order.
origin_to_color = dict(zip(['cso', 'wikidata', 'gitranking', 'root'], hex_colors))

# One (source, target, colour) row per pair of the combined taxonomy.
# An origin label missing from origin_to_color raises KeyError here.
nodes = [
    (source, target, origin_to_color[origin])
    for source, target, origin in combined_taxonomy.pairs
]
df = pd.DataFrame(nodes, columns=['source', 'target', 'color'])
df.to_csv('../data/interim/joined_graph_combined_filtered.csv', index=False)

In [58]:
# Connect every node that had no outgoing edge to a single 'root' node, tagging
# the new edges with the 'root' origin.
for node in no_parent:
    g.add_edge(node, 'root', origin='root')
    

In [61]:
# Shortest number of edges from every node to 'root'.
# The edges added above all point *into* 'root', so 'root' must be given as
# the target: passing it positionally makes it the source, from which no other
# node is reachable in this directed graph.
nx.shortest_path_length(g, target='root')

In [64]:

# Enumerate every simple path that starts at a node without incoming edges and
# ends at a node without outgoing edges.
G = g
sink_nodes = [n for n, degree in G.out_degree(G.nodes()) if degree == 0]
source_nodes = [n for n, degree in G.in_degree(G.nodes()) if degree == 0]

paths = []
for sink in sink_nodes:
    for source in source_nodes:
        # Same visiting order as before: sinks in the outer loop, sources inside.
        paths.extend(nx.all_simple_paths(G, source=source, target=sink))


In [80]:
from collections import defaultdict

# Map each node to the list of enumerated paths that contain it.
paths_passing_throught_node = defaultdict(list)

# Seed a key for every graph node so nodes on no path still map to an empty
# list and the key order follows the graph's node order.
for node in G.nodes():
    paths_passing_throught_node[node]

# Single pass over the paths instead of re-scanning all paths for every node.
# dict.fromkeys guards against registering a path twice for the same node.
for path in paths:
    for node in dict.fromkeys(path):
        paths_passing_throught_node[node].append(path)

In [65]:
# Size of every enumerated path, measured in nodes (edge count would be one less).
distances = list(map(len, paths))

In [66]:
# Display the node count of every enumerated path.
distances

In [67]:
import numpy as np

# Summary statistics of the path sizes, printed one per line in this order:
# max, min, mean, standard deviation, median.
for summarise in (np.max, np.min, np.mean, np.std, np.median):
    print(summarise(distances))

In [69]:
# Display every enumerated source-to-sink path.
paths

In [85]:

# Reload the Wikidata taxonomy and keep its (source, target) edges available.
# NOTE(review): `wiki_edges` is not used by the selection below; an alternative
# criterion ("all edges touching the node come from Wikidata") was sketched
# here earlier and would need it.
wiki_taxonomy = Taxonomy.load('../data/interim/new_gitranking_wikidata.json')
wiki_edges = [(pair[0], pair[1]) for pair in wiki_taxonomy.pairs]

# Built once: the set of GitRanking terms does not change between nodes.
gitranking_terms = set(gitranking.terms)

# A node is marked for removal when none of the paths passing through it
# contains a GitRanking term (this includes nodes that lie on no path at all).
removed_nodes = [
    node for node in g
    if gitranking_terms.isdisjoint(flatten(paths_passing_throught_node[node]))
]

In [87]:
# Display the nodes whose paths contain no GitRanking term.
removed_nodes