In [1]:
import networkx as nx
from more_itertools import flatten
from entity.taxonomy import Taxonomy

combined_taxonomy = Taxonomy('combined-filtered').load('../data/interim/mixed_gitranking_cso_wikidata.json')

In [5]:
len(set(flatten(combined_taxonomy.pairs)))

1027

In [34]:
import pandas as pd

llm_ad = pd.read_csv('../data/interim/llm_responses_is_ad.csv')
llm_ad = llm_ad[llm_ad['response'] == '0']


In [35]:
llm_ad

Unnamed: 0,term,response
0,noticing,0
1,physical property,0
3,reasoning,0
4,effectiveness,0
7,learning material,0
...,...,...
1277,infographic,0
1278,physical process,0
1280,economic entity,0
1281,physical container,0


In [36]:
filtered_pairs = []
remaining_pairs = []
for pair in combined_taxonomy.pairs:
    if any(term in llm_ad['term'].values for term in [pair[0], pair[1]]):
        filtered_pairs.append(pair)
    else:
        remaining_pairs.append(pair)
        
print(len(filtered_pairs), len(combined_taxonomy.pairs))

1166 1715


In [37]:
filtered_pairs


[['curricula', 'education', 'cso'],
 ['peer-to-peer', 'distributed computer systems', 'cso'],
 ['network analysis', 'electric network analysis', 'cso'],
 ['natural languages', 'semantics', 'cso'],
 ['probability', 'mathematics', 'cso'],
 ['smart homes', 'smart environment', 'cso'],
 ['visualization', 'human computer interaction', 'cso'],
 ['natural sciences computing', 'database systems', 'cso'],
 ['mobile computing', 'telecommunication networks', 'cso'],
 ['data visualization', 'visualization', 'cso'],
 ['man machine systems', 'robotics', 'cso'],
 ['speech transmission', 'speech communication', 'cso'],
 ['image compression', 'image coding', 'cso'],
 ['security of data', 'computer security', 'cso'],
 ['frequency division multiple access', 'frequency allocation', 'cso'],
 ['wireless telecommunication systems', 'communication systems', 'cso'],
 ['computer science education', 'curricula', 'cso'],
 ['frequency allocation', 'communication channels', 'cso'],
 ['cloud services', 'mobile devic

In [38]:
wiki_filtered = sum([1 for x in filtered_pairs if 'wikidata' == x[2]])
print(wiki_filtered)
cso_filtered = sum([1 for x in filtered_pairs if 'cso' == x[2]])
print(cso_filtered)


962
204


In [39]:
remaining_pairs

[['ambient intelligence', 'sensors', 'cso'],
 ['text processing', 'computational linguistics', 'cso'],
 ['cloud providers', 'virtual machines', 'cso'],
 ['computational biology', 'bioinformatics', 'cso'],
 ['real time systems', 'embedded systems', 'cso'],
 ['electronic document identification systems', 'cryptography', 'cso'],
 ['human computer interaction', 'computer science', 'cso'],
 ['image super-resolution', 'low resolution images', 'cso'],
 ['recommender systems', 'information retrieval', 'cso'],
 ['computer networks', 'computer science', 'cso'],
 ['digital image storage', 'image compression', 'cso'],
 ['text classification', 'text processing', 'cso'],
 ['synthesized speech', 'hidden markov model %28hmm%29', 'cso'],
 ['blockchain', 'distributed ledger', 'cso'],
 ['interpreter', 'program interpreters', 'cso'],
 ['object detection', 'object recognition', 'cso'],
 ['interactive computer graphics', 'computer imaging and vision', 'cso'],
 ['digital signal processing', 'signal processin

In [40]:
hex_colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#00FFFF', '#FF00FF', '#800000', '#008000', '#000080']

origin_to_color = {'cso': hex_colors[0], 'wikidata': hex_colors[1], 'gitranking': hex_colors[2], 
                   'root': hex_colors[3]}
nodes = [(x, y, origin_to_color[t]) for x, y, t in combined_taxonomy.pairs if [x, y, t] not in filtered_pairs]
df = pd.DataFrame(nodes, columns=['source', 'target', 'color'])
df.to_csv('../data/interim/joined_graph_combined_filtered_new.csv', index=False)

In [41]:
a = []
for i in combined_taxonomy.pairs:
    if i in filtered_pairs:
        a.append(i)
print(len(a))
print(len(combined_taxonomy.pairs))
print(len(nodes))

1166
1715
549


In [58]:
for node in no_parent:
    g.add_edges_from([(node, 'root', {"origin": 'root'})])
    

In [61]:
nx.shortest_path_length(g, 'root')

{'root': 0}

In [64]:

G = g
paths = []
sink_nodes = [node for node, outdegree in G.out_degree(G.nodes()) if outdegree == 0]
source_nodes = [node for node, indegree in G.in_degree(G.nodes()) if indegree == 0]
for source, sink in [(source, sink) for sink in sink_nodes for source in source_nodes]:
    for path in nx.all_simple_paths(G, source=source, target=sink):
        paths.append(path)


In [80]:
from collections import defaultdict

paths_passing_throught_node = defaultdict(list)
for node in G.nodes():
    paths_passing_throught_node[node].extend([path for path in paths if node in path])

In [65]:
distances = [len(x) for x in paths]

In [66]:
distances

[4,
 4,
 5,
 4,
 7,
 6,
 5,
 7,
 6,
 6,
 5,
 6,
 6,
 5,
 8,
 6,
 5,
 3,
 7,
 6,
 3,
 6,
 5,
 4,
 5,
 4,
 4,
 5,
 4,
 7,
 5,
 6,
 5,
 5,
 4,
 7,
 5,
 6,
 6,
 5,
 3,
 4,
 6,
 5,
 3,
 7,
 5,
 3,
 5,
 6,
 7,
 12,
 7,
 4,
 4,
 4,
 4,
 6,
 5,
 4,
 7,
 6,
 4,
 3,
 5,
 4,
 7,
 5,
 4,
 6,
 5,
 5,
 4,
 7,
 5,
 3,
 6,
 6,
 5,
 6,
 7,
 6,
 7,
 6,
 4,
 3,
 4,
 5,
 4,
 3,
 3,
 6,
 5,
 3,
 3,
 7,
 6,
 5,
 6,
 5,
 6,
 6,
 11,
 5,
 6,
 6,
 6,
 5,
 8,
 6,
 6,
 6,
 5,
 4,
 5,
 4,
 6,
 5,
 4,
 7,
 6,
 4,
 3,
 6,
 5,
 6,
 5,
 6,
 5,
 6,
 5,
 8,
 6,
 3,
 7,
 5,
 5,
 5,
 5,
 6,
 6,
 5,
 3,
 3,
 5,
 4,
 7,
 5,
 4,
 5,
 4,
 7,
 8,
 8,
 6,
 5,
 3,
 5,
 4,
 3,
 5,
 5,
 7,
 6,
 3,
 5,
 7,
 4,
 3,
 7,
 7,
 6,
 6,
 5,
 3,
 6,
 7,
 5,
 7,
 8,
 3,
 3,
 8,
 9,
 8,
 8,
 7,
 4,
 7,
 8,
 5,
 3,
 4,
 3,
 4,
 5,
 4,
 5,
 6,
 5,
 6,
 7,
 5,
 3,
 4,
 4,
 5,
 4,
 7,
 6,
 7,
 7,
 6,
 6,
 3,
 6,
 8,
 9,
 6,
 7,
 6,
 5,
 3,
 3,
 6,
 4,
 5,
 4,
 5,
 5,
 4,
 7,
 5,
 4,
 6,
 4,
 3,
 5,
 5,
 6,
 7,
 4,
 4,
 7,
 8,
 6,
 7,
 5,
 10,
 

In [67]:
import numpy as np

print(np.max(distances))
print(np.min(distances))
print(np.mean(distances))
print(np.std(distances))
print(np.median(distances))

13
3
5.526932084309133
1.8045222148698772
5.0


In [69]:
paths

[['network security', 'computer networks', 'computer science', 'root'],
 ['network security', 'computer security', 'computer science', 'root'],
 ['network security',
  'computer security',
  'information security',
  'security',
  'root'],
 ['network security', 'computer security', 'security', 'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'software design',
  'software engineering',
  'computer science',
  'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'software engineering',
  'computer science',
  'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'systems architecture',
  'root'],
 ['optical character recognition',
  'character recognition',
  'pattern recognition',
  'machine learning',
  'artificial intelligence',
  'computer science',
  'root'],
 ['optical character recognition',
  'character recognition',
  'pattern recognition',
  'machine learning',
  'computer science',
  'root'],
 ['optical character recogn

In [85]:

wiki_taxonomy = Taxonomy.load(f'../data/interim/new_gitranking_wikidata.json')
wiki_edges = [(pair[0], pair[1]) for pair in wiki_taxonomy.pairs]
removed_nodes = []
for node in g:
    inwards = g.in_edges(node)
    outwards = g.out_edges(node)
    both_edges = list(inwards) + list(outwards)
    #if all(edge in wiki_edges for edge in both_edges):
    t = list(flatten(paths_passing_throught_node[node]))
    print(t)
    if not set(t).intersection(set(gitranking.terms)):
        removed_nodes.append(node)

['evolutionary algorithms', 'genetic algorithms', 'artificial intelligence', 'computer science', 'root']
['optical character recognition', 'character recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'optical character recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'relation extraction', 'information extraction', 'natural language processing', 'artificial intelligence', 'computer science', 'root', 'emotion recognition', 'face recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'data clustering', 'cluster analysis', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'question answering', 'natural language processing', 'artificial intelligence', 'computer science', 'root', 'natural language generation', 'natural language processing', 'artificial intelligence', 'computer science', 

In [87]:
removed_nodes

['videotex',
 'video streaming',
 'streaming video',
 'image search',
 'search engines',
 'bibliographic retrieval systems',
 'digital libraries',
 'web crawlers',
 'cloud services',
 'risk management tool',
 'management tool',
 'knowledge organization system',
 'reference work',
 'linear model',
 'regression model',
 'payment system',
 'financial system',
 'malware analysis',
 'reverse engineering',
 'interface standard',
 'communication system',
 'data synchronization',
 'data management',
 'database administration',
 'system administration',
 'systems management',
 'naive Bayes classifier',
 'Bayes classifier']