In [27]:
import networkx as nx
from more_itertools import flatten
from entity.taxonomy import Taxonomy

all_terms = []
cso_taxonomy = Taxonomy.load(f'../data/interim/new_gitranking_cso.json')
wiki_taxonomy = Taxonomy.load(f'../data/interim/new_gitranking_wikidata.json')

combined_taxonomy = Taxonomy('combined-filtered')
combined_taxonomy.pairs = cso_taxonomy.pairs + wiki_taxonomy.pairs
combined_taxonomy.terms = list(set(flatten(combined_taxonomy.pairs)))

In [28]:
len(combined_taxonomy.terms)

1288

In [29]:
import pandas as pd

llm_ad = pd.read_csv('../data/interim/llm_responses_is_ad.csv')
llm_ad = llm_ad[llm_ad['response'] == '1']


In [30]:
llm_ad

Unnamed: 0,term,response
2,image editing,1
5,supervised learning,1
8,algorithmic trading,1
10,neural machine translation,1
11,malware detection,1
...,...,...
1274,machine learning method,1
1279,genomics,1
1282,functional programming,1
1284,password manager,1


In [31]:
filtered_pairs = []
remaining_pairs = []
for pair in combined_taxonomy.pairs:
    if any(term not in llm_ad['term'].values for term in [pair[0], pair[1]]):
        filtered_pairs.append(pair)
    else:
        remaining_pairs.append(pair)
        
print(len(filtered_pairs), len(combined_taxonomy.pairs))

1645 2128


In [32]:
filtered_pairs


[['forecasting models', 'machine learning', 'cso'],
 ['programming course', 'computer science education', 'cso'],
 ['digital storage', 'computer hardware', 'cso'],
 ['search engines', 'world wide web', 'cso'],
 ['smart environment', 'ambient intelligence', 'cso'],
 ['target language', 'speech transmission', 'cso'],
 ['video streaming', 'image coding', 'cso'],
 ['on-line social networks', 'social networks', 'cso'],
 ['internet', 'computer science', 'cso'],
 ['geospatial web services', 'geo-spatial data', 'cso'],
 ['imaging systems', 'image reconstruction', 'cso'],
 ['graphical user interfaces %28gui%29', 'user interfaces', 'cso'],
 ['graph algorithm', 'graph theory', 'cso'],
 ['phishing', 'world wide web', 'cso'],
 ['social networks', 'world wide web', 'cso'],
 ['web applications', 'world wide web', 'cso'],
 ['peer-to-peer', 'computer networks', 'cso'],
 ['statistical analysis', 'mathematics', 'cso'],
 ['graph theory', 'theoretical computer science', 'cso'],
 ['man machine systems', 'co

In [33]:
wiki_filtered = sum([1 for x in filtered_pairs if 'wikidata' == x[2]])
print(wiki_filtered)
cso_filtered = sum([1 for x in filtered_pairs if 'cso' == x[2]])
print(cso_filtered)

1530
115


In [34]:
remaining_pairs

[['genetic algorithms', 'artificial intelligence', 'cso'],
 ['network security', 'computer networks', 'cso'],
 ['iot', 'architecture types', 'cso'],
 ['medical images', 'image analysis', 'cso'],
 ['optical character recognition', 'character recognition', 'cso'],
 ['interactive computer graphics', 'user interfaces', 'cso'],
 ['parallel processing systems', 'distributed systems', 'cso'],
 ['digital image processing', 'digital image storage', 'cso'],
 ['relation extraction', 'information extraction', 'cso'],
 ['theoretical computer science', 'computer science', 'cso'],
 ['emotion recognition', 'face recognition', 'cso'],
 ['natural language processing', 'artificial intelligence', 'cso'],
 ['digital image storage', 'image compression', 'cso'],
 ['computer programming languages', 'computer programming', 'cso'],
 ['augmented reality', 'virtual reality', 'cso'],
 ['user interfaces', 'human computer interaction', 'cso'],
 ['data clustering', 'cluster analysis', 'cso'],
 ['operating systems', '

In [35]:
wiki_remaining = sum([1 for x in remaining_pairs if 'wikidata' == x[2]])
print(wiki_remaining)
cso_remaining = sum([1 for x in remaining_pairs if 'cso' == x[2]])
print(cso_remaining)

344
139


In [44]:
edges = [(pair[0], pair[1], {'origin': pair[2]}) for pair in remaining_pairs]
g = nx.DiGraph()
g.add_edges_from(edges)

In [45]:
nx.is_directed_acyclic_graph(g)

True

In [47]:
no_parent = [node for node, out_degree in g.out_degree() if out_degree == 0]

In [48]:
len(no_parent)

68

In [49]:
for node in no_parent:
    print(node)

computer science
mobile applications
distributed database systems
video streaming
imaging systems
search engines
cryptography
intrusion detection
smart homes
digital libraries
signal processing
classification
machine learning method
management tool
system
reference work
user interface
software component
regression model
recognition
computational biology
management system
program analysis
financial system
computer system
electronic trading
automation
test
security
reverse engineering
generative model
information technology management
simulation
storage
computing platform
data terminal equipment
design pattern
systems architecture
pattern matching
graphics
digital media
e-service
automatic content recognition
programming paradigm
computing infrastructure
information processing
communication system
computer network protocol
database
game
data stream
data pre-processing
translation
software development methodology
classification algorithm
computer vision
data management
image annotation
ex

In [50]:
from entity.gitranking import GitRanking

gitranking = GitRanking('../data/raw/gitranking.jsonl')

In [51]:
missing_gitranking = set(gitranking.terms).difference(set(combined_taxonomy.terms))

In [52]:
len(missing_gitranking)

34

In [53]:
combined_taxonomy.terms.extend(missing_gitranking)

In [54]:
for term in missing_gitranking:
    combined_taxonomy.pairs.append([term, '', 'gitranking'])

In [68]:
hex_colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#00FFFF', '#FF00FF', '#800000', '#008000', '#000080']

origin_to_color = {'cso': hex_colors[0], 'wikidata': hex_colors[1], 'gitranking': hex_colors[2], 
                   'root': hex_colors[3]}
nodes = [(x, y, origin_to_color[t]) for x, y, t in combined_taxonomy.pairs]
df = pd.DataFrame(nodes, columns=['source', 'target', 'color'])
df.to_csv('../data/interim/joined_graph_combined_filtered.csv', index=False)

In [58]:
for node in no_parent:
    g.add_edges_from([(node, 'root', {"origin": 'root'})])
    

In [61]:
nx.shortest_path_length(g, 'root')

{'root': 0}

In [64]:

G = g
paths = []
sink_nodes = [node for node, outdegree in G.out_degree(G.nodes()) if outdegree == 0]
source_nodes = [node for node, indegree in G.in_degree(G.nodes()) if indegree == 0]
for source, sink in [(source, sink) for sink in sink_nodes for source in source_nodes]:
    for path in nx.all_simple_paths(G, source=source, target=sink):
        paths.append(path)


In [80]:
from collections import defaultdict

paths_passing_throught_node = defaultdict(list)
for node in G.nodes():
    paths_passing_throught_node[node].extend([path for path in paths if node in path])

In [65]:
distances = [len(x) for x in paths]

In [66]:
distances

[4,
 4,
 5,
 4,
 7,
 6,
 5,
 7,
 6,
 6,
 5,
 6,
 6,
 5,
 8,
 6,
 5,
 3,
 7,
 6,
 3,
 6,
 5,
 4,
 5,
 4,
 4,
 5,
 4,
 7,
 5,
 6,
 5,
 5,
 4,
 7,
 5,
 6,
 6,
 5,
 3,
 4,
 6,
 5,
 3,
 7,
 5,
 3,
 5,
 6,
 7,
 12,
 7,
 4,
 4,
 4,
 4,
 6,
 5,
 4,
 7,
 6,
 4,
 3,
 5,
 4,
 7,
 5,
 4,
 6,
 5,
 5,
 4,
 7,
 5,
 3,
 6,
 6,
 5,
 6,
 7,
 6,
 7,
 6,
 4,
 3,
 4,
 5,
 4,
 3,
 3,
 6,
 5,
 3,
 3,
 7,
 6,
 5,
 6,
 5,
 6,
 6,
 11,
 5,
 6,
 6,
 6,
 5,
 8,
 6,
 6,
 6,
 5,
 4,
 5,
 4,
 6,
 5,
 4,
 7,
 6,
 4,
 3,
 6,
 5,
 6,
 5,
 6,
 5,
 6,
 5,
 8,
 6,
 3,
 7,
 5,
 5,
 5,
 5,
 6,
 6,
 5,
 3,
 3,
 5,
 4,
 7,
 5,
 4,
 5,
 4,
 7,
 8,
 8,
 6,
 5,
 3,
 5,
 4,
 3,
 5,
 5,
 7,
 6,
 3,
 5,
 7,
 4,
 3,
 7,
 7,
 6,
 6,
 5,
 3,
 6,
 7,
 5,
 7,
 8,
 3,
 3,
 8,
 9,
 8,
 8,
 7,
 4,
 7,
 8,
 5,
 3,
 4,
 3,
 4,
 5,
 4,
 5,
 6,
 5,
 6,
 7,
 5,
 3,
 4,
 4,
 5,
 4,
 7,
 6,
 7,
 7,
 6,
 6,
 3,
 6,
 8,
 9,
 6,
 7,
 6,
 5,
 3,
 3,
 6,
 4,
 5,
 4,
 5,
 5,
 4,
 7,
 5,
 4,
 6,
 4,
 3,
 5,
 5,
 6,
 7,
 4,
 4,
 7,
 8,
 6,
 7,
 5,
 10,
 

In [67]:
import numpy as np

print(np.max(distances))
print(np.min(distances))
print(np.mean(distances))
print(np.std(distances))
print(np.median(distances))

13
3
5.526932084309133
1.8045222148698772
5.0


In [69]:
paths

[['network security', 'computer networks', 'computer science', 'root'],
 ['network security', 'computer security', 'computer science', 'root'],
 ['network security',
  'computer security',
  'information security',
  'security',
  'root'],
 ['network security', 'computer security', 'security', 'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'software design',
  'software engineering',
  'computer science',
  'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'software engineering',
  'computer science',
  'root'],
 ['iot',
  'architecture types',
  'software architecture',
  'systems architecture',
  'root'],
 ['optical character recognition',
  'character recognition',
  'pattern recognition',
  'machine learning',
  'artificial intelligence',
  'computer science',
  'root'],
 ['optical character recognition',
  'character recognition',
  'pattern recognition',
  'machine learning',
  'computer science',
  'root'],
 ['optical character recogn

In [85]:

wiki_taxonomy = Taxonomy.load(f'../data/interim/new_gitranking_wikidata.json')
wiki_edges = [(pair[0], pair[1]) for pair in wiki_taxonomy.pairs]
removed_nodes = []
for node in g:
    inwards = g.in_edges(node)
    outwards = g.out_edges(node)
    both_edges = list(inwards) + list(outwards)
    #if all(edge in wiki_edges for edge in both_edges):
    t = list(flatten(paths_passing_throught_node[node]))
    print(t)
    if not set(t).intersection(set(gitranking.terms)):
        removed_nodes.append(node)

['evolutionary algorithms', 'genetic algorithms', 'artificial intelligence', 'computer science', 'root']
['optical character recognition', 'character recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'optical character recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'relation extraction', 'information extraction', 'natural language processing', 'artificial intelligence', 'computer science', 'root', 'emotion recognition', 'face recognition', 'pattern recognition', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'data clustering', 'cluster analysis', 'machine learning', 'artificial intelligence', 'computer science', 'root', 'question answering', 'natural language processing', 'artificial intelligence', 'computer science', 'root', 'natural language generation', 'natural language processing', 'artificial intelligence', 'computer science', 

In [87]:
removed_nodes

['videotex',
 'video streaming',
 'streaming video',
 'image search',
 'search engines',
 'bibliographic retrieval systems',
 'digital libraries',
 'web crawlers',
 'cloud services',
 'risk management tool',
 'management tool',
 'knowledge organization system',
 'reference work',
 'linear model',
 'regression model',
 'payment system',
 'financial system',
 'malware analysis',
 'reverse engineering',
 'interface standard',
 'communication system',
 'data synchronization',
 'data management',
 'database administration',
 'system administration',
 'systems management',
 'naive Bayes classifier',
 'Bayes classifier']