In [61]:
import glob
from itertools import combinations

import multiset
import numpy as np
import pandas as pd
from more_itertools import flatten

In [62]:
keywords_path = '../data/raw/keywords/{level}/yake/'

In [63]:
def get_keywords(level):
    folder = keywords_path.format(level=level)
    print(folder)
    files = glob.glob(f'{folder}/**.csv')
    keywords = {}
    for file in files:
        label = file.split('/')[-1].split('.')[0]
        df = pd.read_csv(file)
        terms = df['keyword'].tolist()
        weights = df['total_freq'].tolist()
        keywords[label] = multiset.Multiset(flatten([[term] * weight for term, weight in zip(terms, weights)]))
    return keywords

In [64]:
keywords = get_keywords('name')

../data/raw/keywords/name/yake/


In [65]:
keywords

{'convolutional neural network': Multiset({'base': 42, 'camera': 28, 'view': 22, 'connection': 120, 'sql': 82, 'exception': 82, 'exceptions': 52, 'statement': 54, 'optional': 46, 'mysql': 48, 'data': 44, 'xmlrpc': 38, 'result': 32, 'set': 30, 'test': 36, 'wrapper': 26, 'fabric': 26, 'util': 26, 'interceptor': 24, 'prepared': 20, 'factory': 22, 'server': 22, 'load': 20, 'replication': 20, 'log': 20, 'group': 18, 'balance': 18, 'jdbc': 16, 'meta': 14, 'proxy': 14, 'profiler': 14, 'row': 14, 'regression': 16, 'non': 12, 'manager': 12, 'callable': 12, 'proto': 12, 'authentication': 12, 'multi': 12, 'shard': 12, 'driver': 12, 'error': 12, 'method': 10, 'info': 10, 'balanced': 10, 'mapping': 10, 'strategy': 10, 'source': 10, 'socket': 10, 'properties': 10, 'plugin': 10, 'client': 10, 'mat': 60, 'numtrans': 31, 'tonemap': 20, 'point': 16, 'merge': 16, 'callback': 12, 'subtractor': 12, 'loader': 12, 'detector': 18, 'background': 12, 'rect': 12, 'calibrate': 12, 'renderer': 12, 'stereo': 12, 'c

In [66]:
pairs = list(combinations(keywords.keys(), 2))

In [67]:
pairs

[('convolutional neural network', 'recurrent neural network'),
 ('convolutional neural network', '3D modeling'),
 ('convolutional neural network', 'streaming media'),
 ('convolutional neural network', 'software engineering'),
 ('convolutional neural network', 'home automation'),
 ('convolutional neural network', 'animation'),
 ('convolutional neural network', 'artificial intelligence'),
 ('convolutional neural network', 'natural language understanding'),
 ('convolutional neural network', 'video game development'),
 ('convolutional neural network', 'data analysis'),
 ('convolutional neural network', 'image recognition'),
 ('convolutional neural network', 'data binding'),
 ('convolutional neural network', 'internet of things'),
 ('convolutional neural network', 'validator'),
 ('convolutional neural network', 'text mining'),
 ('convolutional neural network', 'benchmark'),
 ('convolutional neural network', 'test automation'),
 ('convolutional neural network', 'virtual reality'),
 ('convolu

In [68]:
def get_overlap(pair):
    return len(keywords[pair[0]] & keywords[pair[1]]) / (len(keywords[pair[0]].union(keywords[pair[1]])) + 1)

In [69]:
pair_index = {term: i for i, term in enumerate(keywords.keys())}

In [70]:
overlap_matrix = np.zeros((len(keywords), len(keywords)))
overlap_triples = []
for pair in pairs:
    overlap = get_overlap(pair)
    overlap_triples.append((pair[0], pair[1], overlap))
    overlap_matrix[pair_index[pair[0]], pair_index[pair[1]]] = overlap

In [71]:
## plot using plotly
import plotly.graph_objects as go
fig = go.Figure(data=go.Heatmap(
                   z=overlap_matrix,
                   x=list(keywords.keys()),
                   y=list(keywords.keys()),
                   hoverongaps = False))
fig.show()

In [74]:
top_triples = sorted(overlap_triples, key=lambda x: x[2], reverse=True)[:30]

In [75]:
top_triples

[('neural machine translation', 'machine translation', 0.9999797656866515),
 ('automated machine learning',
  'hyperparameter optimization',
  0.997229916897507),
 ('website', 'World Wide Web', 0.991389050028762),
 ('audio signal processing', 'signal processing', 0.9877131795347647),
 ('image captioning', 'Bidirectional recurrent neural networks', 0.96),
 ('3D computer graphics', 'computer graphics', 0.927128575606269),
 ('reinforcement learning', 'linear regression', 0.9078303425774877),
 ('data analysis', 'data science', 0.7598614769894451),
 ('Malware Analysis', 'malware', 0.7475488911026058),
 ('computational science', 'signal processing', 0.7471367393685888),
 ('audio signal processing', 'computational science', 0.7409381400054285),
 ('data analysis', 'data mining', 0.7209272990842281),
 ('visualization', 'data visualization', 0.7170231495884832),
 ('static program analysis',
  'static program analysis tool',
  0.709497698679861),
 ('mathematical finance', 'finance', 0.69197885900

In [86]:
sorted_overlap = sorted(overlap_triples, key=lambda x: x[2], reverse=True)
line = [(i, overlap[2]) for i, overlap in enumerate(sorted_overlap)]
# plot line 
import plotly.graph_objects as go
fig = go.Figure(data=go.Scatter(x=[x[0] for x in line], y=[x[1] for x in line]))
fig.show()

In [89]:
# plot the number of keywords in each label
fig = go.Figure(data=go.Scatter(x=[i for i in range(len(keywords))], y=[len(set(keywords[key])) for key in keywords.keys()]))
fig.show()


In [90]:
fig = go.Figure(data=go.Scatter(x=[i for i in range(len(keywords))], y=[len(keywords[key]) for key in keywords.keys()]))
fig.show()