In [1]:
import scipy.sparse as sparse
import seaborn as sns
from gensim.corpora import Dictionary

# Figure style defaults shared by the plots in this notebook.
# NOTE(review): the seaborn call stays ahead of the rcParams overrides below;
# set_style writes to matplotlib's rcParams too, so running it afterwards
# could presumably undo the font settings -- confirm before reordering.
sns.set_style("whitegrid")
figsize, dpi, palette = (5, 4), 300, "dark"

# Fix plot fonts: font type 42 makes the PDF/PS backends embed TrueType fonts,
# and STIX is used for both regular text and math text.
import matplotlib
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
})

In [2]:
# Path template for the stored co-occurrence matrices; the '%s' placeholder is
# filled in with a source name from the lists below.
location = '../../data/cooccurrence_maps/%s/cooccurrence_map.npz'
languages = ['go', 'java', 'javascript', 'php', 'python', 'ruby']

# Each family starts with its aggregate source name, followed by one
# '<FAMILY>-<language>' entry per language.
docstring_sources = ['DOCSTRING']
docstring_sources.extend('DOCSTRING-%s' % lang for lang in languages)

so_sources = ['SO']
so_sources.extend('SO-%s' % lang for lang in languages)

# LKML has a single aggregate source and no per-language variants.
LKML_sources = ['LKML']


In [3]:
# Load one sparse co-occurrence matrix per source. The list order follows
# docstring_sources + so_sources + LKML_sources, so it can later be zipped
# back against that same concatenation.
cooc_maps = [
    sparse.load_npz(location % source_name)
    for source_name in docstring_sources + so_sources + LKML_sources
]

# gensim Dictionary used further down to translate matrix indices into tags.
tags_vocab_location = '../../data/corpora/multilingual/so_majority/tags.dct'
tags_vocab = Dictionary.load(tags_vocab_location)


In [9]:
# For every source, build a mapping
#     tags_vocab[i] -> [(tags_vocab[j], value), ...]
# holding each non-zero entry (i, j) of that source's co-occurrence matrix.
dists_by_source = dict()
for source, cooc_map in zip(docstring_sources + so_sources + LKML_sources, cooc_maps):
    # Read the entries straight from the COO triplets instead of indexing
    # cooc_map[(i, j)] once per entry: per-element indexing of a sparse matrix
    # is slow and is not supported at all by the COO format, which load_npz
    # may return depending on how the file was saved.
    # NOTE(review): assumes the stored matrix has no duplicate (i, j) entries
    # -- confirm if the maps are ever built without canonicalisation.
    coo = cooc_map.tocoo()
    # Same filter .nonzero() applies: drop explicitly stored zeros while
    # keeping the original entry order.
    keep = coo.data != 0
    tag_dists = dict()
    for i, j, value in zip(coo.row[keep], coo.col[keep], coo.data[keep]):
        # setdefault keeps tag_dists a plain dict and, unlike a try/except
        # KeyError around the whole statement, cannot mistake a failed
        # tags_vocab lookup for a tag that has not been seen yet.
        tag_dists.setdefault(tags_vocab[i], []).append((tags_vocab[j], value))
    dists_by_source[source] = tag_dists

[(0, 8),
 (0, 55),
 (1, 5),
 (1, 8),
 (1, 12),
 (1, 17),
 (1, 18),
 (1, 28),
 (1, 40),
 (1, 55),
 (1, 71),
 (1, 81),
 (1, 82),
 (1, 84),
 (1, 88),
 (1, 91),
 (1, 94),
 (1, 103),
 (1, 105),
 (1, 107),
 (1, 108),
 (2, 6),
 (2, 55),
 (2, 81),
 (2, 106),
 (3, 8),
 (3, 12),
 (3, 13),
 (3, 21),
 (3, 50),
 (3, 55),
 (3, 72),
 (3, 73),
 (3, 81),
 (3, 103),
 (3, 106),
 (4, 4),
 (4, 8),
 (4, 12),
 (4, 19),
 (4, 48),
 (4, 50),
 (4, 72),
 (4, 73),
 (4, 76),
 (4, 116),
 (4, 118),
 (5, 1),
 (5, 6),
 (5, 8),
 (5, 15),
 (5, 18),
 (5, 28),
 (5, 32),
 (5, 44),
 (5, 55),
 (5, 59),
 (5, 62),
 (5, 65),
 (5, 68),
 (5, 72),
 (5, 77),
 (5, 78),
 (5, 81),
 (5, 82),
 (5, 84),
 (5, 88),
 (5, 104),
 (5, 105),
 (5, 106),
 (5, 107),
 (5, 108),
 (5, 116),
 (5, 118),
 (5, 119),
 (5, 120),
 (6, 2),
 (6, 5),
 (6, 7),
 (6, 8),
 (6, 10),
 (6, 11),
 (6, 15),
 (6, 19),
 (6, 22),
 (6, 24),
 (6, 29),
 (6, 37),
 (6, 43),
 (6, 51),
 (6, 54),
 (6, 55),
 (6, 62),
 (6, 65),
 (6, 68),
 (6, 69),
 (6, 72),
 (6, 76),
 (6, 77),
 (6, 8