In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import scipy.sparse as sparse
import seaborn as sns
from gensim.corpora import Dictionary

# Shared plotting defaults: figure size, resolution and colour palette are kept
# in module-level names so the plotting cell can reuse them.
figsize = (5, 4)
dpi = 300
palette = "rocket"

# Apply the seaborn grid theme and make the palette the default colour cycle.
sns.set_style("whitegrid")
sns.set_palette(palette)

# Font handling for exported figures: fonttype 42 asks matplotlib to embed
# TrueType fonts in PDF/PS output, and text/math are rendered with STIX fonts.
import matplotlib
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
})

In [2]:
# Path template for the per-source co-occurrence matrices; the source name
# fills the %s slot.
location = '../../data/cooccurrence_maps/%s/cooccurrence_map.npz'

# Programming languages for which a per-language variant of a source exists.
languages = ['go', 'java', 'javascript', 'php', 'python', 'ruby']

# Each family is the aggregate source followed by one "<FAMILY>-<language>"
# entry per language, in the order of `languages`.
docstring_sources = ['DOCSTRING']
docstring_sources.extend('DOCSTRING-' + lang for lang in languages)

so_sources = ['SO']
so_sources.extend('SO-' + lang for lang in languages)

# LKML has no per-language variants.
LKML_sources = ['LKML']


In [3]:
# Load one sparse co-occurrence matrix per source. The list order matches the
# concatenation docstring_sources + so_sources + LKML_sources.
cooc_maps = [
    sparse.load_npz(location % source_name)
    for source_name in docstring_sources + so_sources + LKML_sources
]

# gensim Dictionary holding the tag vocabulary.
tags_vocab_location = '../../data/corpora/multilingual/so_majority/tags.dct'
tags_vocab = Dictionary.load(tags_vocab_location)


In [4]:
# Build, for every source, a mapping
#     tag -> [(co-occurring tag, count), ...]
# from that source's co-occurrence matrix. Row and column indices are
# translated to tag strings through `tags_vocab`.
dists_by_source = dict()
for source, cooc_map in zip(docstring_sources + so_sources + LKML_sources, cooc_maps):
    tag_dists = dict()
    # Read the stored entries once in COO (row, col, value) form instead of
    # doing a separate sparse-matrix lookup for every nonzero cell.
    coo = cooc_map.tocoo()
    for i, j, count in zip(coo.row, coo.col, coo.data):
        if count == 0:
            # Explicitly stored zeros are skipped, as .nonzero() would do.
            continue
        tag_dists.setdefault(tags_vocab[i], []).append((tags_vocab[j], count))
    dists_by_source[source] = tag_dists

In [5]:
def _is_label_tag(token):
    """Return True if `token` is all-uppercase letters or ends with '$'.

    NOTE(review): these look like label-style tags (as opposed to ordinary
    tokens) in the tag vocabulary -- confirm against the corpus.
    """
    # str.endswith is safe on an empty string, whereas token[-1] would raise.
    return (token.upper() == token and token.isalpha()) or token.endswith('$')


# For every source and every label-style tag, draw a log-scale bar chart of how
# often each other (non-label, non-'@O@') tag co-occurs with it, and save the
# chart to ./plots/<source>/<tag>.pdf.
for source in dists_by_source.keys():
    inner_view = dists_by_source[source]
    for tag in [t for t in inner_view.keys() if _is_label_tag(t)]:
        # Highest counts first; '@O@' and other label-style tags are dropped.
        counts = dict(
            pair
            for pair in sorted(inner_view[tag], key=lambda p: p[-1], reverse=True)
            if pair[0] != '@O@' and not _is_label_tag(pair[0])
        )
        data = pd.DataFrame(data={tag: counts})
        data['Co-Occurring Tag'] = data.index
        if data.empty:
            # Nothing left to plot for this tag.
            continue

        # Bars in descending count order. A concrete list is passed because
        # reversed() on a string-indexed Series depends on positional integer
        # lookups and yields a one-shot iterator.
        bar_order = data.sort_values(tag)['Co-Occurring Tag'].tolist()[::-1]

        fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
        sns.barplot(data=data,
                    x='Co-Occurring Tag',
                    y=tag,
                    order=bar_order,
                    palette=palette,
                    ax=ax)
        ax.set_yscale('log')
        ax.tick_params(axis='x', rotation=90)
        ax.set_ylabel('Co-Occurrences for %s' % tag)
        ax.set_title('Co-Occurrences for "%s"' % source)

        os.makedirs('./plots/%s' % source, exist_ok=True)
        fig.savefig("./plots/%s/%s.pdf" % (source, tag),
            # High resolution, a simple recommendation for publication plots
            dpi=1000,
            # Crop the saved figure tightly around its contents
            bbox_inches='tight',
        )
        # Release the figure; many are created in this loop.
        plt.close(fig)