In [None]:
import warnings
warnings.filterwarnings('ignore')
import math
import os
import pickle

import numpy
import pandas
import pyLDAvis
import pyLDAvis.sklearn
import seaborn
pyLDAvis.enable_notebook()

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from urllib.request import urlretrieve

In [None]:
# Seed NumPy's global random number generator with a fixed value so that
# anything drawing from it gives the same results on every run.
numpy.random.seed(0xC0FFEE) 

In [None]:
# Files to fetch from `base_url` into `destination_directory`.
files = [
    'full_lda_model.pickle',
    'full_tf.pickle',
    'full_tf_vectorizer.pickle',
    'full-dataset.csv.gz',
]

# NOTE(review): the path component of this URL looks like a private access
# token. Consider loading it from an environment variable rather than
# committing it with the notebook.
base_url = 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/'
destination_directory = '../data/'

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(destination_directory, exist_ok=True)

for file in files:
    url = base_url + file
    destination = destination_directory + file

    # Re-running the notebook should not re-download files that are already there.
    if os.path.exists(destination):
        print("Skipping '{}': already present at '{}'".format(url, destination))
        continue

    print("Downloading '{}' to location '{}'".format(url, destination))

    # Download under a temporary name and rename once complete, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a finished download.
    partial_destination = destination + '.part'
    urlretrieve(url, partial_destination)
    os.replace(partial_destination, destination)

In [None]:
# Read the gzipped CSV with header=None (no row is treated as a header) and
# assign the three column names explicitly.
minimal_dataset = pandas.read_csv(
    "../data/dataset.csv.gz",
    names=['repo', 'language', 'documents'],
    header=None,
)

# Show the first few rows as the cell output.
minimal_dataset.head()

In [None]:
def calculate_language_percentages(group):
    """Return each language's share of the total document length in `group`.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame whose rows unpack (after the index) as ``repo, language,
        document``. Only rows whose language is exactly ``'python'``, ``'r'``
        or ``'javascript'`` are counted; each contributes ``len(document)``.

    Returns
    -------
    pandas.Series
        Three fractions in the order python, r, javascript. When no counted
        text is present (empty group, no matching languages, or only empty
        documents) the shares are undefined and all three values are NaN
        instead of raising ZeroDivisionError.
    """
    lengths = {'python': 0, 'r': 0, 'javascript': 0}

    for _index, _repo, language, document in group.itertuples():
        if language in lengths:
            lengths[language] += len(document)

    total_length = sum(lengths.values())

    # Nothing to divide by: report the shares as undefined rather than crashing.
    if total_length == 0:
        return pandas.Series([float('nan')] * 3)

    return pandas.Series([
        lengths['python'] / total_length,
        lengths['r'] / total_length,
        lengths['javascript'] / total_length,
    ])


In [None]:
def get_word_importance_per_topic(model, tf_vectorizer):
    """List every vocabulary word with its weight for each topic, heaviest first.

    Parameters
    ----------
    model
        Object exposing ``components_``: an iterable with one sequence of
        per-feature weights per topic.
    tf_vectorizer
        Object exposing ``get_feature_names_out()`` or the older
        ``get_feature_names()``, returning the feature names by index.

    Returns
    -------
    list of list of (word, weight) tuples
        One inner list per topic, sorted by weight in descending order.
    """
    # scikit-learn renamed get_feature_names() to get_feature_names_out() and
    # later removed the old name; support both so the notebook runs on either.
    if hasattr(tf_vectorizer, 'get_feature_names_out'):
        feature_names = tf_vectorizer.get_feature_names_out()
    else:
        feature_names = tf_vectorizer.get_feature_names()

    word_importance_per_topic = []

    for topic_components in model.components_:
        word_importance = sorted(
            (
                (feature_names[feature_index], feature_importance)
                for feature_index, feature_importance in enumerate(topic_components)
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )
        word_importance_per_topic.append(word_importance)

    return word_importance_per_topic

In [None]:


def jaccard_index(a, b):
    """Return the Jaccard index of two collections: |a & b| / |a | b|.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Parameters
    ----------
    a, b : iterable of hashable items

    Returns
    -------
    float
        A value between 0.0 (no shared items) and 1.0 (identical sets). Two
        empty inputs are treated as identical and give 1.0 rather than raising
        ZeroDivisionError.
    """
    a = set(a)
    b = set(b)

    num_total = len(a | b)

    # An empty union means both inputs are empty, i.e. the sets are equal.
    if num_total == 0:
        return 1.0

    return len(a & b) / num_total
# One row per language, one column per topic. Each cell is the Jaccard index
# between that language's keyword list and that topic's keyword list, divided
# by the number of keywords in the language's list.
jaccard_indexes = pandas.DataFrame(
    numpy.array([
        [
            jaccard_index(language_keywords, topic_keywords) / len(language_keywords)
            for topic_keywords in (
                most_importance_topic_1_keywords,
                most_importance_topic_2_keywords,
                most_importance_topic_3_keywords,
                most_importance_topic_4_keywords,
            )
        ]
        for language_keywords in (javascript_keywords, r_keywords, python_keywords)
    ]),
    index=['Javascript', 'R', 'Python'],
    columns=['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4'],
)

# Render the table as a heatmap with the 'Blues' colour map.
hm = seaborn.heatmap(jaccard_indexes, cmap='Blues')

In [None]:

# NOTE(review): this expression repeats the `jaccard_indexes` computation from
# the previous cell; confirm whether both cells are needed.
#
# One row per language, one column per topic. Each cell is the Jaccard index
# between that language's keyword list and that topic's keyword list, divided
# by the number of keywords in the language's list.
jaccard_indexes = pandas.DataFrame(
    numpy.array([
        [
            jaccard_index(language_keywords, topic_keywords) / len(language_keywords)
            for topic_keywords in (
                most_importance_topic_1_keywords,
                most_importance_topic_2_keywords,
                most_importance_topic_3_keywords,
                most_importance_topic_4_keywords,
            )
        ]
        for language_keywords in (javascript_keywords, r_keywords, python_keywords)
    ]),
    index=['Javascript', 'R', 'Python'],
    columns=['Topic 1', 'Topic 2', 'Topic 3', 'Topic 4'],
)

hm = seaborn.heatmap(
    jaccard_indexes,
    cmap='Blues',