# Topic Modelling on Program Source Code
---
By Kishalay Banerjee, Dan Jones and Sam Harding

In [1]:
import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import math
import os
import pickle

import numpy
import pandas
import pyLDAvis
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()

  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import defaultdict, deque, Sequence
  from collections import Hashable


In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from urllib.request import urlretrieve

In [13]:
# Seed NumPy's global random number generator so repeated runs start from the same state.
# NOTE(review): presumably this is what makes the LDA fits below repeatable - confirm they draw from the global RNG.
numpy.random.seed(0xC0FFEE) 

## Preparation

There are a number of files which are too large to store on GitHub. These are hosted on our server, and can be downloaded by running the following cell:

In [5]:
# Artefacts that are fetched from the project server instead of being stored
# in the repository (see the markdown cell above).
files = [
    'full_lda_model.pickle',
    'full_tf.pickle',
    'full_tf_vectorizer.pickle',
    'full-dataset.csv.gz',
]

# NOTE(review): the path component of this URL looks like an access token, and
# the cell prints it into the saved output. Consider reading it from an
# environment variable and clearing the output before sharing the notebook.
base_url = 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/'
destination_directory = '../data/'

# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(destination_directory, exist_ok=True)

for file in files:
    url = base_url + file
    destination = destination_directory + file

    # Re-running the notebook should not fetch the large files again.
    if os.path.exists(destination):
        print("Skipping '{}': already present at '{}'".format(file, destination))
        continue

    print("Downloading '{}' to location '{}'".format(url, destination))

    # Download under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated file that later cells would try to load.
    partial_destination = destination + '.part'
    urlretrieve(url, partial_destination)
    os.replace(partial_destination, destination)

Downloading 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/full_lda_model.pickle'' to location '../data/full_lda_model.pickle'
Downloading 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/full_tf.pickle'' to location '../data/full_tf.pickle'
Downloading 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/full_tf_vectorizer.pickle'' to location '../data/full_tf_vectorizer.pickle'
Downloading 'https://daniel.wilshirejones.com/private-uUX6IzfsRYLNiti4ZFmgv6U3dFInnq37r5YSQs46iejeB96q0MAy9Ko7hkgo/full-dataset.csv.gz'' to location '../data/full-dataset.csv.gz'


## Generating the Dataset

TODO: Import dataset.py and explain how it's used + what it does.

TODO: Add Sam's scrub function in here?

In [15]:
# Load the small dataset. header=None tells pandas the file has no header row,
# so the column names are supplied explicitly.
minimal_dataset = pandas.read_csv("../data/dataset.csv.gz", header=None, names=['repo', 'language', 'documents'])
# Show the first rows as a sanity check.
minimal_dataset.head()

Unnamed: 0,repo,language,documents
0,28457823,javascript,"b""module.exports = {\n plugins: [\n requir..."
1,28457823,javascript,"b""// The path where to mount the REST API app\..."
2,28457823,javascript,"b""import { Observable } from 'rx';\nimport deb..."
3,28457823,javascript,"b""import { Observable } from 'rx';\n// import ..."
4,28457823,javascript,"b""import { Observable } from 'rx';\n\nmodule.e..."


In [16]:
# Load the full dataset (one of the files fetched by the download cell). Unlike the
# minimal dataset it carries an extra 'topics' column between 'language' and 'documents'.
full_dataset = pandas.read_csv("../data/full-dataset.csv.gz", header=None, names=['repo', 'language',  'topics', 'documents'])

# Remove Github 'topics' since we don't use them in this analysis
full_dataset = full_dataset.drop(columns='topics')

# Show the first rows as a sanity check.
full_dataset.head()

Unnamed: 0,repo,language,documents
0,28457823,javascript,"b""module.exports = {\n plugins: [\n requir..."
1,28457823,javascript,"b""// The path where to mount the REST API app\..."
2,28457823,javascript,"b""import { Observable } from 'rx';\nimport deb..."
3,28457823,javascript,"b""import { Observable } from 'rx';\n// import ..."
4,28457823,javascript,"b""import { Observable } from 'rx';\n\nmodule.e..."


In [17]:
# Load the held-out test dataset; it uses the same four-column layout as the full dataset.
test_dataset = pandas.read_csv("../data/test-dataset.csv.gz", header=None, names=['repo', 'language', 'topics', 'documents'])

# Remove Github 'topics' since we don't use them in this analysis
test_dataset = test_dataset.drop(columns='topics')

# Show the first rows as a sanity check.
test_dataset.head()

Unnamed: 0,repo,language,documents
0,69798748,javascript,"b""const glob = require('glob')\nconst markdown..."
1,128624453,javascript,"b""module.exports = {\n extends: ['@commitlint..."
2,128624453,javascript,"b""module.exports = {\n extends: ['standard', ..."
3,128624453,javascript,"b""const cp = require('child_process')\nconst g..."
4,128624453,javascript,"b""module.exports = {\n verbose: true,\n tran..."


To evaluate the mixture model, we must label each repository with its percentage of each programming language:

In [18]:
def calculate_language_percentages(group):
    """Return the share of a repository's source text written in each language.

    The share is measured in characters: the summed ``len()`` of every
    document of a language divided by the summed length over the three
    tracked languages.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single repository. Must provide a 'language' and a
        'documents' column; any other columns are ignored.

    Returns
    -------
    pandas.Series
        Three positional values in the order python, r, javascript. All three
        are 0.0 when the group holds no text in any tracked language, which
        previously raised ZeroDivisionError.
    """
    tracked_languages = ('python', 'r', 'javascript')
    lengths = dict.fromkeys(tracked_languages, 0)

    # Look the columns up by name so the result does not depend on how many
    # columns the group has or on their order.
    for language, document in zip(group['language'], group['documents']):
        if language in lengths:
            lengths[language] += len(document)

    total_length = sum(lengths.values())
    if total_length == 0:
        return pandas.Series([0.0] * len(tracked_languages))

    return pandas.Series([lengths[language] / total_length for language in tracked_languages])

In [19]:
# Compute the language shares once per repository of the test set.
test_composition_actual = test_dataset.groupby(by='repo').apply(calculate_language_percentages)
# The helper returns unnamed positional values; label them in the order it produces them.
test_composition_actual.columns = ['python', 'r', 'javascript']

Here are the programming language percentages for each repository in our test dataset:

In [20]:
test_composition_actual

Unnamed: 0_level_0,python,r,javascript
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
596892,1.0,0.0,0.0
1248263,0.652443,0.0,0.347557
1790564,0.0,0.0,1.0
4751958,0.36073,0.0,0.63927
12465340,0.0,0.995105,0.004895
13523710,0.0,1.0,0.0
14267375,0.941701,0.0,0.058299
14579179,0.0,0.189083,0.810917
16146440,0.0,0.253421,0.746579
17856544,0.0,1.0,0.0


In [21]:
# NOTE(review): concat_texts is only defined in a later cell (section "Combining Source
# Files per Repository"), so on a fresh kernel this cell fails unless that cell runs first.
combined_test_documents = test_dataset.groupby(by='repo').apply(concat_texts)

## Topic Modelling on Individual Source Files

Basically done, just need to copy it over. Maybe run on the bigger dataset?

TODO:
  - Copy work from documentation/daniel-jones.ipynb
  - Add visualisation with pyldavis

For our purposes, common words are important and rare words aren't. So we shouldn't use tf-idf as a metric, bag-of-words makes more sense. (TODO: Maybe: "Similarly, filter out words that don't occur very often").


In [8]:
documents = minimal_dataset['documents']

In [None]:
# Bag-of-words features: token counts, with no stop-word filtering.
tf_vectorizer = CountVectorizer(stop_words=None)
tf = tf_vectorizer.fit_transform(documents)

# Persist the count matrix and the fitted vectorizer; a later cell reloads both from these paths.
with open('../data/minimal_lda_tf.pickle', 'wb') as f:
    pickle.dump(tf, f)
    
with open('../data/minimal_lda_tf_vectorizer.pickle', 'wb') as f:
    pickle.dump(tf_vectorizer, f)

We have four programming languages, try to use LDA to determine these four programming languages.

In [None]:
# One LDA topic per programming language we hope to recover.
number_of_languages = 4

# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=number_of_languages, n_jobs=1)
model = lda.fit(tf)

# Cache the fitted model so it can be reloaded without refitting.
with open('../data/minimal_lda_model.pickle', 'wb') as f:
    pickle.dump(model, f)

In [9]:
# Reload the artefacts written by the two preceding cells, so the visualisation
# below can run without refitting.
with open('../data/minimal_lda_model.pickle', 'rb') as f:
    model = pickle.load(f)
    
with open('../data/minimal_lda_tf.pickle', 'rb') as f:
    tf = pickle.load(f)
    
with open('../data/minimal_lda_tf_vectorizer.pickle', 'rb') as f:
    tf_vectorizer = pickle.load(f)

In [10]:
pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer)

Try to do this on the full dataset:

In [None]:
# One LDA topic per programming language we hope to recover.
number_of_languages = 4
all_documents = full_dataset['documents']

# Bag-of-words features over every file of the full dataset.
full_tf_vectorizer = CountVectorizer(stop_words=None)
full_tf = full_tf_vectorizer.fit_transform(all_documents)
full_tf_feature_names = full_tf_vectorizer.get_feature_names()

# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
full_lda = LatentDirichletAllocation(n_components=number_of_languages, n_jobs=1)
full_model = full_lda.fit(full_tf)

# Cache the model, the count matrix and the vectorizer under the same file
# names that the download cell fetches.
with open('../data/full_lda_model.pickle', 'wb') as f:
    pickle.dump(full_model, f)

with open('../data/full_tf.pickle', 'wb') as f:
    pickle.dump(full_tf, f)

with open('../data/full_tf_vectorizer.pickle', 'wb') as f:
    pickle.dump(full_tf_vectorizer, f)

In [11]:
# Load the full-dataset model, count matrix and vectorizer.
# NOTE(review): these three file names are among those fetched by the download cell, and
# pickle.load can execute arbitrary code - only load them from a source you trust.
with open('../data/full_lda_model.pickle', 'rb') as f:
    full_model = pickle.load(f)

with open('../data/full_tf.pickle', 'rb') as f:
    full_tf = pickle.load(f)
    
with open('../data/full_tf_vectorizer.pickle', 'rb') as f:
    full_tf_vectorizer = pickle.load(f)

Visualize our new model. Note that the following code cell requires more than 8 GB of RAM to run.

In [None]:
pyLDAvis.sklearn.prepare(full_model, full_tf, full_tf_vectorizer)

Reference: https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

## Topic Modelling on Programming Language Mixtures
Keypoint: topics are programming languages, file with mixture of programming languages, identify which is which.

Applicability to cyber-security: identifying malware embedded within normal programs (shellcode).

### Combining Source Files per Repository

Currently our data set consists of one data point per file containing:
  1. ID of the Github repository the file belongs to.
  2. Programming language it is written in (identified by file extension).
  3. File contents
  
In this section, we extend our analysis from working on documents with one language per file, to a system where there is a mixture of languages inside each document. To do this, we combine all the files in each repository into a single data point.

In [22]:
def concat_texts(group):
    """Join every document of one repository into a single space-separated string.

    `group` is the DataFrame slice for one repository and must provide the
    'repo' and 'documents' columns.
    """
    # Unpacking into a one-element tuple doubles as a sanity check: it raises
    # ValueError unless the group contains exactly one distinct repo id.
    (_only_repo_id,) = group['repo'].unique()
    return ' '.join(group['documents'])

In [23]:
combined_documents = minimal_dataset.groupby(by='repo').apply(concat_texts)

In [24]:
combined_documents.head()

repo
19438      b"#' @include ggplot-global.R\n#' @include ggp...
26554      b'# The contents of this file are subject to t...
544208     b'import logging\nimport os\nimport platform a...
643909     b'#\' Environment variables to set when callin...
2594513    b'# S3 method to deal with chunks and inline t...
dtype: object

### LDA Model 

In [25]:
# Preview the test documents followed by the training documents.
# pandas.concat is used because Series.append was removed in pandas 2.0.
pandas.concat([combined_test_documents, combined_documents])

repo
596892       b'from __future__ import print_function\n\nfro...
1248263      b'// VT100.js -- a text terminal emulator in J...
1790564      b"'use strict';\n\nconst ExtractTextPlugin = r...
4751958      b'from .celery import app as celery_app\n\n__a...
12465340     b'#\' @title Aggregation object.\n#\' @descrip...
13523710     b'do_nxt <- function(e)UseMethod("do_nxt")\ndo...
14267375     b'import unittest\nimport os\nimport json\n\nf...
14579179     b'#\' Add data to a plotly visualization\n#\' ...
16146440     b'base64_encode_file <- function(in_file, enco...
17856544     b'#\' Complete list of palettes\n#\'\n#\' Use ...
19117456     b'import React, {Component} from \'react\';\ni...
21289110     b'# coding: utf-8\n\n"""\n    The approach tak...
23932217     b'\n# TODO:\n#  - catch errors and throw a war...
24929423     b'############################################...
28556914     b'#\' R + mermaid.js\n#\'\n#\' Make diagrams i...
33614304     b'# Copyright (c) 2016, Aaron Christi

In [26]:
# Learn the vocabulary from the test and training documents together, so both
# matrices share the same columns. pandas.concat is used because Series.append
# was removed in pandas 2.0.
tf_vectorizer = CountVectorizer(stop_words=None)
tf_vectorizer.fit(pandas.concat([combined_test_documents, combined_documents]))

tf_train = tf_vectorizer.transform(combined_documents)
tf_test = tf_vectorizer.transform(combined_test_documents)

# Persist the vectorizer and both count matrices for the later reload cells.
with open('../data/concatDocs_tf_vectorizer.pickle', 'wb') as f:
    pickle.dump(tf_vectorizer, f)

with open('../data/concatDocs_tf_train.pickle', 'wb') as f:
    pickle.dump(tf_train, f)

with open('../data/concatDocs_tf_test.pickle', 'wb') as f:
    pickle.dump(tf_test, f)

In [27]:
# One LDA topic per programming language we hope to recover.
number_of_languages = 4

# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=number_of_languages, n_jobs=1)
# Fit on the per-repository training documents only.
model = lda.fit(tf_train)

# Cache the fitted model so it can be reloaded without refitting.
with open('../data/concatDocs_lda_model.pickle', 'wb') as f:
    pickle.dump(model, f)



In [11]:
# Reload the per-repository model, the train and test count matrices and the
# vectorizer written by the two preceding cells.
with open('../data/concatDocs_lda_model.pickle', 'rb') as f:
    model = pickle.load(f)
    
with open('../data/concatDocs_tf_train.pickle', 'rb') as f:
    tf_train = pickle.load(f)
    
with open('../data/concatDocs_tf_vectorizer.pickle', 'rb') as f:
    tf_vectorizer = pickle.load(f)

with open('../data/concatDocs_tf_test.pickle', 'rb') as f:
    tf_test = pickle.load(f)

In [28]:
pyLDAvis.sklearn.prepare(model, tf_train, tf_vectorizer)

### Alternative Splitting Function

In [19]:
def trim(text):
    """Strip surrounding whitespace from a string, or from every string in a list.

    Returns the stripped string, or a new list of stripped strings.
    Raises TypeError for any other input. The previous version returned the
    string "Not Correct Format" instead, which callers then treated as data.
    """
    if isinstance(text, str):
        return text.strip()
    if isinstance(text, list):
        return [item.strip() for item in text]
    raise TypeError("trim() expects a str or a list of str, got {}".format(type(text).__name__))


def splat(text, splt):
    """Split `text` on the separator `splt` and return the trimmed pieces as a flat list.

    A string has its first two characters and its last character removed
    before splitting. NOTE(review): presumably this drops the b' / b" prefix
    and the closing quote of a bytes literal stored as text - confirm against
    the dataset. A list has every element split, and the pieces are flattened
    into one list.

    Raises TypeError for any other input. The previous version returned the
    string "wutface" instead, which scrub() then tokenised character by
    character.
    """
    if isinstance(text, str):
        return trim(text[2:-1].split(splt))
    if isinstance(text, list):
        return trim([piece for item in text for piece in item.split(splt)])
    raise TypeError("splat() expects a str or a list of str, got {}".format(type(text).__name__))


def scrub(string, inclComments=False):
    """Tokenise one document into a list of non-empty tokens.

    The document is split into lines on the two-character sequence backslash-n.
    Unless `inclComments` is true, lines starting with '//' or '#' are dropped.
    The remaining lines are split on '(', '[', '{' and finally on spaces.

    Raises TypeError (from splat) when `string` is not a str.
    """
    lines = splat(string, "\\n")
    if not inclComments:
        lines = [line for line in lines if not line.startswith(("//", "#"))]

    tokens = lines
    # The order matters only for readability; each pass splits on one delimiter.
    for delimiter in ("(", "[", "{", " "):
        tokens = splat(tokens, delimiter)

    return [token for token in tokens if token != ""]

In [20]:
combined_split_documents=combined_documents.apply(scrub)

In [21]:
combined_split_documents

repo
19438        [NULL, NULL, ", b'#\', Calculated, aesthetics,...
26554        [import, os, from, r2.lib.translation, import,...
544208       [import, logging, import, os, import, platform...
643909       [r_env_vars, <-, function, ), vars, <-, c, "R_...
2594513      [process_group, =, function, x), UseMethod, \'...
3834332      [\'\'\', \'\'\', from, __future__, import, pri...
4729944      [shinyApp, <-, function, ui=NULL,, server=NULL...
4751958      [from, .celery, import, app, as, celery_app, _...
6427813      [loc, <-, function, data), .Call, `_dplyr_loc`...
8162715      [const, ROOT, =, "../saleor/static/dashboard-n...
10270250     ['use, strict';, const, es5Paths,, esNextPaths...
13926404     [from, __future__, import, print_function, imp...
14098069     [var, gulp, =, require, 'gulp');, var, markdow...
14579179     [add_data, <-, function, p,, data, =, NULL), i...
18840003     [/**, *, React, Starter, Kit, https://www.reac...
22003158     [library, rvest), library, purrr), Li

## Identifying Program Subjects and Themes

Hypothesis: Using tf-idf rather than bag-of-words as an input to LDA will prioritise rare words. In the case of source code, this means programming language keywords (an identifying feature of programming languages) are deprioritised, and so a more human idea of topics may emerge. 

We can use repo-list.json and the repo IDs to map the GitHub topics/tags to each repo. It might be a small/easy task to compare these against the programming language identification.

In [16]:
documents = minimal_dataset['documents']

In [18]:
# tf-idf features instead of raw counts, again with no stop-word filtering.
# NOTE(review): this rebinds tf_vectorizer and tf, replacing the count-based objects from earlier cells.
tf_vectorizer = TfidfVectorizer(stop_words=None)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Persist the tf-idf matrix and the fitted vectorizer for the reload cell below.
with open('../data/tfidf_lda_tf.pickle', 'wb') as f:
    pickle.dump(tf, f)
    
with open('../data/tfidf_lda_tf_vectorizer.pickle', 'wb') as f:
    pickle.dump(tf_vectorizer, f)

Here we are looking for subject themes rather than programming languages, so we fit LDA with three topics on the tf-idf features:

In [19]:
# Number of LDA topics ("themes") to look for in the tf-idf features.
number_of_themes = 3

# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=number_of_themes, n_jobs=1)
model = lda.fit(tf)

# Cache the fitted model so it can be reloaded without refitting.
with open('../data/tfidf_lda_model.pickle', 'wb') as f:
    pickle.dump(model, f)



In [20]:
# Reload the tf-idf model, matrix and vectorizer written by the two preceding cells.
with open('../data/tfidf_lda_model.pickle', 'rb') as f:
    model = pickle.load(f)
    
with open('../data/tfidf_lda_tf.pickle', 'rb') as f:
    tf = pickle.load(f)
    
with open('../data/tfidf_lda_tf_vectorizer.pickle', 'rb') as f:
    tf_vectorizer = pickle.load(f)

In [None]:
pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer)

This still prioritises programming language keywords. One approach to solving this problem is to consider all keywords as "stopwords". First, gather a list of R, Python and Javascript keywords:

In [21]:
import keyword

python_keywords = keyword.kwlist
python_keywords

['False',
 'None',
 'True',
 'and',
 'as',
 'assert',
 'break',
 'class',
 'continue',
 'def',
 'del',
 'elif',
 'else',
 'except',
 'finally',
 'for',
 'from',
 'global',
 'if',
 'import',
 'in',
 'is',
 'lambda',
 'nonlocal',
 'not',
 'or',
 'pass',
 'raise',
 'return',
 'try',
 'while',
 'with',
 'yield']

R reserved words (sourced from the manual: https://stat.ethz.ch/R-manual/R-devel/library/base/html/Reserved.html)

In [22]:
r_keywords = [
    "if", 
    "else", 
    "repeat",
    "while",
    "function", 
    "for",
    "in",
    "next",
    "break",
    "TRUE",
    "FALSE",
    "NULL", 
    "Inf", 
    "NaN",
    "NA",
    "NA_integer_",
    "NA_real_",
    "NA_complex_",
    "NA_character_", 
]

Javascript keywords and reserved words (source: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#Keywords)

In [23]:
javascript_keywords = [  # jaccard("ideal javascript topic", topic_i)
    "break",
    "case",
    "catch",
    "class",
    "const",
    "continue",
    "debugger",
    "default",
    "delete",
    "do",
    "else",
    "export",
    "extends",
    "finally",
    "for",
    "function",
    "if",
    "import",
    "in",
    "instanceof",
    "new",
    "return",
    "super",
    "switch",
    "this",
    "throw",
    "try",
    "typeof",
    "var",
    "void",
    "while",
    "with",
    "yield",
]

In [24]:
documents = minimal_dataset['documents']

In [28]:
# Treat every language keyword as a stop word. The vectorizer lowercases the
# text before it removes stop words, so the stop words must be lowercase too;
# otherwise entries such as 'True', 'None', 'NULL' or 'NaN' are never filtered.
keyword_stop_words = sorted({
    word.lower() for word in javascript_keywords + python_keywords + r_keywords
})

tf_vectorizer = TfidfVectorizer(stop_words=keyword_stop_words)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

# Persist the keyword-free tf-idf matrix and vectorizer for the reload cell below.
with open('../data/tfidf_lda_tf_ignore_keywords.pickle', 'wb') as f:
    pickle.dump(tf, f)

with open('../data/tfidf_lda_tf_vectorizer_ignore_keywords.pickle', 'wb') as f:
    pickle.dump(tf_vectorizer, f)

As before, we fit LDA with three topics, this time on the tf-idf features with the language keywords treated as stop words:

In [29]:
# Number of LDA topics ("themes") to look for once the keywords are removed.
number_of_themes = 3

# `n_components` is the current name of the topic-count argument; the former
# `n_topics` spelling was deprecated and later removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=number_of_themes, n_jobs=1)
model = lda.fit(tf)

# Cache the fitted model so it can be reloaded without refitting.
with open('../data/tfidf_lda_model_ignore_keywords.pickle', 'wb') as f:
    pickle.dump(model, f)



In [30]:
# Reload the keyword-free tf-idf model, matrix and vectorizer written by the two preceding cells.
with open('../data/tfidf_lda_model_ignore_keywords.pickle', 'rb') as f:
    model = pickle.load(f)
    
with open('../data/tfidf_lda_tf_ignore_keywords.pickle', 'rb') as f:
    tf = pickle.load(f)
    
with open('../data/tfidf_lda_tf_vectorizer_ignore_keywords.pickle', 'rb') as f:
    tf_vectorizer = pickle.load(f)

In [None]:
pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer)

## Measuring the Efficacy of Topic Models (WRITEUP: WHO?)

Main question: How do we evaluate how well a topic (from LDA for example) represents a meaningful "topic" or theme?

TODO: do some research on this??? There must be some papers etc that try to formalise this that we can borrow ideas from?

Paper dump:
  - Looks like a good summary paper: http://www.aclweb.org/anthology/E14-4005 Find more papers from this ones references?
    - " KL-divergence (Li and McCallum, 2006; Wang et al., 2009; Newman et al., 2009), cosine measure (He et al., 2009; Ramage et al., 2009) and the average Log Odds Ratio (Chaney and Blei, 2012). "
    - "Kim and Oh (2011) also applied  the  cosine  measure  and  KL-Divergence which were compared with four other measures: Jaccard’s Coefficient, Kendall’s τ coefficient, Discount  Cumulative  Gain  and  Jensen  Shannon  Divergence (JSD)."
  - Cool name haven't read it: http://papers.nips.cc/paper/3700-reading-tea-leaves-how-humans-interpret-topic-models.pdf
  
We considered all of these metrics, and found the Jaccard Index to be the most suitable. This was primarily due to its use of set operations, which are invariant to ordering and number of observations.

### Reading Tea Leaves Paper
http://papers.nips.cc/paper/3700-reading-tea-leaves-how-humans-interpret-topic-models.pdf

### Word Overlap

I think this is used as a baseline measure in the summary paper above (http://www.aclweb.org/anthology/E14-4005). Should be a quick implementation so worth a try.

### Jaccard Index

From the papers above this seems to have been used relatively often for _linking machine-generated topics to human topics_ and so maybe this is a good application for it. Apparently explored here "https://link.springer.com/chapter/10.1007/978-3-642-19437-5_13" but I haven't read it.


In [9]:
def jaccard_index(a, b):
    """Return the Jaccard index |A ∩ B| / |A ∪ B| of two collections.

    Both arguments may be any iterable of hashable items; duplicates and
    ordering are ignored because the inputs are converted to sets.

    Returns a float between 0.0 (no shared items) and 1.0 (identical sets).
    Two empty collections count as identical and give 1.0, the usual
    convention; this case previously raised ZeroDivisionError.
    """
    set_a = set(a)
    set_b = set(b)
    union = set_a | set_b
    if not union:
        return 1.0
    return len(set_a & set_b) / len(union)

Consider the group of language keywords as the best possible topic for each language. Compare each of our machine generated topics with each of our ideal topics by computing their Jaccard Index:

Consider the group of language keywords as the best possible topic for each language. Compare each of our machine generated topics with each of our ideal topics by computing their Jaccard Index:


In [5]:
topic1_keywords = [

"this",
"function",
"if",
"the",
"return",
"var",
"to",
"for",
"is",
"true",
"in",
"of",
"value",
"data",
"null",
"length",
"and",
"else",
"false",
"name",
"type",
"new",
"const",
"it",
"assert",
"object",
"be",
"options",
"key",
"that"]

In [6]:
topic3_keywords = [
"self",
"def",
"import",
"from",
"none",
"user",
"in",
"ndef",
"response",
"name",
"assert",
"equal",
"id",
"not",
"str",
"models",
"true",
"request",
"get",
"email",
"assert",
"realm",
"data",
"message",
"result",
"for",
"nclass",
"dict",
"django",
"thread",
"url"]

In [7]:
topic4_keywords = [
"react",
"from",
"default",
"nimport",
"nexport",
"createsvgicon",
"path",
"import",
"fragment",
"xd0",
"xe0",
"classname",
"as",
"props",
"utils",
"none",
"div",
"proptypes",
"createelement",
"theme",
"xe1",
"material",
"xd1",
"button",
"classes",
"m0",
"fill",
"ui",
"xe2",
"0h24v24h0v0z"]

In [None]:
# Overlap between the JavaScript keyword list and each hand-copied topic term list
# (topics 1, 3 and 4, in that order).
print(jaccard_index(javascript_keywords, topic1_keywords),
jaccard_index(javascript_keywords, topic3_keywords),
jaccard_index(javascript_keywords, topic4_keywords))

In [None]:
# The topic term lists are all lowercase (e.g. 'true', 'null'), while the R
# keyword list contains 'TRUE', 'NULL', 'NaN', ... Lowercase the keywords
# before comparing; otherwise those entries can never overlap with a topic.
r_keywords_lower = [word.lower() for word in r_keywords]

# Overlap with topics 1, 3 and 4, in that order.
print(jaccard_index(r_keywords_lower, topic1_keywords),
      jaccard_index(r_keywords_lower, topic3_keywords),
      jaccard_index(r_keywords_lower, topic4_keywords))

In [None]:
# The topic term lists are all lowercase (e.g. 'true', 'none'), while the
# Python keyword list contains 'True', 'False' and 'None'. Lowercase the
# keywords before comparing; otherwise those entries can never overlap with a topic.
python_keywords_lower = [word.lower() for word in python_keywords]

# Overlap with topics 1, 3 and 4, in that order.
print(jaccard_index(python_keywords_lower, topic1_keywords),
      jaccard_index(python_keywords_lower, topic3_keywords),
      jaccard_index(python_keywords_lower, topic4_keywords))

### Kendall’s τ Coefficient

Measures the association between two ranked lists. Source: Computational Linguistics and Intelligent Text Processing book.

### Evaluating Topic Models
TODO: better title needed

Idea:
  - save the actual % of each programming language per repo
  - Then try to use LDA model to tell us "I believe repo <x> is 10% Topic 1, 20% Topic 2 etc". 
  - Use analysis from above two sections to create a "most likely mapping from lda topic to programming language".
  - rate our models

Here we can do cross-validation etc.

Load in the test data set:

In [32]:
# Load the held-out test dataset.
# NOTE(review): this repeats the identical load performed in an earlier cell.
test_dataset = pandas.read_csv("../data/test-dataset.csv.gz", header=None, names=['repo', 'language', 'topics', 'documents'])

# Remove Github 'topics' since we don't use them in this analysis
test_dataset = test_dataset.drop(columns='topics')

# Show the first rows as a sanity check.
test_dataset.head()

Unnamed: 0,repo,language,documents
0,69798748,javascript,"b""const glob = require('glob')\nconst markdown..."
1,128624453,javascript,"b""module.exports = {\n extends: ['@commitlint..."
2,128624453,javascript,"b""module.exports = {\n extends: ['standard', ..."
3,128624453,javascript,"b""const cp = require('child_process')\nconst g..."
4,128624453,javascript,"b""module.exports = {\n verbose: true,\n tran..."


To evaluate the mixture model, we must label each repository with its percentage of each programming language:

In [33]:
def calculate_language_percentages(group):
    """Return the share of a repository's source text written in each language.

    The share is measured in characters: the summed ``len()`` of every
    document of a language divided by the summed length over the three
    tracked languages.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single repository. Must provide a 'language' and a
        'documents' column; any other columns are ignored.

    Returns
    -------
    pandas.Series
        Three positional values in the order python, r, javascript. All three
        are 0.0 when the group holds no text in any tracked language, which
        previously raised ZeroDivisionError.
    """
    tracked_languages = ('python', 'r', 'javascript')
    lengths = dict.fromkeys(tracked_languages, 0)

    # Look the columns up by name so the result does not depend on how many
    # columns the group has or on their order.
    for language, document in zip(group['language'], group['documents']):
        if language in lengths:
            lengths[language] += len(document)

    total_length = sum(lengths.values())
    if total_length == 0:
        return pandas.Series([0.0] * len(tracked_languages))

    return pandas.Series([lengths[language] / total_length for language in tracked_languages])

In [34]:
# Compute the language shares once per repository of the test set.
test_composition_actual = test_dataset.groupby(by='repo').apply(calculate_language_percentages)
# The helper returns unnamed positional values; label them in the order it produces them.
test_composition_actual.columns = ['python', 'r', 'javascript']

Here are the programming language percentages for each repository in our test dataset:

In [36]:
test_composition_actual

Unnamed: 0_level_0,python,r,javascript
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
596892,1.0,0.0,0.0
1248263,0.652443,0.0,0.347557
1790564,0.0,0.0,1.0
4751958,0.36073,0.0,0.63927
12465340,0.0,0.995105,0.004895
13523710,0.0,1.0,0.0
14267375,0.941701,0.0,0.058299
14579179,0.0,0.189083,0.810917
16146440,0.0,0.253421,0.746579
17856544,0.0,1.0,0.0


In [37]:
combined_test_documents = test_dataset.groupby(by='repo').apply(concat_texts)

In [57]:
# Reload the per-repository model, count matrices and vectorizer for the evaluation below.
# NOTE(review): this repeats the reload cell from the "LDA Model" section.
with open('../data/concatDocs_lda_model.pickle', 'rb') as f:
    model = pickle.load(f)
    
with open('../data/concatDocs_tf_train.pickle', 'rb') as f:
    tf_train = pickle.load(f)
    
with open('../data/concatDocs_tf_vectorizer.pickle', 'rb') as f:
    tf_vectorizer = pickle.load(f)

with open('../data/concatDocs_tf_test.pickle', 'rb') as f:
    tf_test = pickle.load(f)

In [58]:
# Vectorise the per-repository test documents with the reloaded vectorizer.
# NOTE(review): tf_test, loaded in the previous cell, was built from the same call - confirm whether recomputing is needed.
combined_test_documents_tf = tf_vectorizer.transform(combined_test_documents)
# Per-document topic weights from the fitted LDA model: one row per test repository, one column per topic.
combined_test_model = model.transform(combined_test_documents_tf)

In [62]:
combined_test_model

array([[1.90661496e-01, 8.07648843e-01, 4.78215219e-06, 1.68487898e-03],
       [5.41157197e-01, 4.47795188e-01, 1.04497870e-02, 5.97828094e-04],
       [7.65079147e-01, 2.07808158e-01, 9.78836147e-03, 1.73243342e-02],
       [1.59282454e-05, 9.98934927e-01, 1.04864905e-03, 4.95710834e-07],
       [2.65019130e-01, 6.41208301e-01, 3.48327547e-03, 9.02892938e-02],
       [6.67248762e-01, 3.31216180e-01, 1.54776206e-05, 1.51958084e-03],
       [3.52752118e-04, 9.98974555e-01, 3.34400158e-04, 3.38292524e-04],
       [9.73834351e-01, 7.09203679e-07, 2.61642358e-02, 7.04124487e-07],
       [9.25075962e-01, 7.31192529e-02, 1.69474412e-06, 1.80309002e-03],
       [2.97893274e-01, 4.74194950e-01, 6.22969301e-04, 2.27288807e-01],
       [8.04771825e-01, 4.58890587e-02, 1.64098727e-05, 1.49322707e-01],
       [4.84929996e-01, 5.13540584e-01, 7.57352821e-04, 7.72067435e-04],
       [2.58767113e-01, 7.09967336e-01, 2.49810251e-03, 2.87674476e-02],
       [2.80705528e-01, 6.62976682e-01, 2.37653687e

In [None]:
# insert kish's mappings here


### KL Divergence

In [None]:
# NOTE(review): placeholder - this binds the "estimate" to the very same DataFrame object as
# the ground truth (no copy is made), so any divergence computed below compares the data with
# itself. Replace it with the model-derived composition once the topic-to-language mapping exists.
test_composition_estim=test_composition_actual

In [None]:
def kl_divergence(actual, estimated):
    """Return the Kullback-Leibler divergence D(actual || estimated) in nats.

    Both arguments are equally long sequences of proportions, paired up
    position by position.

    Terms with an actual proportion of 0 contribute nothing (the usual
    0 * log(0) = 0 convention). The result is infinite when the estimate
    assigns 0 to a component whose actual proportion is positive.
    """
    total = 0.0
    for p, q in zip(actual, estimated):
        if p == 0:
            continue
        if q == 0:
            return math.inf
        total += p * math.log(p / q)
    return total


# Rows are paired by position, so both frames must list the same repositories
# in the same order.
if not test_composition_estim.index.equals(test_composition_actual.index):
    raise ValueError("test_composition_estim and test_composition_actual must share the same repo index")

# One row per repository. The previous loop accumulated the negated sum,
# produced nan for rows containing zeros, and never filled this frame.
# NOTE(review): columns are paired by position as well - confirm that the
# estimate uses the same column order as the ground truth.
KDDataframe = pandas.DataFrame(
    [
        (repo, kl_divergence(actual_row, estimated_row))
        for repo, actual_row, estimated_row in zip(
            test_composition_actual.index,
            test_composition_actual.values,
            test_composition_estim.values,
        )
    ],
    columns=["Repo", "KL Divergence"],
)
KDDataframe

# Notes:
  - Our keyword lists have similar/common words, e.g. Python, JavaScript and R all share some keywords. This can be seen in the documents. Choosing another language, with a completely different set of keywords, might prove easier for the LDA model to differentiate. Roughly: LDA relies on a distance between topics, but if the true topics share keywords, that distance metric breaks down.