See `./project/dataset.py` for my code that generates the main dataset.

# Latent Dirichlet Allocation

In [6]:
import pandas
import numpy
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import Mapping, Set, Iterable, Iterator, defaultdict
  from collections import defaultdict, deque, Sequence
  from collections import Hashable


In [9]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [10]:
numpy.random.seed(0xD00D5) 

In [11]:
minimal_dataset = pandas.read_csv("../data/dataset.csv.gz", header=None, names=['repo', 'language', 'documents'])

In [12]:
minimal_dataset.head()

Unnamed: 0,repo,language,documents
0,28457823,javascript,"b""module.exports = {\n plugins: [\n requir..."
1,28457823,javascript,"b""// The path where to mount the REST API app\..."
2,28457823,javascript,"b""import { Observable } from 'rx';\nimport deb..."
3,28457823,javascript,"b""import { Observable } from 'rx';\n// import ..."
4,28457823,javascript,"b""import { Observable } from 'rx';\n\nmodule.e..."


In [13]:
documents = minimal_dataset['documents']
documents.head()

0    b"module.exports = {\n  plugins: [\n    requir...
1    b"// The path where to mount the REST API app\...
2    b"import { Observable } from 'rx';\nimport deb...
3    b"import { Observable } from 'rx';\n// import ...
4    b"import { Observable } from 'rx';\n\nmodule.e...
Name: documents, dtype: object

For our purposes, common words are important and rare words aren't. So we shouldn't use tf-idf as a metric, bag-of-words makes more sense. (TODO: Maybe: "Similarly, filter out words that don't occur very often").

In [14]:
tf_vectorizer = CountVectorizer(stop_words=None)
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [15]:
len(tf_feature_names)

284872

We have four programming languages, try to use LDA to determine these four programming languages.

In [29]:
number_of_languages = 4

lda = LatentDirichletAllocation(n_topics=number_of_languages,  n_jobs=1)

In [30]:
model = lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=4,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [31]:
model = _

In [32]:
model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=4,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [33]:
model.components_

array([[2.38928456e+02, 1.48743859e+02, 6.63647964e+01, ...,
        2.50756970e-01, 2.50756970e-01, 2.50756970e-01],
       [2.99244369e+02, 2.57844111e-01, 3.16097064e-01, ...,
        2.50141731e-01, 2.50141731e-01, 2.50141731e-01],
       [3.75245861e+01, 2.66373116e-01, 2.91116932e-01, ...,
        2.50002502e-01, 2.50002502e-01, 2.50002502e-01],
       [2.66187753e-01, 6.41071354e+00, 2.59056078e-01, ...,
        1.38201933e+00, 1.38201933e+00, 1.38201933e+00]])

Reference: https://nbviewer.jupyter.org/github/bmabey/pyLDAvis/blob/master/notebooks/sklearn.ipynb

In [34]:
with open('../data/minimal_lda_model.pickle', 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(model, f)

**TODO** Next, I reckon stick this in pyldavis and we can get a nice visualisation like in the DST workshop. Maybe after that we can try copy to compare the topics against programming languages and see if there was any correlation?

In [3]:
with open('../data/minimal_lda_model.pickle', 'rb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    model = pickle.load(f)



In [7]:
model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1, n_topics=4,
             perp_tol=0.1, random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [15]:
pyLDAvis.sklearn.prepare(model, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Building documents as mixtures of programming languages

In [16]:
documents[0]


'b"module.exports = {\\n  plugins: [\\n    require.resolve(\'babel-plugin-transform-function-bind\'),\\n    require.resolve(\'@babel/plugin-proposal-class-properties\'),\\n    require.resolve(\'@babel/plugin-proposal-object-rest-spread\'),\\n  ],\\n  presets: [\\n    [\\n      require.resolve(\'@babel/preset-env\'),\\n      {\\n        targets: {\\n          node: \'10\',\\n        },\\n      },\\n    ],\\n  ],\\n};\\n"'