# LDA Model

In [None]:
import logging

# Surface INFO-level messages (with timestamp and severity) from the libraries
# used below, so long-running steps report their progress in the cell output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(levelname)s : %(message)s')

In [None]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory; it is used by
# the WordNetLemmatizer applied during pre-processing further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
import io
import os.path
import re
import tarfile
from smart_open import open
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from pprint import pprint

### **Data**

In [None]:
def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the text of each paper stored in the gzipped tarball at ``url``.

    The archive is saved in the current working directory under the last path
    component of ``url`` and reused on later calls if that file already exists.

    Args:
        url: Location of the ``.tgz`` archive to download.

    Yields:
        str: The decoded contents of every archive member whose name matches
        ``nipstxt/nips<digits>/<digits>.txt``, in ascending order of member
        name. Bytes that are not valid UTF-8 are replaced with U+FFFD.
    """
    fname = url.split('/')[-1]

    if not os.path.isfile(fname):
        # Download under a temporary name and rename only once the transfer
        # has finished: an interrupted download must not leave a truncated
        # file that later runs would mistake for the complete archive.
        partial_fname = fname + '.part'
        with open(url, "rb") as fin:
            with open(partial_fname, 'wb') as fout:
                while True:
                    buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                    if not buf:
                        break
                    fout.write(buf)
        os.replace(partial_fname, fname)

    with tarfile.open(fname, mode='r:gz') as tar:
        # Ignore directory entries, as well as files like README, etc.
        files = [
            m for m in tar.getmembers()
            if m.isfile() and re.search(r'nipstxt/nips\d+/\d+\.txt', m.name)
        ]
        for member in sorted(files, key=lambda x: x.name):
            # Close each member's file object as soon as it has been read.
            with tar.extractfile(member) as member_file:
                member_bytes = member_file.read()
            yield member_bytes.decode('utf-8', errors='replace')

# Pull every paper out of the archive and keep them all in memory
docs = [document for document in extract_documents()]

# Quick sanity check: number of papers, then the start of the first one
print(len(docs))
print(docs[0][:500])

1740
1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a pr


### **Pre-process and vectorize the documents**

In [None]:
# Tokenizer that splits text into runs of word characters
tokenizer = RegexpTokenizer(r'\w+')
# Lemmatizer used to reduce each surviving token to its WordNet lemma
lemmatizer = WordNetLemmatizer()

# For every document: lowercase, tokenize, filter, then lemmatize
docs = [
    [
        lemmatizer.lemmatize(word)
        for word in tokenizer.tokenize(text.lower())
        # Drop purely numeric tokens (words that merely contain digits are
        # kept) and tokens that are only one character long
        if not word.isnumeric() and len(word) > 1
    ]
    for text in docs
]

In [None]:
# Learn bigrams that occur at least 20 times across the corpus
bigram = Phrases(docs, min_count=20)
bigram_phraser = Phraser(bigram)

# Extend each document with the bigram tokens (those containing '_') found in
# it; the original unigram tokens are kept as they are.
for doc in docs:
    doc.extend([phrase for phrase in bigram_phraser[doc] if '_' in phrase])

In [None]:
# Remove rare and common tokens

# Map every token in the pre-processed documents to an integer id
dictionary = Dictionary(docs)

# Drop tokens that occur in fewer than 20 documents or in more than 50% of the documents
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [None]:
# Convert each document into its bag-of-words representation
corpus = list(map(dictionary.doc2bow, docs))

# Report the vocabulary size and the corpus size
print("Number of unique tokens: {}".format(len(dictionary)))
print("Number of documents: {}".format(len(corpus)))

Number of unique tokens: 9014
Number of documents: 1740


### **Training**

In [None]:
# Set training parameters
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed so that re-running the notebook reproduces the same topics.

# Make an index to word dictionary
temp = dictionary[0] # This is only to "load" the dictionary; the looked-up value itself is not used
id2word = dictionary.id2token

# Train the topic model. alpha and eta are set to 'auto', and the seed is
# passed explicitly so that training is deterministic across runs.
model = LdaModel(corpus=corpus,
                 id2word=id2word,
                 chunksize=chunksize,
                 alpha='auto',
                 eta='auto',
                 iterations=iterations,
                 num_topics=num_topics,
                 passes=passes,
                 eval_every=eval_every,
                 random_state=random_state)

In [None]:
# Each entry pairs a topic's word list with that topic's coherence score
top_topics = model.top_topics(corpus)

# Mean coherence: add up the per-topic coherence scores and divide by the number of topics
avg_topic_coherence = sum(coherence for _words, coherence in top_topics) / num_topics
print("Average topic coherence: {}".format(avg_topic_coherence))

2020-06-17 05:12:49,846 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.2434110980962532


In [None]:
# Pretty-print each topic as a pair: its list of (weight, word) entries,
# followed by the topic's coherence score.
pprint(top_topics)

[([(0.013251751, 'cell'),
   (0.008315591, 'receptive_field'),
   (0.0077909525, 'visual'),
   (0.0070467973, 'response'),
   (0.007024779, 'neuron'),
   (0.0063420315, 'field'),
   (0.005984506, 'stimulus'),
   (0.0050518224, 'activity'),
   (0.0047271326, 'motion'),
   (0.004555714, 'direction'),
   (0.00423173, 'cortex'),
   (0.004117712, 'map'),
   (0.004083545, 'orientation'),
   (0.003965542, 'eye'),
   (0.0039230143, 'layer'),
   (0.0038713678, 'connection'),
   (0.0037870945, 'spatial'),
   (0.0032465241, 'receptive'),
   (0.0031250194, 'cortical'),
   (0.0029832036, 'movement')],
  -1.0504100188650463),
 ([(0.011502048, 'circuit'),
   (0.009149489, 'chip'),
   (0.008468477, 'neuron'),
   (0.008114835, 'analog'),
   (0.0056743193, 'analog_vlsi'),
   (0.0053256736, 'voltage'),
   (0.0046318215, 'winner_take'),
   (0.0045395843, 'control'),
   (0.0044109635, 'figure_show'),
   (0.0042716023, 'vlsi'),
   (0.0041246484, 'implementation'),
   (0.003788327, 'signal'),
   (0.003668705