In [None]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the decoded text of each paper found in the tar archive at ``url``.

    The archive is opened in binary mode through ``smart_open``. Only regular
    files whose path matches ``nipstxt/nips<digits>/<digits>.txt`` are
    yielded. Bytes that are not valid UTF-8 are substituted instead of
    raising an error.
    """
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as stream, tarfile.open(fileobj=stream) as archive:
        for entry in archive.getmembers():
            # Skip directories, links and other non-regular entries.
            if not entry.isfile():
                continue
            # Skip files that are not paper texts.
            if paper_path.search(entry.name) is None:
                continue
            raw = archive.extractfile(entry).read()
            yield raw.decode('utf-8', errors='replace')

# Drain the generator so every document is held in memory as one list.
docs = [*extract_documents()]

In [None]:
# Sanity check: the number of documents, then the first 500 characters of the first one.
print(len(docs), docs[0][:500], sep='\n')

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document and split it into runs of word characters, then
# drop purely numeric tokens (words that merely contain digits are kept)
# and tokens that are only one character long.
tokenizer = RegexpTokenizer(r'\w+')
docs = [
    [
        token
        for token in tokenizer.tokenize(doc.lower())
        if len(token) > 1 and not token.isnumeric()
    ]
    for doc in docs
]

In [None]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer
import nltk

# Fetch the 'wordnet' NLTK package before lemmatizing.
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# Replace every token of every document with its lemma.
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur in fewer than 20 documents, or in more than 75% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.75)

In [None]:
# Bag-of-words representation of the documents: one doc2bow result per document.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the size of the filtered vocabulary and the number of documents.
print(
    'Number of unique tokens: %d' % len(dictionary),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 6754
Number of documents: 1740


In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 1000
passes = 10
iterations = 50
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly: gensim's default (eval_every=10) still estimates perplexity.
eval_every = None
# Fixed seed so that re-running this cell is reproducible.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's word list with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The count comes from the result itself, so it cannot drift from a
# `num_topics` value left over from a different cell.
avg_topic_coherence = sum(coherence for _words, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -0.9062.
[([(0.010398114, 'algorithm'),
   (0.007678453, 'bound'),
   (0.007675012, 'vector'),
   (0.006607112, 'point'),
   (0.0065486594, 'let'),
   (0.0060697766, 'class'),
   (0.0053764535, 'any'),
   (0.0053564017, 'theorem'),
   (0.0053136796, 'linear'),
   (0.0050979364, 'weight'),
   (0.0046688165, 'xi'),
   (0.0046005356, 'error'),
   (0.0043408815, 'probability'),
   (0.00415273, 'equation'),
   (0.0040320335, 'size'),
   (0.004026827, 'following'),
   (0.0039799516, 'parameter'),
   (0.003856438, 'distribution'),
   (0.0038247309, 'approximation'),
   (0.0038112951, 'matrix')],
  -0.5884014732246655),
 ([(0.01606943, 'training'),
   (0.012552043, 'recognition'),
   (0.011280242, 'word'),
   (0.010469176, 'speech'),
   (0.00990295, 'output'),
   (0.0077464166, 'were'),
   (0.007657672, 'feature'),
   (0.007533429, 'hidden'),
   (0.007151039, 'layer'),
   (0.006634554, 'classifier'),
   (0.0064629903, 'unit'),
   (0.006409938, 'performance'),
   (0.006

In [None]:
# Rebuild the dictionary from the documents before applying the looser
# threshold. The existing dictionary was already cut at no_above=0.75, so
# filtering it again at 0.9 cannot bring any token back and changes nothing.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=20, no_above=0.9)

In [None]:
# Re-encode the documents as bags of words against the current dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 15
chunksize = 1000
passes = 10
iterations = 50
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly: gensim's default (eval_every=10) still estimates perplexity.
eval_every = None
# Fixed seed so that re-running this cell is reproducible.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's word list with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The count comes from the result itself, so it cannot drift from a
# `num_topics` value left over from a different cell.
avg_topic_coherence = sum(coherence for _words, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -0.9218.
[([(0.017597258, 'error'),
   (0.014259963, 'training'),
   (0.010176478, 'method'),
   (0.00931773, 'weight'),
   (0.008318964, 'parameter'),
   (0.0075895134, 'prediction'),
   (0.0063006394, 'estimate'),
   (0.0061006765, 'gaussian'),
   (0.006091857, 'test'),
   (0.005965685, 'noise'),
   (0.005710339, 'regression'),
   (0.0052454197, 'performance'),
   (0.005159346, 'variance'),
   (0.004944791, 'mean'),
   (0.0047239033, 'algorithm'),
   (0.0047000386, 'distribution'),
   (0.004479941, 'output'),
   (0.004474498, 'sample'),
   (0.0044667358, 'term'),
   (0.0043123015, 'approach')],
  -0.5466584554569953),
 ([(0.010376171, 'bound'),
   (0.0099022575, 'weight'),
   (0.007791584, 'error'),
   (0.007213967, 'theorem'),
   (0.0070689702, 'let'),
   (0.006289797, 'any'),
   (0.006089082, 'size'),
   (0.0056120944, 'generalization'),
   (0.0054806746, 'threshold'),
   (0.0050513525, 'output'),
   (0.0049433606, 'theory'),
   (0.004880567, 'linear'),
   

My Interpretation:

1) Training the LDA model takes a long time.

2) Hence I try to avoid evaluating perplexity in the first place and, in the worst case, to reduce the number of iterations; note that the default number of iterations is 50.

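3) The number of topics is set to 10, 15 and 20, with `no_above` set to 0.75, 0.9 and not set at all (written as 0 here; gensim then falls back to its default of 0.5), respectively. The outputs and their interpretation are: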

First case: with 10 topics, the average topic coherence is about -0.84.

Second case: with 15 topics, the average topic coherence is about -0.913.

Third case: with 20 topics, the average topic coherence is about -1.4.

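Overall, I see that the LDA model helps in picking out groups of related words (topics) from a corpus. For the last case, where the dictionary was rebuilt and only `no_below` was specified, the average coherence is about -1.4. This is a score, not a percentage: `top_topics` uses the u_mass coherence measure by default, for which values closer to 0 indicate more coherent topics. A value of -1.4 therefore means that the top words of these topics are, on average, less related to each other than in the first two cases, not more.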

Interpreting the topics:
Example 1: the topic led by "algorithm" contains related words such as bound, vector, theorem, class, probability, parameter and distribution. In this context almost all of the resulting words are related to "algorithm", except for words like any, xi and let, which might have decreased the average coherence.

Example 2: words like weight, error, pattern, memory, equation, training, rule, unit, parameter and capacity are also coherent with each other; when the LDA model was run, this topic resulted in a coherence of -0.6.


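Example 3: in the third case, which was meant to remove no frequent words, there is less coherence, with an average of about -1.4. Since u_mass scores closer to 0 are better, this indicates that the words within each topic are on average less related to each other, rather than extremely related.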

In [None]:
# Start again from a fresh dictionary built on all documents.
dictionary = Dictionary(docs)

# Only the rare-token threshold is chosen here. Leaving `no_above` out does
# NOT switch off the frequent-token filter: gensim falls back to its default
# of 0.5, which is spelled out below. Pass no_above=1.0 to keep every
# frequent token instead.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [None]:
# Re-encode the documents as bags of words against the current dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:

# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 20
chunksize = 1000
passes = 10
iterations = 50
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly: gensim's default (eval_every=10) still estimates perplexity.
eval_every = None
# Fixed seed so that re-running this cell is reproducible.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:

from pprint import pprint

# Each entry of top_topics pairs a topic's word list with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
# The count comes from the result itself, so it cannot drift from a
# `num_topics` value left over from a different cell.
avg_topic_coherence = sum(coherence for _words, coherence in top_topics) / len(top_topics)
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.4050.
[([(0.03056175, 'neuron'),
   (0.026194867, 'cell'),
   (0.015978487, 'spike'),
   (0.01249003, 'synaptic'),
   (0.010713542, 'firing'),
   (0.009804561, 'stimulus'),
   (0.009459405, 'response'),
   (0.0080794925, 'activity'),
   (0.00654565, 'potential'),
   (0.0058221878, 'synapsis'),
   (0.0055375383, 'cortical'),
   (0.005433991, 'cortex'),
   (0.005346227, 'excitatory'),
   (0.005226254, 'frequency'),
   (0.004959464, 'connection'),
   (0.0047365944, 'membrane'),
   (0.004672655, 'inhibitory'),
   (0.0045390846, 'fig'),
   (0.0041231094, 'neuronal'),
   (0.0040308377, 'temporal')],
  -0.9188208382305046),
 ([(0.013537649, 'mixture'),
   (0.011154318, 'likelihood'),
   (0.01071118, 'gaussian'),
   (0.010203601, 'estimate'),
   (0.00973361, 'density'),
   (0.008463204, 'prior'),
   (0.00821127, 'em'),
   (0.008192208, 'prediction'),
   (0.007912517, 'bayesian'),
   (0.0074417773, 'posterior'),
   (0.006815681, 'estimation'),
   (0.006543996, 'log')

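Overall, many of the topics share similar patterns. As the number of topics increases, the average coherence score becomes more negative (from about -0.9 with 10 topics to about -1.4 with 20 topics), i.e. the average topic coherence decreases.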