In [None]:
import io
import os.path
import re
import tarfile

import smart_open

def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):
    """Yield the decoded text of each paper found in the tarball at ``url``.

    The archive is opened in binary mode through ``smart_open`` and read with
    ``tarfile``. Only regular files whose member name matches
    ``nipstxt/nips<digits>/<digits>.txt`` are yielded. Each one is decoded as
    UTF-8, with undecodable bytes replaced instead of raising.
    """
    paper_path = re.compile(r'nipstxt/nips\d+/\d+\.txt')
    with smart_open.open(url, "rb") as stream, tarfile.open(fileobj=stream) as archive:
        for entry in archive.getmembers():
            # Skip directories, links and other non-regular members.
            if not entry.isfile():
                continue
            # Skip anything that is not one of the per-paper text files.
            if paper_path.search(entry.name) is None:
                continue
            raw = archive.extractfile(entry).read()
            yield raw.decode('utf-8', errors='replace')

# Materialize the generator so the documents can be indexed and iterated repeatedly.
docs = [*extract_documents()]

In [None]:
# Show the number of documents, then the first 500 characters of the first one.
print(len(docs), docs[0][:500], sep='\n')

1740
387 
Neural Net and Traditional Classifiers  
William Y. Huang and Richard P. Lippmann 
MIT Lincoln Laboratory 
Lexington, MA 02173, USA 
Abstract
Previous work on nets with continuous-valued inputs led to generative 
procedures to construct convex decision regions with two-layer percepttons (one hidden 
layer) and arbitrary decision regions with three-layer percepttons (two hidden layers). 
Here we demonstrate that two-layer perceptton classifiers trained with back propagation 
can form both c


In [None]:
# Tokenize the documents.
from nltk.tokenize import RegexpTokenizer

# Lowercase each document, then split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(text.lower()) for text in docs]

# In a single pass, drop tokens that are purely numeric (words that merely
# contain digits are kept) and tokens that are only one character long.
docs = [
    [token for token in doc if not token.isnumeric() and len(token) > 1]
    for doc in docs
]

In [None]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the WordNet data before lemmatizing.
nltk.download("wordnet")

# Lemmatize the documents.
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the tokenized documents.
dictionary = Dictionary(docs)

# Keep only words that occur in at least 20 documents and in no more than 50% of the documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [None]:
# Convert every tokenized document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size after filtering and the number of documents.
print(
    'Number of unique tokens: %d' % len(dictionary),
    'Number of documents: %d' % len(corpus),
    sep='\n',
)

Number of unique tokens: 6617
Number of documents: 1740


In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly as None: leaving it out keeps gensim's default of 10, which
# still evaluates perplexity during training.
eval_every = None
# Fixed seed so that re-running the notebook reproduces the same topics and
# coherence scores.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's top words with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(score for _terms, score in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.2020.
[([(0.013479179, 'gaussian'),
   (0.011418022, 'mixture'),
   (0.010912382, 'density'),
   (0.009938585, 'likelihood'),
   (0.008921776, 'prior'),
   (0.008400434, 'estimate'),
   (0.008085511, 'bayesian'),
   (0.0072674896, 'log'),
   (0.0070434567, 'posterior'),
   (0.006905677, 'sample'),
   (0.0065356083, 'em'),
   (0.0062981057, 'estimation'),
   (0.0058998214, 'approximation'),
   (0.005893358, 'noise'),
   (0.0058586993, 'variance'),
   (0.005125972, 'component'),
   (0.0047610467, 'matrix'),
   (0.004748705, 'conditional'),
   (0.0046389275, 'maximum'),
   (0.0045906566, 'covariance')],
  -0.8238302472253216),
 ([(0.028441312, 'neuron'),
   (0.013542826, 'cell'),
   (0.012267889, 'spike'),
   (0.009443831, 'synaptic'),
   (0.008971165, 'firing'),
   (0.008547852, 'activity'),
   (0.0059060045, 'potential'),
   (0.005214696, 'response'),
   (0.0049713, 'signal'),
   (0.00463704, 'fig'),
   (0.004569963, 'phase'),
   (0.0045319092, 'frequency'),


In [None]:
def check_topic_threshold(x, topic, threshold, lda_model=None, bow_corpus=None):
    """Return True if document ``x`` is assigned ``topic`` with probability above ``threshold``.

    Parameters
    ----------
    x : int
        Index of the document in the bag-of-words corpus.
    topic : int
        Topic id to look for among the document's topics.
    threshold : float
        Exclusive lower bound on the topic probability.
    lda_model : optional
        Object exposing ``get_document_topics(bow)``. When omitted, the
        notebook-level ``model`` is used (the original behaviour). Passing it
        explicitly avoids silently picking up whichever model was trained last.
    bow_corpus : optional
        Indexable bag-of-words corpus. When omitted, the notebook-level
        ``corpus`` is used (the original behaviour).

    Returns
    -------
    bool
        True when some ``(topic_id, probability)`` pair returned for the
        document has ``topic_id == topic`` and ``probability > threshold``;
        False otherwise, including when the document has no topics.
    """
    if lda_model is None:
        lda_model = model
    if bow_corpus is None:
        bow_corpus = corpus

    doc_topics = lda_model.get_document_topics(bow_corpus[x])
    # any() states the intent directly; the previous for/else without a break
    # behaved the same way but read as if the else belonged to the if.
    return any(
        topic_id == topic and probability > threshold
        for topic_id, probability in doc_topics
    )
# Collect the token lists of the documents whose topic-0 probability exceeds 0.9.
t = [
    docs[doc_idx]
    for doc_idx, _bow in enumerate(corpus)
    if check_topic_threshold(doc_idx, 0, .9)
]

## Homework:

For this line of code:

dictionary.filter_extremes(no_below=20, no_above=0.5)

Rerun with no_above=.75, .9 and removed. How do the topics change?

no_above=0.75

In [None]:
# Rebuild the vocabulary, now allowing words that appear in up to 75% of the documents.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_above=0.75, no_below=20)
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly as None: leaving it out keeps gensim's default of 10, which
# still evaluates perplexity during training.
eval_every = None
# Fixed seed so that the comparison between no_above settings is not
# confounded by run-to-run randomness.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's top words with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(score for _terms, score in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -0.9479.
[([(0.03184096, 'unit'),
   (0.025773048, 'weight'),
   (0.015961958, 'output'),
   (0.015487374, 'error'),
   (0.014199246, 'hidden'),
   (0.011860103, 'training'),
   (0.010217733, 'layer'),
   (0.00823273, 'net'),
   (0.008085175, 'algorithm'),
   (0.006899768, 'gradient'),
   (0.0051578237, 'pattern'),
   (0.004814464, 'propagation'),
   (0.004782344, 'back'),
   (0.0047676046, 'term'),
   (0.0046353056, 'method'),
   (0.004178382, 'activation'),
   (0.0039306814, 'rate'),
   (0.0039172806, 'performance'),
   (0.0039162645, 'parameter'),
   (0.0038843655, 'vector')],
  -0.5362680525331535),
 ([(0.010442262, 'training'),
   (0.008247604, 'were'),
   (0.007714964, 'unit'),
   (0.007470081, 'node'),
   (0.0069617727, 'output'),
   (0.006697184, 'performance'),
   (0.0064106896, 'layer'),
   (0.0063345972, 'pattern'),
   (0.0057911403, 'task'),
   (0.0057210363, 'word'),
   (0.005196388, 'representation'),
   (0.0047377874, 'rule'),
   (0.0043941163, '

no_above=0.9

In [None]:
# Rebuild the vocabulary, now allowing words that appear in up to 90% of the documents.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_above=0.9, no_below=20)
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly as None: leaving it out keeps gensim's default of 10, which
# still evaluates perplexity during training.
eval_every = None
# Fixed seed so that the comparison between no_above settings is not
# confounded by run-to-run randomness.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's top words with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(score for _terms, score in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -0.6987.
[([(0.008894015, 'data'),
   (0.008885119, 'error'),
   (0.00887421, 'model'),
   (0.00851973, 'learning'),
   (0.007423914, 'parameter'),
   (0.007327188, 'distribution'),
   (0.0062584844, 'algorithm'),
   (0.0059128506, 'method'),
   (0.0056150276, 'training'),
   (0.0053846627, 'noise'),
   (0.005219904, 'case'),
   (0.0049594333, 'given'),
   (0.0049147955, 'gaussian'),
   (0.0047643445, 'mean'),
   (0.0047113937, 'linear'),
   (0.0045512943, 'estimate'),
   (0.0043587335, 'value'),
   (0.004337409, 'approximation'),
   (0.0043372964, 'example'),
   (0.0043035666, 'network')],
  -0.3866014483718242),
 ([(0.028832642, 'training'),
   (0.026269155, 'classifier'),
   (0.01726333, 'classification'),
   (0.016036453, 'network'),
   (0.015113941, 'class'),
   (0.014380766, 'data'),
   (0.012157125, 'pattern'),
   (0.012009674, 'performance'),
   (0.009987192, 'test'),
   (0.009529039, 'error'),
   (0.008833212, 'were'),
   (0.00793349, 'feature'),
   (0

removing the no_above parameter

In [None]:
dictionary = Dictionary(docs)
# Omitting no_above does NOT remove the upper bound: filter_extremes falls back
# to its default of 0.5, which would simply repeat the very first experiment.
# Pass 1.0 explicitly so that no word is dropped for being too common.
dictionary.filter_extremes(no_below=20, no_above=1.0)
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [None]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 15
chunksize = 2000
passes = 20
iterations = 400
# Don't evaluate model perplexity, takes too much time. This must be passed
# explicitly as None: leaving it out keeps gensim's default of 10, which
# still evaluates perplexity during training.
eval_every = None
# Fixed seed so that the comparison between no_above settings is not
# confounded by run-to-run randomness.
random_state = 42

# Make an index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state,
)

In [None]:
from pprint import pprint

# Each entry of top_topics pairs a topic's top words with its coherence score.
top_topics = model.top_topics(corpus)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum(score for _terms, score in top_topics) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

Average topic coherence: -1.2238.
[([(0.03196925, 'neuron'),
   (0.010638103, 'cell'),
   (0.010500433, 'activity'),
   (0.010348936, 'synaptic'),
   (0.010306895, 'spike'),
   (0.008773265, 'firing'),
   (0.008514337, 'connection'),
   (0.0055349413, 'cortical'),
   (0.0055229315, 'stimulus'),
   (0.0052355346, 'cortex'),
   (0.005199938, 'response'),
   (0.004973815, 'layer'),
   (0.0048481515, 'synapsis'),
   (0.0047055394, 'excitatory'),
   (0.0046897163, 'simulation'),
   (0.004634566, 'potential'),
   (0.004305348, 'fig'),
   (0.0042589065, 'inhibitory'),
   (0.0039785933, 'dynamic'),
   (0.0037828335, 'correlation')],
  -0.8747092951331608),
 ([(0.011156219, 'gaussian'),
   (0.009794317, 'mixture'),
   (0.008224291, 'matrix'),
   (0.007415868, 'component'),
   (0.00715237, 'density'),
   (0.007030722, 'likelihood'),
   (0.0061688516, 'prior'),
   (0.0058235973, 'estimate'),
   (0.005270515, 'bayesian'),
   (0.0050791902, 'kernel'),
   (0.0050300006, 'em'),
   (0.005006265, 'post

no_above=0.75, average topic coherence=-0.9479

no_above=0.9, average topic coherence=-0.6987

remove no_above, average topic coherence=-1.2238

Removing the no_above parameter decreases the coherence, which suggests that more noisy terms are included.

Increasing the no_above parameter increases the coherence, which means we are filtering out terms that occur in a large proportion of the documents, so the results are more coherent because that noise is removed from the documents.

For this line of code:

Set training parameters.

num_topics = 15
Set to 10, 15, 20. Try to interpret the topics.

In [None]:
# Rebuild the vocabulary with the original 50% upper bound for the num_topics experiment.
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_above=0.5, no_below=20)
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Set training parameters for different numbers of topics
num_topics_values = [10, 15, 20]

# Rebuild the id -> word mapping from the dictionary that produced `corpus`.
# Reusing an `id2word` left over from an earlier cell would label the topics
# with the wrong words whenever the vocabulary was re-filtered in between.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

# Iterate over different numbers of topics
for num_topics in num_topics_values:
    print(f"\nTraining LDA model with {num_topics} topics:")

    # Train LDA model
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        # Skip perplexity evaluation during training; leaving this out keeps
        # gensim's default of 10, which still evaluates it.
        eval_every=None,
        # Same seed for every run so that differences between the runs come
        # from num_topics rather than from random initialisation.
        random_state=42,
    )

    # Print topics
    top_topics = model.top_topics(corpus)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    print("\nTopics:")
    pprint(top_topics)



Training LDA model with 10 topics:
Average topic coherence: -1.1250.

Topics:
[([(0.020584572, 'neuron'),
   (0.019997776, 'cell'),
   (0.009395983, 'response'),
   (0.008767695, 'activity'),
   (0.008689296, 'stimulus'),
   (0.0077806474, 'spike'),
   (0.0074262847, 'visual'),
   (0.0068669026, 'synaptic'),
   (0.006167346, 'firing'),
   (0.0059826514, 'cortex'),
   (0.005555033, 'connection'),
   (0.005545681, 'field'),
   (0.0046865377, 'orientation'),
   (0.0046355673, 'cortical'),
   (0.0045257877, 'direction'),
   (0.004338121, 'layer'),
   (0.004196991, 'eye'),
   (0.004105382, 'fig'),
   (0.0039060735, 'frequency'),
   (0.0037677197, 'potential')],
  -0.9583879901943559),
 ([(0.009124373, 'class'),
   (0.0066637867, 'sample'),
   (0.0055528497, 'estimate'),
   (0.0052348576, 'density'),
   (0.0051311147, 'approximation'),
   (0.004851386, 'kernel'),
   (0.0046446407, 'xi'),
   (0.0044579376, 'regression'),
   (0.004410163, 'classification'),
   (0.004219093, 'log'),
   (0.0040

num_topics=10, average topic coherence=-1.1250

num_topics=15, average topic coherence=-1.2440

num_topics=20, average topic coherence=-1.4014

When num_topics=20, the topics capture finer details and are more specific than with num_topics=10, where there is a lot more overlap.