In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')
tokenizer = RegexpTokenizer(r'\w+')
vectorizer = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range = (1,1),
                        tokenizer = tokenizer.tokenize)
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [3]:
num_components=5

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components)

# Fit and Transform SVD model on data
lda_matrix = model.fit_transform(vectors)

# Get Components 
lda_components=model.components_

In [4]:
terms = vectorizer.get_feature_names()

for index, component in enumerate(lda_components):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['geb', 'msg', 'banks', 'gordon', 'halat', 'dyer', 'cadre']
Topic 1:  ['bontchev', 'hamburg', 'fbihh', 'beauchaine', 'bobbe', 'vesselin', 'lafibm']
Topic 2:  ['israeli', 'israel', 'escrow', 'clipper', 'encryption', 'crypto', 'arab']
Topic 3:  ['edu', 's', 'com', 't', '1', 'subject', 'lines']
Topic 4:  ['turkish', 'armenian', 'armenians', 'armenia', 'gld', 'argic', 'serdar']


In [None]:
# Define Search Param
search_params = {'n_components': [10,15], 'learning_decay': [.5, 0.7]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(vectors)

In [15]:
GridSearchCV(cv=None, error_score='raise',
       estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_components': [10, 15], 'learning_decay': [0.5, 0.7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

TypeError: __init__() got an unexpected keyword argument 'n_topics'

In [17]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(vectors))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -279523.1993188141
Model Perplexity:  164796.7235914736


In [22]:
# Get Log Likelyhoods from Grid Search Output
n_topics = [10, 15]
log_likelyhoods_5 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.params['learning_decay']==0.5]
log_likelyhoods_7 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.params['learning_decay']==0.7]
#log_likelyhoods_9 = [round(gscore.mean_validation_score) for gscore in model.cv_results_ if gscore.parameters['learning_decay']==0.9]

# Show graph
plt.figure(figsize=(12, 8))
plt.plot(n_topics, log_likelyhoods_5, label='0.5')
plt.plot(n_topics, log_likelyhoods_7, label='0.7')
plt.plot(n_topics, log_likelyhoods_9, label='0.9')
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

AttributeError: 'str' object has no attribute 'params'

In [32]:
print(model.cv_results_.items)
for key,value in model.cv_results_.items():
    print(key)
    print(value)

<built-in method items of dict object at 0x000001313C999C80>
mean_fit_time
[25.21519012 35.84408865]
std_fit_time
[1.94332899 6.52504797]
mean_score_time
[0.64138322 0.72031679]
std_score_time
[0.0497628  0.05762157]
param_learning_decay
[0.5 0.7]
param_n_components
[10 10]
params
[{'learning_decay': 0.5, 'n_components': 10}, {'learning_decay': 0.7, 'n_components': 10}]
split0_test_score
[-277191.36350667 -281188.46045791]
split1_test_score
[-282605.21560971 -282148.74133555]
split2_test_score
[-285259.32954121 -284621.59664876]
split3_test_score
[-276988.569942   -279659.23248665]
split4_test_score
[-275571.51799449 -286416.95405876]
mean_test_score
[-279523.19931881 -282806.99699753]
std_test_score
[3738.43828268 2420.1629838 ]
rank_test_score
[1 2]


In [12]:
from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [9]:
newsgroups_train.data[3]

'From: jgreen@amber (Joe Green)\nSubject: Re: Weitek P9000 ?\nOrganization: Harris Computer Systems Division\nLines: 14\nDistribution: world\nNNTP-Posting-Host: amber.ssd.csd.harris.com\nX-Newsreader: TIN [version 1.1 PL9]\n\nRobert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> abraxis@iastate.edu writes in article <abraxis.734340159@class1.iastate.edu>:\n> > Anyone know about the Weitek P9000 graphics chip?\n> As far as the low-level stuff goes, it looks pretty nice.  It\'s got this\n> quadrilateral fill command that requires just the four points.\n\nDo you have Weitek\'s address/phone number?  I\'d like to get some information\nabout this chip.\n\n--\nJoe Green\t\t\t\tHarris Corporation\njgreen@csd.harris.com\t\t\tComputer Systems Division\n"The only thing that really scares me is a person with no sense of humor."\n\t\t\t\t\t\t-- Jonathan Winters\n'

In [10]:
import numpy as np
import math
# loading the train dataset from csv skiping the first row(label)
data = np.loadtxt('MNIST_train.csv',comments='#',delimiter=',',skiprows=1)
x = int(len(data)*.8) # spliting the data into 80% and 20% for traing and vallidation
# validation data will be used for finding the optimal value k in KNN
np.random.shuffle(data)
train_data=data[:x,:] # training_data
validation_data=data[x:,:]
# loading the test dataset from csv skiping the first row(label)
test_data = np.loadtxt('MNIST_test.csv',comments='#',delimiter=',',skiprows=1)
len(train_data)

759

In [11]:
math.sqrt(759)

27.54995462791182