In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nimfa

## Data

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [3]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [4]:
print("\n".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.c

In [5]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data).todense() # (documents, vocab)
vectors.shape #, vectors.nnz / vectors.shape[0], row_means.shape

(2034, 26576)

In [6]:
vocab = np.array(vectorizer.get_feature_names())

In [7]:
vocab.shape

(26576,)

## NMF

In [8]:
m,n=vectors.shape
d=5  # num topics

In [9]:
clf = decomposition.NMF(n_components=d, random_state=1)

W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [10]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [33]:
H1

array([[1.21260827e-01, 0.00000000e+00, 0.00000000e+00, ...,
        3.20158583e-05, 6.40317166e-05, 2.89955054e-04],
       [1.19285748e-01, 1.20852675e-01, 1.65916014e-04, ...,
        1.16499969e-04, 2.32999937e-04, 5.18641224e-02],
       [5.76639107e-02, 4.81606444e-01, 7.99303371e-04, ...,
        2.87622789e-04, 5.75245577e-04, 0.00000000e+00],
       [0.00000000e+00, 1.47943345e-01, 0.00000000e+00, ...,
        6.58515991e-05, 1.31703198e-04, 0.00000000e+00],
       [1.27421920e-01, 1.93564170e-01, 5.04719721e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [11]:
show_topics(H1)

['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']

## NIMFA

In [28]:
lsnmf = nimfa.Lsnmf(vectors, seed='random_vcol', rank=d, max_iter=100)
lsnmf_fit = lsnmf()

In [29]:
# fitted=lsnmf_fit.fit()

In [30]:
H = lsnmf_fit.fit.coef().T

In [35]:
show_topics(np.array(H))

['00 0000 000 000000 00000',
 '000000 00000 0000 00 000',
 '000000 0000 00 00000 000',
 '000000 0000 00 00000 000',
 '000000 00 00000 0000 000',
 '00 000000 00000 0000 000',
 '000000 00000 0000 000 00',
 '00000 000000 0000 00 000',
 '000 00000 0000 00 000000',
 '000000 0000 00 00000 000',
 '000000 0000 00 00000 000',
 '000000 0000 00 00000 000',
 '000000 0000 00 00000 000',
 '000000 0000 00 00000 000',
 '000000 00 00000 0000 000',
 '000000 00000 00 0000 000',
 '0000 000 000000 00000 00',
 '000000 0000 00 00000 000',
 '00 000000 00000 0000 000',
 '000000 00 00000 0000 000',
 '000 00000 0000 00 000000',
 '000000 00000 0000 00 000',
 '0000 00 000000 00000 000',
 '000000 00000 0000 000 00',
 '000000 0000 00 00000 000',
 '00 000000 00000 0000 000',
 '000000 00000 0000 000 00',
 '000000 0000 00000 000 00',
 '000000 00 00000 0000 000',
 '0000 00 000000 00000 000',
 '00 000000 00000 0000 000',
 '000000 00000 000 0000 00',
 '000000 00000 000 0000 00',
 '0000 00 000000 00000 000',
 '000000 00 00

In [34]:
np.array(H)

array([[4.33784397e-02, 3.28197364e-02, 3.76840359e-02, 0.00000000e+00,
        2.14664650e-02],
       [4.39200403e-02, 0.00000000e+00, 5.72801764e-02, 6.55313851e-02,
        1.79296663e-01],
       [6.02625533e-05, 0.00000000e+00, 1.49340563e-04, 0.00000000e+00,
        2.97546809e-04],
       ...,
       [4.23810736e-05, 8.66428103e-06, 0.00000000e+00, 2.91710863e-05,
        1.07098353e-04],
       [8.47621472e-05, 1.73285621e-05, 0.00000000e+00, 5.83421726e-05,
        2.14196706e-04],
       [1.88638844e-02, 7.84148753e-05, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]])

## From scratch

In [59]:
lam=1e3
lr=1e-3
mu = 1e-6
def grads(M, W, H):
    R = W@H-M
    return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam # dW, dH

In [60]:
def penalty(M, mu):
    return np.where(M>=mu,0, np.min(M - mu, 0))

In [61]:
def upd(M, W, H, lr):
    dW,dH = grads(M,W,H)
    W -= lr*dW 
    H -= lr*dH

In [62]:
def report(M,W,H): 
    print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())

In [63]:
W = np.abs(np.random.normal(scale=0.01, size=(m,d)))
H = np.abs(np.random.normal(scale=0.01, size=(d,n)))

In [64]:
report(vectors, W, H)

937.3927790582676 6.940921023182598e-07 1.197458773763938e-07 0 0


In [65]:
upd(vectors,W,H,lr)

In [66]:
report(vectors, W, H)

937.3022424640812 8.568132472810028e-06 -1.1423762527027232e-06 0 3


In [67]:
for i in range(50): 
    upd(vectors,W,H,lr)
    if i % 10 == 0: report(vectors,W,H)

937.159111357315 -2.361601261347711e-05 -2.338942350933999e-06 4 2
846.3741298662884 -0.002266363577134057 -0.0028848871193172297 182 13132
829.5380601874638 -0.004825528296999216 -0.011645725226333031 516 53693
831.4205296626534 -0.7859568520474961 -0.38586257774170263 1435 48263
823.7211110563494 -0.432612688998524 -0.4379667501139365 142 22347


In [68]:
show_topics(H)

['space data nasa available information ftp launch program',
 'jpeg gif color quality don file jfif better',
 'jesus space god launch people matthew atheists does',
 'jpeg image file gif images format color files',
 'edu graphics pub mail 128 ftp send 3d']