In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nimfa

## Data

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [3]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [4]:
print("\n".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.c

In [5]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data).todense() # (documents, vocab)
vectors.shape #, vectors.nnz / vectors.shape[0], row_means.shape

(2034, 26576)

In [6]:
vocab = np.array(vectorizer.get_feature_names())

In [7]:
vocab.shape

(26576,)

## NMF

In [8]:
m,n=vectors.shape
d=5  # num topics

In [9]:
clf = decomposition.NMF(n_components=d, random_state=1)

W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [10]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [11]:
H1

array([[1.21260827e-01, 0.00000000e+00, 0.00000000e+00, ...,
        3.20158583e-05, 6.40317166e-05, 2.89955054e-04],
       [1.19285748e-01, 1.20852675e-01, 1.65916014e-04, ...,
        1.16499969e-04, 2.32999937e-04, 5.18641224e-02],
       [5.76639107e-02, 4.81606444e-01, 7.99303371e-04, ...,
        2.87622789e-04, 5.75245577e-04, 0.00000000e+00],
       [0.00000000e+00, 1.47943345e-01, 0.00000000e+00, ...,
        6.58515991e-05, 1.31703198e-04, 0.00000000e+00],
       [1.27421920e-01, 1.93564170e-01, 5.04719721e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [12]:
show_topics(H1)

['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']

## NIMFA

In [13]:
lsnmf = nimfa.Lsnmf(vectors, seed='random_vcol', rank=d, max_iter=100)
lsnmf_fit = lsnmf()

In [14]:
# fitted=lsnmf_fit.fit()

In [15]:
H = lsnmf_fit.fit.coef().T

In [16]:
show_topics(np.array(H))

['0000 000000 000 00 00000',
 '000 00 0000 00000 000000',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '0000 000000 00000 000 00',
 '000 00000 00 0000 000000',
 '00000 00 000 0000 000000',
 '000000 00000 00 0000 000',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 0000 000000 00000 00',
 '000 00000 00 0000 000000',
 '0000 000000 00000 000 00',
 '000 0000 000000 00000 00',
 '0000 000000 00000 000 00',
 '000 00000 000000 0000 00',
 '000000 00000 00 0000 000',
 '000 00 00000 000000 0000',
 '0000 000 00000 000000 00',
 '000 000000 00000 0000 00',
 '000 0000 000000 00000 00',
 '0000 000 000000 00000 00',
 '000 00 00000 0000 000000',
 '000 00000 00 000000 0000',
 '000 000000 00000 0000 00',
 '0000 000 00000 000000 00',
 '0000 000000 00000 000 00',
 '000 00 000000 00000 0000',
 '000 00000 00 000000 0000',
 '0000 000 00000 000000 00',
 '000 0000 000

In [17]:
np.array(H)

array([[7.87631899e-04, 1.66692736e-02, 3.62779893e-02, 0.00000000e+00,
        2.15360760e-02],
       [6.29852075e-02, 1.28198195e-01, 4.37414889e-02, 8.62160969e-04,
        0.00000000e+00],
       [0.00000000e+00, 2.22382097e-04, 8.53746742e-05, 0.00000000e+00,
        6.84550662e-06],
       ...,
       [4.30123217e-05, 7.04561180e-05, 1.53479778e-05, 0.00000000e+00,
        2.44046820e-06],
       [8.60246434e-05, 1.40912236e-04, 3.06959555e-05, 0.00000000e+00,
        4.88093641e-06],
       [0.00000000e+00, 0.00000000e+00, 9.08880374e-03, 0.00000000e+00,
        0.00000000e+00]])

## From scratch

In [18]:
lam=1e3
lr=1e-3
mu = 1e-6
def grads(M, W, H):
    R = W@H-M
    return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam # dW, dH

In [19]:
def penalty(M, mu):
    return np.where(M>=mu,0, np.min(M - mu, 0))

In [20]:
def upd(M, W, H, lr):
    dW,dH = grads(M,W,H)
    W -= lr*dW 
    H -= lr*dH

In [21]:
def report(M,W,H): 
    print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())

In [22]:
W = np.abs(np.random.normal(scale=0.01, size=(m,d)))
H = np.abs(np.random.normal(scale=0.01, size=(d,n)))

In [23]:
report(vectors, W, H)

937.3947814056215 5.486283037033018e-07 4.4101492479273006e-07 0 0


In [24]:
upd(vectors,W,H,lr)

In [25]:
report(vectors, W, H)

937.3036719332895 -4.825866834858099e-05 -3.6619723950852096e-06 4 6


In [26]:
for i in range(50): 
    upd(vectors,W,H,lr)
    if i % 10 == 0: report(vectors,W,H)

937.1610403497286 -6.585675963456483e-05 -5.22654845926433e-06 4 8
846.3740413913592 -0.001978364637420332 -0.002889839944171911 195 13162
827.9663827073815 -0.006950598064520037 -0.0852336508876248 608 52780
788.6866435334224 -0.4649121713089179 -0.7292400532794109 1298 45640
733.6593394617055 -0.36604207320235793 -0.740072711721256 991 44845


In [27]:
show_topics(H)

['image jpeg gif color images file software version',
 'jpeg file gif format color quality images files',
 'jesus god matthew people said does atheists prophecy',
 'edu graphics pub mail ftp data 128 3d',
 'space launch data satellite nasa image commercial available']

## Example

In [28]:
#http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

In [30]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = np.array(R)

N = len(R)
M = len(R[0])
K = 2

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

In [31]:
nR

array([[4.99234177, 2.94362954, 3.98471475, 0.99871633],
       [3.96611805, 2.34772117, 3.36081072, 0.99710059],
       [1.06541655, 0.8423692 , 5.40276131, 4.96347318],
       [0.96776026, 0.74102334, 4.39454676, 3.9732169 ],
       [1.74108693, 1.19267059, 4.91978054, 4.0319109 ]])

## Example from here https://github.com/canerturkmen/nmflib/blob/master/nmf.py

In [91]:
def frobenius(A, B):
    """
    Function for calculating the Euclidean distance between two matrices
    """
    return np.linalg.norm(A-B, 'fro')

def normalize_factor_matrices(W, H):
    """
    Bring W to unit columns while keeping W*H constant
    :param W: left factorizing matrix
    :param H: right factorizing matrix
    :type W: numpy.ndarray
    :type H: numpy.ndarray
    :returns: tuple, two-tuple (W, H)
    """
    # normalize W,H before returning
    norms = np.linalg.norm(W, 2, axis=0)
    norm_gt_0 = norms > 0
    W[:, norm_gt_0] /= norms[norm_gt_0]
    H[norm_gt_0, :] = ((H[norm_gt_0, :].T) * norms[norm_gt_0]).T

    return (W,H)


def kldivergence(A,B):
    """
    Function for determining the divergence of A from B
    as presented in Lee and Seung, otherwise known as generalized Kullback-Leibler
    divergence or I-divergence.
    :param A: first matrix
    :type A: numpy.ndarray
    :param B: second matrix
    :type B: numpy.ndarray
    :returns: the divergence
    :rtype: float
    """
    return np.sum((A*np.log(A/B) - A + B))

class BaseNMF:
    """
    Class that serves as the base-class for different implementations of NMF.
    """

    maxiter = 10000
    stopconv = 1e-4


    def __init__(self, X, k, **kwargs):
        """
        Initialize the NMF problem with a matrix.
        The initializer also responds to the following keyword arguments:
        - metric: (str) the objective metric to be reduced. Not all NMF versions have implementations of different metrics. May
            take "eu" (default) for euclidean distance or "kl" for generalized Kullback-Leibler divergence (I-divergence)
        - maxiter: (int) the maximum number of iterations to be performed. Default is 10000
        - stopconv: (int) the convergence criterion for the objective function. Default is 40
        :param X: matrix
        :param k: number of dimensions for NMF
        """

        #TODO: ClusterNMF can have negative entries!
        if X.min() < 0:
            raise Exception("The matrix cannot have negative entries")

        if kwargs.get("maxiter"):
            self.maxiter = kwargs.get("maxiter")

        if kwargs.get("stopconv"):
            self.stopconv = kwargs.get("stopconv")

        self.X = X
        self.k = k

class NMFResult:
    """
    Simple object for storing the results of an NMF training run
    """

    convgraph = None # an array of objective function values to plot convergence
    matrices = None # a python **list** of factorizing matrices
    objvalue = None # the final value of the objective function
    converged = None

    def __init__(self, matrices, convgraph=None, objvalue=None, converged=None):
        self.matrices = matrices
        self.convgraph = convgraph
        self.objvalue = objvalue
        self.converged = converged
        
        
class NMF(BaseNMF):
    """
    Implementation of basic NMF (Lee and Seung) algorithm with Euclidean and KL-divergence objective functions
    """

    def predict(self):
        """
        Euclidean distance reducing update rules for NMF, presented in Lee and Seung (2001)
        """

        m, n = self.X.shape
        V = self.X
        pdist = 1e9 #very large number

        W = np.random.rand(m, self.k)
        H = np.random.rand(self.k, n)
#         print(self.maxiter)
        convgraph = np.zeros(int(self.maxiter / 10))
        converged = False

        eps = 1e-7 # small number for stability

        for i in range(self.maxiter):
            # multiplicative update steps, Euclidean error reducing
            H = H * (( W.T.dot(V) + eps) / (W.T.dot(W).dot(H) + eps))
            H = (H.T / np.linalg.norm(H, 2, 1)).T

            W = W * ( (V.dot(H.T) + eps) / (W.dot(H.dot(H.T)) + eps) )
            W /= np.linalg.norm(W, 2, 0)
            # normalize columns of H and W

            # every 10 iterations, check convergence
            if i % 100 == 0:
                dist = frobenius(V, W.dot(H))
                convgraph[int(i/10)] = dist

                # print dist
#                 print(dist)
#                 print(pdist)

                if pdist - dist < self.stopconv and pdist - dist > self.stopconv:
                    converged = True
                    break

                pdist = dist

#         W, H = normalize_factor_matrices(W,H)
#         return W, H
        return NMFResult((W, H), convgraph, dist)

In [92]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = np.array(R)

R
nmf=NMF(R, 4)
results=nmf.predict()

In [93]:
W, H=results.matrices

In [94]:
np.round(W@H,2)

array([[1.04, 0.58, 0.05, 0.39],
       [0.83, 0.06, 0.  , 0.18],
       [0.24, 0.32, 0.03, 0.92],
       [0.21, 0.1 , 0.01, 0.74],
       [0.16, 0.21, 0.92, 0.74]])

In [80]:
W.shape, R.shape

((5, 2), (5, 4))