In [1]:
from scipy import sparse as sp

import numpy as np
import io
from os import path
from glob import glob
from tqdm import tqdm_notebook

from multiprocessing import Pool,cpu_count

from sklearn.metrics.pairwise import paired_distances, paired_cosine_distances
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from collections import Counter

from random import shuffle

In [2]:
def read_texts(filename):
    """Read a text file and return its lines as a list of strings.

    The file is opened with the newline argument set to a line feed, so only
    line feeds end a line and line endings are returned untranslated; every
    returned string keeps its terminator.
    """
    with io.open(filename, newline='\n') as handle:
        lines = list(handle)
    return lines
def read_dataset(pathname):
    """Load the texts and integer scores stored in a dataset directory.

    Reads 'texts.txt' and 'score.txt' from `pathname`; every line of the
    score file is converted with int().

    Returns a (texts, scores) pair of lists.
    """
    texts_file = path.join(pathname, 'texts.txt')
    scores_file = path.join(pathname, 'score.txt')
    texts = read_texts(texts_file)
    scores = [int(raw_score) for raw_score in read_texts(scores_file)]
    return texts, scores

In [13]:
# Load texts and scores from the 'topics/reut' dataset directory.
# NOTE(review): the following cells assign `texts`/`scores` again from other
# directories, so on a top-to-bottom run this load is overwritten.
texts,scores = read_dataset('../../datasets/newsdatasets/topics/reut/')

In [14]:
# Load texts and scores from the 'topics/20ng' dataset directory.
# NOTE(review): the following cells assign `texts`/`scores` again from other
# directories, so on a top-to-bottom run this load is overwritten.
texts,scores = read_dataset('../../datasets/newsdatasets/topics/20ng/')

In [3]:
# Load texts and scores from the 'topics/webkb' dataset directory.
# NOTE(review): the next cell assigns `texts`/`scores` again from another
# directory, so on a top-to-bottom run this load is overwritten.
texts,scores = read_dataset('../../datasets/newsdatasets/topics/webkb/')

In [16]:
# Load texts and scores from the 'sentiment/3Label/tweet_semevaltest' directory.
# NOTE(review): this is the last of four cells assigning `texts`/`scores`, so
# on a top-to-bottom run it decides which dataset the cells below process.
# The saved execution counts suggest the recorded outputs came from a
# different load; confirm which dataset is intended.
texts,scores = read_dataset('../../datasets/newsdatasets/sentiment/3Label/tweet_semevaltest/')

In [4]:
# Shuffle documents and labels in unison so that every text keeps its score,
# then split the shuffled pairs back into two lists.
a = list(zip(scores, texts))
shuffle(a)
scores, texts = map(list, zip(*a))

In [5]:
# Bag-of-words matrix: one row per document, one column per term that occurs
# in at least two documents (min_df=2).
# binary=True stores 0/1 presence flags instead of raw term counts. The cells
# below turn sums over X into document counts and probabilities (per-document
# outer products, column sums, division by the number of documents), which is
# only meaningful when each document contributes at most 1 per term.
cv = CountVectorizer(min_df=2, binary=True)
X = cv.fit_transform(texts).tocsr()

In [6]:
# Map the raw scores to integer class ids with a fitted LabelEncoder.
le = LabelEncoder().fit(scores)
y = le.transform(scores)

In [7]:
# N: number of documents; V: vocabulary size.
N, V = X.shape

# Co-occurrence counts per class: one empty V x V sparse matrix for every
# class id from 0 up to the largest value in y.
Ntc = [sp.lil_matrix((V, V)) for _ in range(1 + max(y))]

In [8]:
# Accumulate, per class, the sum of the documents' outer products x_i^T x_i.
# That sum equals Xc^T Xc for the sub-matrix Xc holding the rows of the class,
# so one sparse product per class replaces one V x V addition per document.
for c in tqdm_notebook(range(len(Ntc)), total=len(Ntc)):
    # Rows of X whose encoded label is c.
    Xc = X[np.asarray(y) == c]
    # Cast to float64 to keep the dtype produced by the float accumulators.
    Ntc[c] = (Xc.T @ Xc).tocsr().astype(np.float64)

HBox(children=(IntProgress(value=0, max=8199), HTML(value='')))




In [20]:
# Display the number of documents and the vocabulary size.
N,V

(8199, 26343)

In [10]:
# Index of the class that the statistics below are computed for.
i = 0

# Column sums of X, one total per vocabulary term. This is the terms'
# document frequency only when X holds 0/1 presence flags.
df = X.sum(axis=0)

# Number of documents per class, ordered by encoded class label.
Nc = [ v for (k,v) in sorted(Counter(y).items(), key=lambda x: x[0]) ]

# Co-occurrence totals over all classes. The built-in sum adds the matrices
# one by one with `+` instead of routing a list of sparse matrices through NumPy.
Nt = sum(Ntc[1:], Ntc[0])

# Class prior P(c).
Pc = Nc[i]/N

# Prior of the complement of the class, P(!c).
Pnc = (N-Nc[i]) / N

# Co-occurrence totals divided by the number of documents, P(t).
Pt = Nt/N

# Positions of the non-zero entries of Nt, computed once and reused below
# because the lookup is expensive on a large matrix.
Nt_nz = Nt.nonzero()

# Ratio Ntc[i] / Nt on the non-zero entries of Nt (labelled P(t,c) in the
# original notes).
# NOTE(review): dividing by Nt gives the share of each co-occurrence that
# falls in class i; a joint probability would divide by N. Confirm intent.
data = np.array(Ntc[i][ Nt_nz ] / Nt[ Nt_nz ])[0]
Ptc = sp.csr_matrix( (data, Nt_nz), shape=Nt.shape )

In [11]:
def entropy(M):
    """Return the elementwise product ``p * log2(p)`` of a sparse matrix.

    Parameters
    ----------
    M : scipy sparse matrix
        Matrix of values. Only its non-zero entries are used, so the
        logarithm of zero is never evaluated.

    Returns
    -------
    scipy sparse matrix
        Same shape as ``M``, holding ``M[r, c] * log2(M[r, c])`` at the
        non-zero positions of ``M``. The terms are neither negated nor
        summed: this is the per-entry term of an entropy, not the entropy
        itself.
    """
    # Locate the non-zero entries once; the lookup is expensive on large matrices.
    rows, cols = M.nonzero()
    # ravel() flattens the 1 x nnz result of the fancy indexing (and leaves an
    # already one-dimensional result untouched).
    log_values = np.log2(np.asarray(M[rows, cols]).ravel())
    log_prob = sp.csr_matrix((log_values, (rows, cols)), shape=M.shape)
    return M.multiply(log_prob)

In [12]:
# Elementwise p * log2(p) terms of the Pt matrix.
entropy(Pt)

<26343x26343 sparse matrix of type '<class 'numpy.float64'>'
	with 102061517 stored elements in Compressed Sparse Row format>

In [13]:
# Elementwise p * log2(p) terms of the Ptc matrix.
entropy(Ptc)

<26343x26343 sparse matrix of type '<class 'numpy.float64'>'
	with 6437175 stored elements in Compressed Sparse Row format>

In [14]:
# Number of classes (Nc holds one document count per class).
len(Nc)

7

In [18]:
def generate_lines(X):
    """Yield, for every row of X, the product of the row with its transpose.

    Each yielded value is ``row.multiply(row.T)`` for one row of X, taken in
    row order.
    """
    for row in (X[k] for k in range(X.shape[0])):
        yield row.multiply(row.T)
def paired_di(X):
    """Call paired_cosine_distances(Ptc, X) once per class; always return 0.

    The distances are computed and discarded.
    NOTE(review): the loop index is not used, so every iteration compares X
    against the same global Ptc; confirm whether per-class matrices were meant.
    """
    n_classes = len(Nc)
    for _ in range(n_classes):
        paired_cosine_distances(Ptc, X)
    return 0

In [19]:
# Send the per-document matrices to a pool with one worker per CPU and drain
# the results; the returned values are ignored, only the progress bar is shown.
with Pool(processes=cpu_count()) as pool:
    pending = pool.imap_unordered(paired_di, generate_lines(X))
    for m in tqdm_notebook(pending, total=X.shape[0], smoothing=0.):
        pass

HBox(children=(IntProgress(value=0, max=8199), HTML(value='')))

In [None]:
# Single-process variant: build each document's matrix and compare it with
# Ptc; the distances are discarded.
for i in tqdm_notebook(range(N), total=N, smoothing=.8):
    doc_row = X[i]
    co_sparse = doc_row.multiply(doc_row.T)

    paired_cosine_distances(Ptc, co_sparse)

In [97]:
# Leftover introspection: list the attributes of the Pool object created above.
dir(pool)

['Process',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cache',
 '_ctx',
 '_get_tasks',
 '_guarded_task_generation',
 '_handle_results',
 '_handle_tasks',
 '_handle_workers',
 '_help_stuff_finish',
 '_initargs',
 '_initializer',
 '_inqueue',
 '_join_exited_workers',
 '_maintain_pool',
 '_map_async',
 '_maxtasksperchild',
 '_outqueue',
 '_pool',
 '_processes',
 '_quick_get',
 '_quick_put',
 '_repopulate_pool',
 '_result_handler',
 '_setup_queues',
 '_state',
 '_task_handler',
 '_taskqueue',
 '_terminate',
 '_terminate_pool',
 '_worker_handler',
 '_wrap_exception',
 'apply',
 'apply_async',
 'close',
 'imap',
 'imap_unordered',
 'join',
 