## Spark MLlib 

In [1]:
from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors

In [2]:
import time

def timer(f):
    """Decorator that prints how long each call to ``f`` takes.

    The wrapped function is called with the original positional and keyword
    arguments and its return value is passed through unchanged. After a
    successful call, one line with the function name and the elapsed time in
    seconds is printed. If ``f`` raises, the exception propagates and nothing
    is printed.

    Args:
        f: The callable to time.

    Returns:
        A wrapper with the same call signature as ``f`` that keeps ``f``'s
        ``__name__`` and ``__doc__``.
    """
    # Imported locally so the cell does not depend on an extra top-level import.
    from functools import wraps

    @wraps(f)  # keep f's name/docstring on the wrapper instead of "tmp"
    def tmp(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock changes.
        t = time.perf_counter()
        res = f(*args, **kwargs)
        print("Время выполнения функции {}: {}\n".format(f.__name__, time.perf_counter()-t))
        return res
    return tmp

In [3]:
# Number of training passes shared by both benchmarks: passed as `maxIterations`
# to fit_spark_mllab and as `num_collection_passes` to fit_artm.
NUM_COLLECTION_PASSES = 10
# Number of topics requested from both models.
NUM_TOPICS = 3

In [4]:
@timer
def prepare_corpus(file_path):
    """Start a Spark context and load a text corpus as an indexed RDD.

    Every line of ``file_path`` is stripped, split on single spaces and
    converted to a dense vector of floats. Each vector is then paired with its
    ``zipWithIndex`` index as ``[index, vector]`` and the resulting RDD is
    marked for caching.

    Args:
        file_path: Path of the text file handed to ``SparkContext.textFile``.

    Returns:
        A ``(sc, corpus)`` tuple: the newly created ``SparkContext`` and the
        cached RDD. The caller is responsible for stopping ``sc``.

    Raises:
        Whatever Spark raises while building the RDD; in that case the context
        created here is stopped before the exception propagates.
    """
    sc = SparkContext(appName="LDA")
    try:
        data = sc.textFile(file_path)
        parsedData = data.map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))
        # zipWithIndex may launch a Spark job, so read/parse errors can surface here.
        corpus = parsedData.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()
    except BaseException:
        # Do not leak the context: a leftover active SparkContext makes the
        # next SparkContext(...) call in the same kernel fail.
        sc.stop()
        raise
    return sc, corpus
    

@timer
def fit_spark_mllab(sc_corpus, maxIterations, num_topics):
    """Train an MLlib LDA model on a prepared corpus, then stop Spark.

    The trained model is discarded and the function returns ``None``.

    Args:
        sc_corpus: Sequence whose item 0 is the ``SparkContext`` and item 1 is
            the corpus RDD, i.e. the value returned by the Spark
            ``prepare_corpus``.
        maxIterations: Forwarded to ``LDA.train`` as ``maxIterations``.
        num_topics: Forwarded to ``LDA.train`` as ``k``.
    """
    sc = sc_corpus[0]
    corpus = sc_corpus[1]
    try:
        LDA.train(corpus, maxIterations=maxIterations, k=num_topics)
    finally:
        # Always release the context, even when training fails; otherwise the
        # still-active context blocks creating a new one on re-run.
        sc.stop()
    
# Spark MLlib benchmark: build the corpus, then train on it.
# Both steps print their own timing via @timer.
fit_spark_mllab(
    prepare_corpus("pubmed.txt"),
    maxIterations=NUM_COLLECTION_PASSES,
    num_topics=NUM_TOPICS,
)

Время выполнения функции prepare_corpus: 93.01701879501343

Время выполнения функции fit_spark_mllab: 221.61254501342773



## BigARTM

In [9]:
import artm

In [5]:
@timer
def prepare_corpus(collection_name):
    """Create an ``artm.BatchVectorizer`` for a ``bow_uci`` collection.

    The vectorizer is built with ``data_path='.'`` and a target folder named
    ``<collection_name>_batches``.

    NOTE: this reuses the name of the Spark ``prepare_corpus`` defined earlier
    in the notebook and replaces it once this cell has run.

    Args:
        collection_name: Name of the collection; also the prefix of the
            target folder.

    Returns:
        The constructed ``artm.BatchVectorizer``.
    """
    batches_dir = collection_name + '_batches'
    vectorizer = artm.BatchVectorizer(
        data_path='.',
        data_format='bow_uci',
        collection_name=collection_name,
        target_folder=batches_dir,
    )
    return vectorizer

In [6]:
@timer
def fit_artm(batch_vectorizer, num_topics, num_collection_passes):
    """Fit a BigARTM LDA model offline on the given batches.

    The model is created with fixed hyperparameters (``alpha=0.01``,
    ``beta=0.001``, ``num_document_passes=5``, ``cache_theta=True``) and the
    dictionary taken from ``batch_vectorizer``. The fitted model is not
    returned; the function returns ``None``.

    Args:
        batch_vectorizer: Batch source; must expose a ``dictionary`` attribute.
        num_topics: Number of topics for the model.
        num_collection_passes: Forwarded to ``fit_offline``.
    """
    model_options = dict(
        num_topics=num_topics,
        alpha=0.01,
        beta=0.001,
        cache_theta=True,
        num_document_passes=5,
        dictionary=batch_vectorizer.dictionary,
    )
    model = artm.LDA(**model_options)
    model.fit_offline(
        batch_vectorizer=batch_vectorizer,
        num_collection_passes=num_collection_passes,
    )

In [11]:
# BigARTM benchmark: vectorize the collection, then fit the model.
# Both steps print their own timing via @timer.
fit_artm(
    prepare_corpus('pubmed'),
    num_topics=NUM_TOPICS,
    num_collection_passes=NUM_COLLECTION_PASSES,
)

Время выполнения функции prepare_corpus: 123.62481212615967

Время выполнения функции fit_artm: 167.00855827331543

