# 12. 性能優化、性能分析與併發性

In [None]:
import random
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy
from lprof_hack import profile

@profile
def label_docs():
    """Return the movie reviews as a shuffled list of ``(words, category)`` pairs.

    Every file id of every category in ``movie_reviews`` contributes one
    pair whose first item is the list of words in that file and whose
    second item is the category name.
    """
    docs = []
    for cat in movie_reviews.categories():
        for fid in movie_reviews.fileids(cat):
            docs.append((list(movie_reviews.words(fid)), cat))

    # Seed right before shuffling so the resulting order is the same each run.
    random.seed(42)
    random.shuffle(docs)

    return docs

@profile
def isStopWord(word):
    """Return True if *word* is in the module-level ``sw`` collection or has length 1."""
    if word in sw:
        return True
    return len(word) == 1

@profile
def filter_corpus():
    """Return the lower-cased corpus words that ``isStopWord`` does not reject.

    Prints the word count before and after filtering.
    """
    review_words = movie_reviews.words()
    print("# Review Words", len(review_words))

    res = []
    for token in review_words:
        lowered = token.lower()
        if not isStopWord(lowered):
            res.append(lowered)

    print("# After filter", len(res))

    return res

@profile
def select_word_features(corpus):
    """Return the most frequent 2% of the distinct words in *corpus*.

    Args:
        corpus: iterable of word tokens.

    Returns:
        A list of words ordered from most to least frequent, whose length
        is 2% of the number of distinct words (rounded down).
    """
    words = FreqDist(corpus)
    N = int(.02 * len(words))
    # FreqDist.keys() is not ordered by frequency in NLTK 3 (it behaves like
    # a Counter), so ask for the top-N entries explicitly.
    return [word for word, _ in words.most_common(N)]

@profile
def doc_features(doc):
    """Map each word in the module-level ``word_features`` to its count in *doc*.

    Words rejected by ``isStopWord`` are dropped from *doc* before counting.
    Keys have the form ``'count (<word>)'``; words absent from *doc* get 0.
    """
    counts = FreqDist(token for token in doc if not isStopWord(token))
    return {'count (%s)' % word: counts.get(word, 0)
            for word in word_features}

@profile
def make_features(docs):
    """Turn ``(words, category)`` pairs into ``(feature_dict, category)`` pairs."""
    featuresets = []
    for words, category in docs:
        featuresets.append((doc_features(words), category))
    return featuresets

@profile
def split_data(sets):
    """Split *sets* into ``(train, test)``: the first 200 items are the test part."""
    holdout = 200
    train_part = sets[holdout:]
    test_part = sets[:holdout]
    return train_part, test_part

if __name__ == "__main__":
    # Pipeline: label documents, pick word features, train and evaluate
    # a naive Bayes classifier on the movie review corpus.
    labeled_docs = label_docs()

    # `sw` and `word_features` are read as globals by isStopWord() and
    # doc_features(), so they must be bound before those functions run.
    sw = set(stopwords.words('english'))
    filtered = filter_corpus()
    word_features = select_word_features(filtered)
    featuresets = make_features(labeled_docs)
    train_set, test_set = split_data(featuresets)
    classifier = NaiveBayesClassifier.train(train_set)
    print("Accuracy", accuracy(classifier, test_set))
    # show_most_informative_features() prints the table itself and returns
    # None; wrapping it in print() would emit a stray "None" line.
    classifier.show_most_informative_features()

## 12.4 multiprocessing

In [None]:
from numpy.random import random_integers
from numpy.random import randn, randint
import numpy as np
import timeit
import argparse
import multiprocessing as mp
import matplotlib.pyplot as plt


def simulate(size):
    """Return the running mean of 1000 random products.

    Draws 10000 standard-normal values once, then 1000 times samples
    *size* of them (with replacement) and multiplies ``1 + value`` over the
    sample. The mean of those products is updated incrementally.

    Args:
        size: number of values sampled per iteration.

    Returns:
        The mean of the 1000 products.
    """
    speed = randn(10000)
    mean = 0

    for n in range(1, 1001):
        # randint's upper bound is exclusive: use len(speed) so the last
        # element can be sampled too.
        indices = randint(0, len(speed), size=size)
        x = (1 + speed[indices]).prod()
        # Incremental mean update; n is the number of products seen so far.
        mean = mean + (x - mean) / n

    return mean

def serial():
    """Run ``simulate`` for sizes 10..49 one after another.

    Prints and returns the elapsed wall-clock time in seconds.
    """
    t0 = timeit.default_timer()

    for size in range(10, 50):
        simulate(size)

    elapsed = timeit.default_timer() - t0
    print("Serial time", elapsed)

    return elapsed

def parallel(nprocs):
    """Run ``simulate`` for sizes 10..49 on a pool of *nprocs* processes.

    Args:
        nprocs: number of worker processes in the pool.

    Returns:
        Elapsed wall-clock time in seconds, including pool creation and
        shutdown. The pool creation time and the total are also printed.
    """
    start = timeit.default_timer()
    # The context manager guarantees the workers are torn down even if
    # map() raises, instead of leaking the pool.
    with mp.Pool(nprocs) as p:
        print(nprocs, "Pool creation time", timeit.default_timer() - start)

        p.map(simulate, range(10, 50))
        # Still close and join explicitly so the measured time includes a
        # clean shutdown of the workers.
        p.close()
        p.join()

    end = timeit.default_timer() - start
    print(nprocs, "Parallel time", end)
    return end

if __name__ == "__main__":
    # Time the serial run once, then compare it against pools of
    # 1 .. cpu_count()-1 processes and plot the speed-up ratio.
    baseline = serial()
    ratios = [baseline / parallel(nprocs)
              for nprocs in range(1, mp.cpu_count())]

    plt.xlabel('# processes')
    plt.ylabel('Serial/Parallel')
    n = np.arange(1, mp.cpu_count())
    plt.plot(n, ratios)
    plt.grid(True)
    plt.show()

## 12.5 Joblib

In [1]:
from numpy.random import random_integers
from numpy.random import randn
import numpy as np
import timeit
import argparse
import matplotlib.pyplot as plt
from joblib import Parallel
from joblib import delayed
import multiprocessing as mp


def simulate(size):
    """Return the running mean of 1000 random products.

    Draws 10000 standard-normal values once, then 1000 times samples
    *size* of them (with replacement) and multiplies ``1 + value`` over the
    sample. The mean of those products is updated incrementally.

    Args:
        size: number of values sampled per iteration.

    Returns:
        The mean of the 1000 products.
    """
    speed = randn(10000)
    mean = 0

    for n in range(1, 1001):
        # random_integers() is deprecated; randint() with an exclusive upper
        # bound of len(speed) samples the same inclusive range 0..len-1.
        indices = np.random.randint(0, len(speed), size=size)
        x = (1 + speed[indices]).prod()
        # Incremental mean update; n is the number of products seen so far.
        mean = mean + (x - mean) / n

    return mean

def serial():
    """Time a sequential sweep of ``simulate`` over sizes 10 through 49.

    The elapsed time in seconds is printed and returned.
    """
    began = timeit.default_timer()

    for sample_size in range(10, 50):
        simulate(sample_size)

    duration = timeit.default_timer() - began
    print("Serial time", duration)

    return duration

def parallel(nprocs):
    """Time a joblib run of ``simulate`` over sizes 10 through 49.

    *nprocs* is passed as the first argument to ``Parallel``. The elapsed
    time in seconds is printed and returned.
    """
    began = timeit.default_timer()
    tasks = (delayed(simulate)(sample_size) for sample_size in range(10, 50))
    Parallel(nprocs)(tasks)

    duration = timeit.default_timer() - began
    print(nprocs, "Parallel time", duration)
    return duration

if __name__ == "__main__":
    # Serial baseline first, then one joblib run per process count in
    # 1 .. cpu_count()-1; plot serial/parallel time against the count.
    baseline = serial()
    ratios = [baseline / parallel(nprocs)
              for nprocs in range(1, mp.cpu_count())]

    plt.xlabel('# processes')
    plt.ylabel('Serial/Parallel')
    plt.plot(np.arange(1, mp.cpu_count()), ratios)
    plt.grid(True)
    plt.show()

ImportError: No module named 'joblib'

In [3]:
from mpi4py import MPI
from numpy.random import random_integers
from numpy.random import randn
import numpy as np
import statsmodels.api as sm
import bottleneck as bn
import logging


def jackknife(a, parallel=True):
    """Compute leave-one-out means of the sunspot activity series.

    For every index in *a*, the ``SUNACTIVITY`` values of the statsmodels
    sunspots dataset are copied, the value at that index is replaced by
    NaN, and the NaN-ignoring mean of the copy is recorded.

    Args:
        a: iterable of integer positions to leave out, one at a time.
        parallel: when true, the local results are additionally gathered
            onto rank 0 of ``MPI.COMM_WORLD``.

    Returns:
        A float array with one mean per entry of *a* (the local results;
        the gathered buffer is not returned).
    """
    data_loader = sm.datasets.sunspots.load_pandas()
    vals = data_loader.data['SUNACTIVITY'].values

    results = []

    for i in a:
        # Fresh copy each time so only one observation is masked per pass.
        tmp = np.array(vals.tolist())
        tmp[i] = np.nan
        # Call bn.nanmean directly rather than via bn.func.nanmean_selector
        # (NOTE(review): the selector appears to be absent from newer
        # Bottleneck releases - confirm against the installed version).
        results.append(bn.nanmean(tmp))

    results = np.array(results, dtype='d')

    if parallel:
        comm = MPI.COMM_WORLD
        # Buffer-based Gather (capital G) needs a receive buffer on the root
        # large enough for every rank's contribution; other ranks pass None.
        rcvBuf = None
        if comm.Get_rank() == 0:
            rcvBuf = np.empty(comm.Get_size() * len(results), dtype='d')
        comm.Gather(results, rcvBuf, root=0)

    return results

if __name__ == "__main__":
    # Leave out each of the first 39 observations in turn, without MPI.
    skiplist = np.arange(39, dtype='int')
    estimates = jackknife(skiplist, False)
    print(estimates)

ImportError: No module named 'mpi4py'