## PCA Reduction

In [None]:
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.decomposition import PCA
import os, codecs
import cPickle as pickle
from sklearn.externals import joblib

# Path of the serialized TFKLD model that is loaded below.
tfkld_location = "tfkld.pkl"
# Root directory of the plain-text corpus that is passed to read_files_for_pca.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
text_corpus_dir = "/home/cjacques/code/gensim/extracted_wiki"


# Load the TFKLD model; the PCA reduction is trained on the vectors it outputs.
tfkld_model = joblib.load(tfkld_location)

In [None]:
def iter_comments(loc):
    """Lazily yield the path of every file found beneath ``loc``.

    The directory tree is traversed with ``os.walk``; each yielded value is
    the containing directory joined with the file name.
    """
    walker = os.walk(loc)
    paths = (
        os.path.join(folder, name)
        for folder, _subdirs, names in walker
        for name in names
    )
    for path in paths:
        yield path

In [None]:
# Read the files under the given directory until the desired number of
# sentences has been seen (default 20000000).
# Personally I used https://github.com/attardi/wikiextractor to get the clean
# text for wikipedia articles.
def read_files_for_pca(location, num_sentences=20000000):
    """Build the PCA training matrix from a directory tree of UTF-8 text files.

    Each file is read in full and split into sentences with ``sent_tokenize``.
    The sentences are passed to the module-level ``tfkld_model`` and the first
    row of the result is kept as a dense vector.

    Args:
        location: Root directory; every file beneath it is visited through
            ``iter_comments``.
        num_sentences: Stop visiting files once at least this many sentences
            have been tokenized in total.

    Returns:
        ``np.array`` built from the collected vectors, one entry per file that
        produced at least one sentence.
    """
    vec_list = []
    counter = 0
    for file_name in iter_comments(location):
        # Check the sentence budget before opening the next file so no file
        # handle is created only to be discarded.
        if counter >= num_sentences:
            break
        with codecs.open(file_name, encoding='utf-8', mode="r") as read_file:
            text = read_file.read()
        sentences = sent_tokenize(text)
        if not sentences:
            # Nothing to vectorize; presumably indexing row 0 of an empty
            # transform result would fail, so skip empty files.
            continue
        # NOTE(review): only the first sentence's vector of each file is kept,
        # although the counter advances by every sentence in the file. Confirm
        # whether one vector per file is intended before changing this --
        # keeping every sentence would multiply the size of the dense matrix.
        vec = tfkld_model.transform(sentences)[0].toarray()[0].tolist()
        vec_list.append(vec)
        counter += len(sentences)
    return np.array(vec_list)

# Train PCA

In [None]:
# Fit a PCA on the corpus vectors, starting from 150 components and adding
# more until at least 95% of the variance is explained.
explained_variance = []

X = np.array(read_files_for_pca(text_corpus_dir))
if X.ndim != 2 or 0 in X.shape:
    raise ValueError("no training vectors were read from %r" % (text_corpus_dir,))

# PCA cannot extract more components than min(n_samples, n_features).
max_components = min(X.shape)
n_components = min(150, max_components)
component_step = 50

while True:
    pca = PCA(n_components=n_components)
    pca.fit(X)
    explained_variance = pca.explained_variance_ratio_
    print(np.sum(explained_variance))
    # Refitting with unchanged settings on unchanged data cannot raise the
    # explained variance, so either stop or grow the model; the cap on
    # n_components guarantees the loop terminates.
    if np.sum(explained_variance) >= 0.95 or n_components >= max_components:
        break
    n_components = min(n_components + component_step, max_components)

In [None]:
# Smoke test: vectorize one sentence with the TFKLD model and project it with
# the freshly fitted PCA.
test_vec = tfkld_model.transform(["hello world"])[0].toarray()[0].tolist()
new_vec = pca.transform([test_vec])

# Single-argument print(...) prints the same on Python 2 and Python 3.
# pca.transform returns one row per input vector, so len() is the sample count.
print(len(new_vec))
print(new_vec)

In [None]:
# Persist the fitted PCA model; the cell further down reloads it from this path.
joblib.dump(pca, "pca_tfkld.pkl")

## Test loading PCA Model

In [None]:
# Reload the PCA model from the file written by joblib.dump above.
pca_model = joblib.load("pca_tfkld.pkl")

### Test PCA Model

In [None]:
# Smoke test for the reloaded model: vectorize one sentence with the TFKLD
# model and project it with the PCA loaded from disk.
test_vec = tfkld_model.transform(["hello world"])[0].toarray()[0].tolist()
new_vec = pca_model.transform([test_vec])

# Single-argument print(...) prints the same on Python 2 and Python 3.
# pca_model.transform returns one row per input vector, so len() is the sample count.
print(len(new_vec))
print(new_vec)