In [1]:
import os
import pickle
from glob import glob
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.decomposition import PCA

In [2]:
def load_clusters():
    """Read every cluster CSV under ``../data/clusters`` into memory.

    Each file is parsed with its ``Id`` column as the index and keyed by the
    file name without directory or extension.

    Returns
    -------
    data : dict[str, pandas.DataFrame]
        Full frame for each cluster.
    indices : dict[str, pandas.Index]
        The ``Id`` index of each cluster.
    labels : dict[str, pandas.Series]
        The ``Response`` column of each cluster.
    """
    # Sort so clusters are loaded in the same order on every run; glob itself
    # gives no ordering guarantee.
    fnames = sorted(glob(os.path.join('..', 'data', 'clusters', '*.csv')))
    data = {}
    indices = {}
    labels = {}
    for name in fnames:
        # basename/splitext respect the platform's path separator, unlike
        # splitting on a literal '/'.
        dfx = os.path.splitext(os.path.basename(name))[0]
        df = pd.read_csv(name, index_col='Id')
        data[dfx] = df
        indices[dfx] = df.index
        labels[dfx] = df.Response
    return data, indices, labels

In [3]:
def drop_non_fails(data):
    """List the clusters worth keeping.

    A cluster is kept when the sum of its ``Response`` column is strictly
    greater than 2. Returns the matching keys of ``data`` in iteration order;
    the frames themselves are not modified.
    """
    return [
        cluster
        for cluster, frame in data.items()
        if frame.Response.sum() > 2
    ]

In [4]:
def center_data(dfs, data):
    """Build a standardised feature matrix for each selected cluster.

    For every name in ``dfs`` the ``Response`` column is removed from
    ``data[name]`` and the remaining values are passed through
    ``sklearn.preprocessing.scale`` with its default settings.

    Returns a dict mapping cluster name to the scaled NumPy array.
    """
    X = {}
    for cluster in dfs:
        features = data[cluster].drop('Response', axis=1).values
        X[cluster] = preprocessing.scale(features)
    return X

In [5]:
def choose_k(singular_values, expalained_variance=.95):
    """Return the smallest k whose leading values reach the target share.

    The share is ``sum(singular_values[:k]) / sum(singular_values)``; NaN
    entries count as zero in both numerator and denominator.

    NOTE(review): this is a share of the summed singular values, not of the
    variance (variance is proportional to the squared singular values).
    Confirm that is the intended criterion.

    Parameters
    ----------
    singular_values : array-like of float
        Values ordered from most to least important.
    expalained_variance : float, default 0.95
        Target share. The misspelt name is kept for keyword-argument callers.

    Returns
    -------
    int
        A value in ``[1, len(singular_values)]``; 1 when the input is empty
        or sums to zero. If the target is never reached (for example a
        target above 1) every component is kept.
    """
    values = np.asarray(singular_values, dtype=float).ravel()
    total = np.nansum(values)
    # Empty, all-zero or all-NaN input: no meaningful share can be computed,
    # so fall back to a single component rather than dividing by zero.
    if values.size == 0 or not total > 0:
        return 1
    # One cumulative pass replaces re-summing a growing slice per iteration.
    shares = np.nancumsum(values) / total
    reached = np.flatnonzero(shares >= expalained_variance)
    if reached.size == 0:
        # Target unreachable: keep everything instead of looping forever.
        return int(values.size)
    return int(reached[0]) + 1

In [6]:
def transform_clusters(dfs, X):
    """Fit a reduced PCA per cluster and project the cluster onto it.

    For each name in ``dfs`` a PCA with ``svd_solver='full'`` is fitted on
    ``X[name]`` only to obtain its singular values; ``choose_k`` turns those
    into a component count, and a second PCA with that many components is
    fitted and used to transform the same matrix.

    Returns
    -------
    Z : dict
        Cluster name -> projected array.
    pca_map : dict
        Cluster name -> the fitted reduced PCA model.
    """
    Z = {}
    pca_map = {}
    for cluster in dfs:
        matrix = X[cluster]
        # Probe fit: used solely to read off the singular values.
        probe = PCA(svd_solver='full', random_state=11)
        probe.fit(matrix)
        n_keep = choose_k(probe.singular_values_)
        # NOTE(review): the reduced model uses the default solver, unlike the
        # probe fit above - confirm this difference is intended.
        reducer = PCA(n_components=n_keep, random_state=11)
        pca_map[cluster] = reducer
        Z[cluster] = reducer.fit_transform(matrix)
    return Z, pca_map

In [13]:
def save_Z_pca_maps(Z, pca_map, dfs, indices, labels):
    """Write each cluster's projection and its fitted model to disk.

    For every name in ``dfs``:

    * ``Z[name]`` is wrapped in a DataFrame indexed by ``indices[name]``, a
      ``Response`` column is added from ``labels[name]``, and the frame is
      saved to ``../data/pca/<name>.csv``.
    * ``pca_map[name]`` is pickled to ``../data/pca_map/<name>.pickle``.

    Both output directories are created if they do not exist.
    """
    pca_dir = os.path.join('..', 'data', 'pca')
    map_dir = os.path.join('..', 'data', 'pca_map')
    # Writing into a missing directory fails, so make sure both exist first.
    os.makedirs(pca_dir, exist_ok=True)
    os.makedirs(map_dir, exist_ok=True)
    for df in dfs:
        dfx = pd.DataFrame(Z[df], index=indices[df])
        # Assignment aligns on the index, so labels follow their Id.
        dfx['Response'] = labels[df]
        dfx.to_csv(os.path.join(pca_dir, df + '.csv'))
        with open(os.path.join(map_dir, df + '.pickle'), 'wb') as f:
            pickle.dump(pca_map[df], f)

In [14]:
def run():
    """Execute the whole pipeline: load, filter, scale, reduce, save.

    Takes no arguments and returns nothing; each stage is delegated to the
    corresponding helper, and the final stage writes the results to disk.
    """
    clusters, ids_by_cluster, response_by_cluster = load_clusters()
    kept = drop_non_fails(clusters)
    scaled = center_data(kept, clusters)
    projected, fitted_models = transform_clusters(kept, scaled)
    save_Z_pca_maps(
        projected,
        fitted_models,
        kept,
        ids_by_cluster,
        response_by_cluster,
    )

In [15]:
# Kick off the full pipeline defined by run(); outputs are written to disk.
run()

  mask |= (ar1 == a)
