## Functions to perform PCA on each cluster

In [None]:
from glob import glob
import pandas as pd
import numpy as np
from sklearn import preprocessing
from numpy.linalg import svd

In [None]:
# Collect every cluster file, skipping backups whose name ends in "old".
# The previous pattern '*[!old]' used a glob character class, which only
# rejects names whose LAST CHARACTER is 'o', 'l' or 'd' rather than names
# ending in the word "old"; the suffix is now tested explicitly.
# Sorting makes the processing order deterministic (glob order is arbitrary).
fnames = sorted(
    path for path in glob('../data/clusters/*') if not path.endswith('old')
)

In [None]:
# Clusters 15 and 3 are too big for the dense svd, so take them out of the
# list of files to process. list.remove raises ValueError if a path is absent.
for oversized_path in ('../data/clusters/df15.csv', '../data/clusters/df3.csv'):
    fnames.remove(oversized_path)

In [None]:
# Load each remaining cluster file into a DataFrame, keyed by the file's
# base name with the '.csv' extension removed (e.g. '.../df7.csv' -> 'df7').
df_dict = {}
for path in fnames:
    cluster_name = path.split('/')[-1].replace('.csv', '')
    df_dict[cluster_name] = pd.read_csv(path)

In [None]:
# In place, for every cluster frame: use 'Id' as the row index and discard
# the 'Response' column so only feature columns remain.
for frame in df_dict.values():
    frame.set_index('Id', inplace=True, drop=True)
    frame.drop(columns='Response', inplace=True)

In [None]:
# Only the 'Id' and 'Response' columns of this file are used afterwards
# (the frame is indexed by 'Id', 'Response' is extracted as the labels and
# the frame is then deleted), so skip parsing every other column.
df = pd.read_csv('../data/train_numeric.csv', usecols=['Id', 'Response'])

In [None]:
# Keep just the 'Response' column, indexed by 'Id', then release the
# source frame.
labels = df.set_index('Id', drop=True)['Response']
del df

In [None]:
# Remember each cluster's row index (as a NumPy array) so it can be
# re-attached after the decomposition.
index_dict = {}
for cluster_name, frame in df_dict.items():
    index_dict[cluster_name] = frame.index.values

In [None]:
# Raw NumPy value matrix of every cluster frame, keyed by cluster name.
array_dict = {}
for cluster_name, frame in df_dict.items():
    array_dict[cluster_name] = frame.values

In [None]:
# Standardise every cluster matrix with sklearn's preprocessing.scale.
scaled_d = {
    name: preprocessing.scale(raw) for name, raw in array_dict.items()
}

In [None]:
# Dense SVD of every standardised cluster matrix.
#   pc_dict[name]    -> left singular vectors (u)
#   eigen_dict[name] -> singular values (e), as returned by numpy's svd
pc_dict = {}
eigen_dict = {}
for name, X in scaled_d.items():
    print(name)
    # full_matrices=False asks for the economy-size factorisation: u has
    # min(rows, cols) columns instead of being a square rows x rows matrix.
    # Only the leading columns of u are sliced out later, so the result is
    # unchanged while the memory needed for tall matrices drops sharply.
    u, e, v = svd(X, full_matrices=False)
    pc_dict[name] = u
    eigen_dict[name] = e
    # Deliberate pause between clusters: blocks until ENTER is pressed.
    input('press ENTER')

In [None]:
# For every cluster, find the smallest number of leading components whose
# cumulative share of the total variance reaches 95%.
n_components = {}
for name, vect in eigen_dict.items():
    # eigen_dict holds singular values. The variance carried by a component
    # is proportional to the SQUARE of its singular value, so the shares
    # must be computed on the squares; summing raw singular values
    # over-weights the small components and keeps too many of them.
    variances = vect ** 2
    ttl = variances.sum()
    percentile = 0
    n = 0
    while percentile < .95:
        n += 1
        percentile = variances[:n].sum() / ttl
    n_components[name] = n

In [None]:
# Keep only the first n_components[name] columns of each cluster's
# left-singular-vector matrix.
arrs_to_model = {
    name: vectors[:, :n_components[name]]
    for name, vectors in pc_dict.items()
}

In [None]:
# Write one CSV per cluster: the retained components indexed by 'Id', plus
# the 'Response' label (assignment aligns `labels` on the 'Id' index).
for name, arr in arrs_to_model.items():
    df = pd.DataFrame(arr).assign(Id=index_dict[name]).set_index('Id', drop=True)
    df['Response'] = labels
    out_path = '../data/pca/{}.csv'.format(name)
    df.to_csv(out_path)

In [None]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

In [None]:
# Clusters 15 and 3 are too big for the dense svd; load them on their own.
df15, df3 = (
    pd.read_csv(path)
    for path in ('../data/clusters/df15.csv', '../data/clusters/df3.csv')
)

In [None]:
# In place, for both large clusters: index by 'Id' and discard 'Response'.
for big_frame in (df15, df3):
    big_frame.set_index('Id', inplace=True, drop=True)
    big_frame.drop('Response', inplace=True, axis=1)

In [None]:
# Standardise the value matrices of the two large clusters.
X15, X3 = (preprocessing.scale(big_frame.values) for big_frame in (df15, df3))

In [None]:
# Store the standardised cluster-15 matrix in CSC sparse format (element
# type kept as-is).
sprs15 = csc_matrix(X15, dtype=X15.dtype)

In [None]:
# Display the (rows, columns) shape of the sparse cluster-15 matrix.
sprs15.shape

In [None]:
# Store the standardised cluster-3 matrix in CSC sparse format (element
# type kept as-is).
sprs3 = csc_matrix(X3, dtype=X3.dtype)

In [None]:
# CSR-format copy of the cluster-15 matrix. `csr_matrix` is never imported
# in this notebook (only `csc_matrix` is), so calling it raised NameError;
# the matrix's own tocsr() conversion yields the same CSR matrix.
mat_sprs15 = sprs15.tocsr()

In [None]:
# Display the smallest entry of the CSR copy of the cluster-15 matrix.
mat_sprs15.min()

In [None]:
# Truncated SVD of cluster 15: the 13 largest-magnitude singular triplets
# ('LM' is svds' default selection, spelled out here).
u15, s15, vt15 = svds(sprs15, k=13, which='LM')

In [None]:
# Truncated SVD of cluster 3: the 52 largest-magnitude singular triplets
# ('LM' is svds' default selection, spelled out here).
u3, s3, vt3 = svds(sprs3, k=52, which='LM')

In [None]:
# Reverse the order of the cluster-15 singular values.
# NOTE(review): only s15 is reversed here; u15 / vt15 keep the order svds
# returned them in -- confirm they are reordered to match before the
# columns are paired with these values.
s15 = np.flip(s15)

In [None]:
# Reverse the order of the cluster-3 singular values.
# NOTE(review): only s3 is reversed here; u3 / vt3 keep the order svds
# returned them in -- confirm they are reordered to match before the
# columns are paired with these values.
s3 = np.flip(s3)