In [99]:
import os
import numpy as np
import matplotlib.pyplot as plt

import dask
import dask.array as da
import h5py
import pandas as pd

In [38]:
# Open the stored matrix read-only.
# NOTE(review): absolute, user-specific path -- this cell only runs on the
# machine it was written on; consider deriving it from a configurable base dir.
f = h5py.File('/Users/tnonet/Documents/FlashPCATests/run_directory/matricies/5000_20000.h5py', 'r')

# Wrap the '/array' dataset as a dask array. `f` is never closed here --
# presumably because dask reads from the dataset lazily at .compute() time
# (TODO confirm), so the handle has to outlive this cell.
array = dask.array.from_array(f['/array'])

In [39]:
# Materialise only the top-left 10x10 corner as a quick sanity check of the load.
array[0:10,0:10].compute()

array([[0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [1, 1, 1, 1, 1, 1, 0, 0, 1, 2],
       [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 1, 2, 1, 0],
       [2, 1, 0, 1, 0, 1, 1, 1, 0, 2],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 0]], dtype=int8)

In [105]:
# Directory that holds info.txt plus one numbered sub-directory per run.
# The path is relative, so it resolves against the notebook's working directory.
run_dir = '../../../Documents/FlashPCATests/results/runs_Feb19/'

In [113]:
def get_data(run_dir):
    """Read ``info.txt`` from a run directory and return its lines.

    Parameters
    ----------
    run_dir : str
        Directory that contains the ``info.txt`` file.

    Returns
    -------
    list of str
        One entry per line of the file with trailing whitespace (including
        the newline) stripped. Blank lines are kept as empty strings, which
        the parsers in this notebook use as section separators.

    Raises
    ------
    FileNotFoundError
        If ``info.txt`` does not exist in ``run_dir``.
    """
    text_file = os.path.join(run_dir, 'info.txt')
    # The context manager closes the handle even when reading raises.
    with open(text_file, "r") as info_file:
        return [line.rstrip() for line in info_file]

def parse_grid(lines):
    """Extract the parameter grid from the lines of ``info.txt``.

    The sections ``matrix_list``, ``k_list``, ``maxiter_list`` and
    ``tol_list`` are expected in that order. A section starts at a line
    beginning with its key, its entries are the following non-empty lines,
    and it ends at the first empty line.

    Parameters
    ----------
    lines : list of str
        Lines of the file with trailing whitespace already stripped.

    Returns
    -------
    dict
        Maps each key to its list of entries. ``matrix_list`` entries stay
        strings, ``k_list`` and ``maxiter_list`` are converted to ``int`` and
        ``tol_list`` to ``float``.

    Raises
    ------
    ValueError
        If a numeric section contains an entry that cannot be converted.
    """
    keys = ['matrix_list', 'k_list', 'maxiter_list', 'tol_list']
    grid = {key: [] for key in keys}

    key_counter = 0
    # Must start as False: any line ahead of the first section header would
    # otherwise read this flag before it was ever assigned.
    k_line = False
    for line in lines:
        # Every section has been consumed; the rest of the file is not grid data.
        if key_counter >= len(keys):
            break
        k = keys[key_counter]
        if line.startswith(k):
            k_line = True
        elif k_line and not line:
            # A blank line closes the current section; move on to the next key.
            k_line = False
            key_counter += 1
        elif k_line:
            grid[k].append(line)

    grid['k_list'] = [int(i) for i in grid['k_list']]
    grid['maxiter_list'] = [int(i) for i in grid['maxiter_list']]
    grid['tol_list'] = [float(i) for i in grid['tol_list']]

    return grid

def get_run_data(lines, grid):
    """Collect the per-run records from the lines of ``info.txt``.

    A run record is recognised by an integer line (the run number) that
    directly follows an empty line. Relative to that line, the matrix name is
    read at offset 2, the tolerance at offset 4, ``k`` at offset 6,
    ``maxiter`` at offset 8 and the elapsed time at offset 10.

    Parameters
    ----------
    lines : list of str
        Lines of the file with trailing whitespace already stripped.
    grid : dict
        Accepted for call compatibility; this function does not read it.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``run`` with columns ``Matrix``, ``tol``, ``k``,
        ``maxiter`` and ``time``. Empty (but with those columns) when no run
        record is found.

    Raises
    ------
    IndexError
        If a run record is cut off before its offset-10 line.
    ValueError
        If a numeric field of a record cannot be converted.
    """
    columns = ['Matrix', 'tol', 'k', 'maxiter', 'time', 'run']

    # Map run number -> index of the line that carries it.
    run_starts = {}
    start_run = False
    for i, line in enumerate(lines):
        if not line:
            start_run = True
        elif start_run:
            try:
                run_starts[int(line)] = i
            except ValueError:
                # Not a run number: stop looking until the next blank line.
                start_run = False

    records = []
    for run_number, start in run_starts.items():
        records.append([
            lines[start + 2],
            float(lines[start + 4]),
            int(lines[start + 6]),
            int(lines[start + 8]),
            float(lines[start + 10]),
            run_number,
        ])

    # Passing the column names to the constructor keeps the frame well-formed
    # when there are no records; assigning them afterwards fails on an empty frame.
    return pd.DataFrame(records, columns=columns).set_index('run')

# Inspect Data

In [138]:
# Parse info.txt once, then show the first few run records as a sanity check.
lines = get_data(run_dir)
grid = parse_grid(lines)
get_run_data(lines, grid).head()

Unnamed: 0_level_0,Matrix,tol,k,maxiter,time
run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5K_20K,0.01,10,100,16.258906
2,5K_20K,0.003594,10,100,26.405412
3,5K_20K,0.001292,10,100,34.323761
4,5K_20K,0.000464,10,100,36.478796
5,5K_20K,0.000167,10,100,37.964726


# Checking Real Accuracy vs FlashPCA Accuracy

In [141]:
def rmse_k(array, u, s):
    """Root-mean-square size of the residual ``(1/n) * (A A^T u - u s)``.

    Parameters
    ----------
    array : 2-D array with shape ``(n, m)``
        The data matrix ``A``.
    u : 2-D array with ``k`` columns
        Candidate vectors; must be compatible with ``A^T u``.
    s : 2-D array
        Matrix multiplied onto ``u`` from the right (callers in this notebook
        pass a diagonal matrix of values).

    Returns
    -------
    The computed scalar ``sqrt(||residual||_F ** 2 / (m * k))``.

    NOTE(review): the residual has ``n * k`` entries but the mean is taken
    over ``m * k`` -- confirm that this normalisation is intended.
    """
    n_rows, n_cols = array.shape
    _, n_components = u.shape

    scaled_residual = (1/n_rows) * (array.dot(array.T.dot(u)) - u.dot(s))
    frobenius = da.linalg.norm(scaled_residual, ord='fro')

    return da.sqrt(frobenius**2/(n_cols*n_components)).compute()

def find_accuracy(run_dir, matrix_dir='../run_directory/matricies'):
    """Print residual diagnostics for every run recorded in ``run_dir``.

    For each run listed in ``info.txt`` this loads the saved ``values`` and
    ``vectors`` arrays from ``<run_dir>/<run_number>/``, opens the matrix
    named in the run record, and prints (a) the top-left 10x10 corner of the
    transposed matrix and (b) the first 3 rows / 10 columns of
    ``A A^T V - V diag(values)``.

    Parameters
    ----------
    run_dir : str
        Directory containing ``info.txt`` and the numbered run directories.
    matrix_dir : str, optional
        NOTE(review): currently ignored -- matrices are always looked up under
        ``<run_dir>/../../run_directory/matricies``. Kept for call
        compatibility; confirm which location is intended before wiring it in.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    Processing stops at the first run whose ``values`` or ``vectors`` file is
    missing.
    """
    lines = get_data(run_dir)
    grid = parse_grid(lines)
    run_data = get_run_data(lines, grid)

    for run_number in run_data.index:
        vec_values_dir = os.path.join(run_dir, str(run_number))
        try:
            values = da.diag(np.load(os.path.join(vec_values_dir, 'values')))
            vectors = np.load(os.path.join(vec_values_dir, 'vectors'))
        except FileNotFoundError:
            break

        # `run_number` is an index label, not a position, so it must go through
        # .loc; positional lookup would read a different row and run off the
        # end of the frame for the last run.
        matrix_name = str(run_data.loc[run_number, 'Matrix'])
        array_file_path = os.path.abspath(
            os.path.join(run_dir, '../../run_directory/matricies', matrix_name) + '.h5py')

        # Everything that reads the dataset is computed inside the `with`, so
        # the file can be closed before moving on to the next run.
        with h5py.File(array_file_path, 'r') as f:
            array = dask.array.from_array(f['/array']).T

            print(array[0:10,0:10].compute())

            print((array.dot(array.T.dot(vectors)) - vectors.dot(values))[0:3, 0:10].compute())

            # TODO: report rmse_k(array, vectors, values) once the residual looks right.

In [142]:
# Print the per-run diagnostics (matrix corner and residual slice) for run_dir.
find_accuracy(run_dir)

[[0 0 0 1 1 1 0 0 2 1]
 [1 0 0 0 1 1 1 1 1 0]
 [0 1 0 0 1 1 1 0 0 0]
 [0 0 1 1 1 1 1 0 1 0]
 [0 0 0 0 1 0 1 0 0 0]
 [0 1 0 0 1 0 0 1 1 0]
 [0 0 0 0 0 0 0 1 1 0]
 [0 1 0 0 0 0 0 2 1 0]
 [1 0 0 1 1 0 1 1 0 1]
 [1 0 0 0 2 0 0 0 2 0]]
[[ 4.73622506e+03  3.90737786e+03  1.84356087e+04  9.96469508e+02
   3.52974254e+01 -1.71047587e+02  3.66985792e+01  1.03735132e+01
   3.71212391e+01  1.56747643e+00]
 [ 4.77565972e+03  2.79083483e+03  1.68016515e+04 -5.18446563e+01
   1.26043510e+02 -1.09233958e+02  1.75880212e+02 -4.59785027e+01
   1.94634453e+01  1.81062296e+02]
 [ 6.28639626e+03  3.95305050e+03  1.90443764e+04 -4.72651184e+02
   3.60566879e+01 -1.66145986e+02  1.22108912e+02 -6.49339901e+01
  -8.56678381e+00  1.36012573e+02]]
[[0 0 0 1 1 1 0 0 2 1]
 [1 0 0 0 1 1 1 1 1 0]
 [0 1 0 0 1 1 1 0 0 0]
 [0 0 1 1 1 1 1 0 1 0]
 [0 0 0 0 1 0 1 0 0 0]
 [0 1 0 0 1 0 0 1 1 0]
 [0 0 0 0 0 0 0 1 1 0]
 [0 1 0 0 0 0 0 2 1 0]
 [1 0 0 1 1 0 1 1 0 1]
 [1 0 0 0 2 0 0 0 2 0]]
[[ 4.73622506e+03  3.90737786e+03  1

KeyboardInterrupt: 

In [15]:
# Load the four arrays saved for run 1 (sub-directory '1' of run_dir).
values, vectors, scale, center = (
    np.load(os.path.join(run_dir, '1', array_name))
    for array_name in ('values', 'vectors', 'scale', 'center')
)

In [14]:
def rmse_k_scale(array, u, s, scale, center):
    """Root-mean-square size of the residual ``(1/n) * (A A^T u - u s)``.

    Parameters
    ----------
    array : 2-D array with shape ``(n, m)``
        The data matrix ``A``.
    u : 2-D array with ``k`` columns
        Candidate vectors; must be compatible with ``A^T u``.
    s : 2-D array
        Matrix multiplied onto ``u`` from the right.
    scale, center
        NOTE(review): accepted but not used by the current body -- the
        computation is the unscaled, uncentred one. Presumably centring and
        scaling were meant to be applied to ``array`` first; confirm.

    Returns
    -------
    The computed scalar ``sqrt(||residual||_F ** 2 / (m * k))``.
    """
    n_rows, n_cols = array.shape
    _, n_components = u.shape

    scaled_residual = (1/n_rows) * (array.dot(array.T.dot(u)) - u.dot(s))
    frobenius = da.linalg.norm(scaled_residual, ord='fro')

    return da.sqrt(frobenius**2/(n_cols*n_components)).compute()

Unnamed: 0,Array,Chunk
Bytes,100.00 MB,100.00 MB
Shape,"(5000, 20000)","(5000, 20000)"
Count,4 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 100.00 MB 100.00 MB Shape (5000, 20000) (5000, 20000) Count 4 Tasks 1 Chunks Type int8 numpy.ndarray",20000  5000,

Unnamed: 0,Array,Chunk
Bytes,100.00 MB,100.00 MB
Shape,"(5000, 20000)","(5000, 20000)"
Count,4 Tasks,1 Chunks
Type,int8,numpy.ndarray


In [50]:
# Repeat `center` once per column so it matches the (5000, 20000) matrix shape,
# then let dask choose the chunking. NOTE(review): 20000 is hard-coded; it
# presumably should come from array.shape[1].
da.rechunk(da.tile(center, (20000, 1)).T, 'auto')

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,100.00 MB
Shape,"(5000, 20000)","(5000, 2500)"
Count,40010 Tasks,8 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 800.00 MB 100.00 MB Shape (5000, 20000) (5000, 2500) Count 40010 Tasks 8 Chunks Type float64 numpy.ndarray",20000  5000,

Unnamed: 0,Array,Chunk
Bytes,800.00 MB,100.00 MB
Shape,"(5000, 20000)","(5000, 2500)"
Count,40010 Tasks,8 Chunks
Type,float64,numpy.ndarray


In [56]:
# Project `center` through array.T and repeat the resulting vector as 10
# identical columns. NOTE(review): 10 is hard-coded; presumably the number of
# components in `vectors` -- confirm.
da.rechunk(da.tile(array.T.dot(center), (10, 1)).T, 'auto')

Unnamed: 0,Array,Chunk
Bytes,1.60 MB,1.60 MB
Shape,"(20000, 10)","(20000, 10)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.60 MB 1.60 MB Shape (20000, 10) (20000, 10) Count 29 Tasks 1 Chunks Type float64 numpy.ndarray",10  20000,

Unnamed: 0,Array,Chunk
Bytes,1.60 MB,1.60 MB
Shape,"(20000, 10)","(20000, 10)"
Count,29 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [71]:
# Shape check for the correction term used in the next cell: center dotted with
# an all-ones matrix of array's shape, repeated as 10 columns.
da.tile(center.dot(da.ones(array.shape)), (10,1)).T.shape

(20000, 10)

In [73]:
# array.T @ (array @ vectors), minus a term in which every entry equals
# sum(center) (center dotted with an all-ones matrix, tiled to 10 columns).
# NOTE(review): presumably an attempt to account for centring -- confirm the
# algebra before relying on `test`.
test = array.T.dot(array.dot(vectors)) - da.tile(center.dot(da.ones(array.shape)), (10,1)).T

In [95]:
# Confirm `center` is a 1-D vector with one entry per matrix row.
center.shape

(5000,)

In [91]:
# Residual of `test` against `vectors` scaled column-wise by sqrt(values);
# only the top-left 10x10 block is materialised.
(test - vectors.dot(da.diag(values**.5)))[0:10,0:10].compute()

array([[ 1993.80954658,  1164.21660978, 15700.18792732, -1747.26577076,
        -2709.27655305, -2915.6281077 , -2707.86095488, -2734.19085078,
        -2707.43454498, -2743.02412749],
       [ 2033.24941786,    47.24713535, 14065.40978049, -2796.16946505,
        -2618.50250523, -2853.78630111, -2568.62996687, -2790.5611995 ,
        -2725.0983873 , -2563.46618051],
       [ 3544.72326951,  1210.01989041, 16309.25140993, -3217.16441582,
        -2708.51489467, -2910.72265271, -2622.43491035, -2809.52266782,
        -2753.1396259 , -2608.52516222],
       [ 3970.81497863,   710.39620778, 15416.96181264, -2216.35873151,
        -2598.12371354, -2937.30256589, -2671.12713516, -2658.44106005,
        -2914.54498829, -2615.28072733],
       [ 2633.6465234 ,   678.43465303, 16882.74209855, -3014.20737275,
        -2824.21290829, -2901.76724949, -2690.58775419, -2924.35977653,
        -2669.40200333, -2624.4676759 ],
       [ 2558.70724545,  1096.43029176, 15010.58929596, -2128.0744048 ,
   