In [1]:
import numpy as np
import time
from scipy import sparse
import random
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from scipy.sparse import linalg as lg

In [2]:
def prep_data():
    x = np.loadtxt("simulated_genos", delimiter=" ", dtype="float32")
    y = np.array([[1] * 10000 + [0] * 10000], dtype="float32")
    y_c = y - 0.5
    return x,y_c

# Permutation Test

In [3]:
x, y_c = prep_data()

In [4]:
y_c

array([[ 0.5,  0.5,  0.5, ..., -0.5, -0.5, -0.5]], dtype=float32)

In [5]:
start_time = time.clock()
xxt = np.dot(x, x.T)
end_time = time.clock()

In [6]:
print('xxt computation time: ', end_time - start_time)

xxt computation time:  4.404278440841199


In [7]:
np.savetxt("XXT",xxt[1:100, 1:100])

In [8]:
#np.set_printoptions(threshold=np.nan)

In [9]:
xxt[1:100, 1:100]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

Using numpy to calculate the $Y'(XX^T)Y'^T$

In [10]:
start_time = time.clock()
q0 = y_c.dot(xxt)
q2 = (q0 * y_c).sum(axis=1)
end_time = time.clock()
print('original q computation time: ', end_time - start_time)
print(q2)

original q computation time:  0.2208549457685569
[223.25]


Convert $ X X^T$ to a sparse matrix

In [11]:
start_time = time.clock()
xxt_sparse = sparse.csr_matrix(xxt)
end_time = time.clock()
print('covert to sparse time: ', end_time - start_time)

covert to sparse time:  4.096197505622398


Using scipy.sparse to calculate

In [12]:
start_time = time.clock()

q00 = y_c @ xxt_sparse
q11 = (y_c * q00).sum(axis=1) 

end_time = time.clock()
print('sparse computation time: ', end_time - start_time)
print(q11)

sparse computation time:  0.007877543305582435
[223.25]


In [13]:
start_time = time.clock()
y_cc = y_c.T
for i in range(1000):
# you can try to forbid shuffle here. See time change.
    #np.random.shuffle(y_cc)
    q0 = y_cc.T.dot(xxt)
    q2 = (q0 * y_cc.T).sum(axis=1)
    #print(q2)
end_time = time.clock()
print('original 1000 q computation time: ', end_time - start_time)

original 1000 q computation time:  121.7461010149969


In [14]:
y_cc

array([[ 0.5],
       [ 0.5],
       [ 0.5],
       ...,
       [-0.5],
       [-0.5],
       [-0.5]], dtype=float32)

In [15]:
start_time = time.clock()
y_cc = y_c.T
for i in range(1000):
# you can try to forbid shuffle here. See time change.
    #numpy.random.choicenp.random.shuffle(y_cc)
    q00 = y_cc.T @ xxt_sparse
    q11 = (y_cc.T * q00).sum(axis=1) 
    if i % 100 == 0:
        print(q11)
end_time = time.clock()
print('sparse 1000 q computation time: ', end_time - start_time)

[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
sparse 1000 q computation time:  0.18097869611577266


In [16]:
start_time = time.clock()
np.random.shuffle(y_cc)
end_time = time.clock()
print('shuffle once time of random.shuffle: ', end_time - start_time)

shuffle once time of random.shuffle:  0.02949502741543597


Very embarrasing thing here is that shuffling is the most time consuming thing!!!

In [17]:
start_time = time.clock()
np.random.permutation(y_cc)
end_time = time.clock()
print('shuffle once time of random.permutation: ', end_time - start_time)

shuffle once time of random.permutation:  0.02523220256759373


In [18]:
y_cc.shape

(20000, 1)

In [19]:
y_c.shape

(1, 20000)

In [20]:
start_time = time.clock()
y_cc[np.random.choice(20000, size = 10000, replace = False)] = 1

end_time = time.clock()
print('shuffle once time of random.int: ', end_time - start_time)

shuffle once time of random.int:  0.004575144714607404


In [21]:
start_time = time.clock()
z = np.zeros((20000, 1))
for i in range(1000):
    y_cc = np.zeros((20000, 1)) - 0.5
    y_cc[np.random.choice(20000, size = 10000, replace = False)] = 0.5
    q00 = y_cc.T @ xxt_sparse
    q11 = (y_cc.T * q00).sum(axis=1) 
    if i % 100 == 0:
        print(q11)
end_time = time.clock()
print('sparse 1000 q computation time, choice: ', end_time - start_time)

[351.25]
[219.25]
[336.25]
[325.25]
[207.25]
[207.25]
[302.25]
[175.25]
[169.25]
[254.25]
sparse 1000 q computation time, choice:  1.0304870269922048


In [22]:
sum(y_cc)

array([0.])

In [23]:
y_cc

array([[-0.5],
       [-0.5],
       [-0.5],
       ...,
       [-0.5],
       [ 0.5],
       [ 0.5]])

In [24]:
z.shape

(20000, 1)

Fianl Test! See the 1000 permutations here!

In [25]:
start_time = time.clock()

m = 1000
x = np.loadtxt("D:\MS\Master Course\BST234\Project\simulated_genos", delimiter=" ", dtype="float32")
x_sparse = sparse.csr_matrix(x)
xxt_sparse = x_sparse @ x_sparse.transpose()
q = np.empty((m, 1))

for i in range(m):
    y_cc = np.zeros((20000, 1)) - 0.5
    y_cc[np.random.choice(20000, size = 10000, replace = False)] = 0.5
    q[i,0] = y_cc.T @ xxt_sparse @ y_cc
    #q00 = y_cc.T @ xxt_sparse
    #q11 = (y_cc.T * q00).sum(axis=1) 
end_time = time.clock()

print('Whole permutation time ', end_time - start_time)


OSError: D:\MS\Master Course\BST234\Project\simulated_genos not found.

In [None]:
print("Expectation:", np.average(q))
plt.hist(q)

# Eigenvalue Calculation