In [3]:
import numpy as np
import time
from scipy import sparse
import random

In [4]:
def prep_data():
    x = np.loadtxt("D:\MS\Master Course\BST234\Project\simulated_genos", delimiter=" ", dtype="float32")
    y = np.array([[1] * 10000 + [0] * 10000], dtype="float32")
    y_c = y - 0.5
    return x, y_c

In [5]:
x, y_c = prep_data()

In [6]:
y_c

array([[ 0.5,  0.5,  0.5, ..., -0.5, -0.5, -0.5]], dtype=float32)

In [7]:
start_time = time.clock()
xxt = np.dot(x, x.T)
end_time = time.clock()

In [8]:
print('xxt computation time: ', end_time - start_time)

xxt computation time:  12.023855756131299


Using numpy to calculate the $Y'(XX^T)Y'^T$

In [9]:
start_time = time.clock()
q0 = y_c.dot(xxt)
q2 = (q0 * y_c).sum(axis=1)
end_time = time.clock()
print('original q computation time: ', end_time - start_time)
print(q2)

original q computation time:  2.0975188085092586
[223.25]


Convert $ X X^T$ to a sparse matrix

In [10]:
start_time = time.clock()
xxt_sparse = sparse.csr_matrix(xxt)
end_time = time.clock()
print('covert to sparse time: ', end_time - start_time)

covert to sparse time:  9.73876684104511


Using scipy.sparse to calculate

In [12]:
start_time = time.clock()

q00 = y_c @ xxt_sparse
q11 = (y_c * q00).sum(axis=1) 

end_time = time.clock()
print('sparse computation time: ', end_time - start_time)
print(q11)

sparse computation time:  0.0012684783763461382
[223.25]


In [13]:
start_time = time.clock()
y_cc = y_c.T
for i in range(1000):
# you can try to forbid shuffle here. See time change.
    #np.random.shuffle(y_cc)
    q0 = y_cc.T.dot(xxt)
    q2 = (q0 * y_cc.T).sum(axis=1)
    #print(q2)
end_time = time.clock()
print('original 1000 q computation time: ', end_time - start_time)

original 1000 q computation time:  158.70266278059165


In [101]:
y_cc

array([[ 0.5],
       [ 0.5],
       [ 0.5],
       ...,
       [-0.5],
       [-0.5],
       [ 0.5]], dtype=float32)

In [17]:
start_time = time.clock()
y_cc = y_c.T
for i in range(1000):
# you can try to forbid shuffle here. See time change.
    #numpy.random.choicenp.random.shuffle(y_cc)
    q00 = y_cc.T @ xxt_sparse
    q11 = (y_cc.T * q00).sum(axis=1) 
    if i % 100 == 0:
        print(q11)
end_time = time.clock()
print('sparse 1000 q computation time: ', end_time - start_time)

[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
[223.25]
sparse 1000 q computation time:  0.31275565300609287


In [18]:
start_time = time.clock()
np.random.shuffle(y_cc)
end_time = time.clock()
print('shuffle once time of random.shuffle: ', end_time - start_time)

shuffle once time of random.shuffle:  0.04801913853549422


Very embarrasing thing here is that shuffling is the most time consuming thing!!!

In [19]:
start_time = time.clock()
np.random.permutation(y_cc)
end_time = time.clock()
print('shuffle once time of random.permutation: ', end_time - start_time)

shuffle once time of random.permutation:  0.04323023799861403


In [20]:
y_cc.shape

(20000, 1)

In [23]:
y_c.shape

(1, 20000)

In [40]:
start_time = time.clock()
y_cc[np.random.choice(20000, size = 10000, replace = False)] = 1

end_time = time.clock()
print('shuffle once time of random.int: ', end_time - start_time)

shuffle once time of random.int:  0.0018431976407100592


In [63]:
start_time = time.clock()
z = np.zeros((20000, 1))
for i in range(1000):
    y_cc = np.zeros((20000, 1)) - 0.5
    y_cc[np.random.choice(20000, size = 10000, replace = False)] = 0.5
    q00 = y_cc.T @ xxt_sparse
    q11 = (y_cc.T * q00).sum(axis=1) 
    if i % 100 == 0:
        print(q11)
end_time = time.clock()
print('sparse 1000 q computation time, choice: ', end_time - start_time)

[202.25]
[216.25]
[159.25]
[233.25]
[186.25]
[211.25]
[253.25]
[224.25]
[247.25]
[246.25]
sparse 1000 q computation time, choice:  1.6444958150452749


In [61]:
sum(y_cc)

array([5000.])

In [62]:
y_cc

array([[ 1. ],
       [ 1. ],
       [ 1. ],
       ...,
       [-0.5],
       [ 1. ],
       [-0.5]])

In [58]:
z.shape

(20000, 1)