forked from ja-che/hidimstat
-
Notifications
You must be signed in to change notification settings - Fork 4
/
data_simulation.py
57 lines (49 loc) · 1.75 KB
/
data_simulation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
from scipy.linalg import toeplitz
def simu_data(n, p, rho=0.25, snr=2.0, sparsity=0.06, effect=1.0, seed=None):
"""Function to simulate data follow an autoregressive structure with Toeplitz
covariance matrix
Parameters
----------
n : int
number of observations
p : int
number of variables
sparsity : float, optional
ratio of number of variables with non-zero coefficients over total
coefficients
rho : float, optional
correlation parameter
effect : float, optional
signal magnitude, value of non-null coefficients
seed : None or Int, optional
random seed for generator
Returns
-------
X : ndarray, shape (n, p)
Design matrix resulted from simulation
y : ndarray, shape (n, )
Response vector resulted from simulation
beta_true : ndarray, shape (n, )
Vector of true coefficient value
non_zero : ndarray, shape (n, )
Vector of non zero coefficients index
"""
# Setup seed generator
rng = np.random.default_rng(seed)
# Number of non-null
k = int(sparsity * p)
# Generate the variables from a multivariate normal distribution
mu = np.zeros(p)
Sigma = toeplitz(rho ** np.arange(0, p)) # covariance matrix of X
# X = np.dot(np.random.normal(size=(n, p)), cholesky(Sigma))
X = rng.multivariate_normal(mu, Sigma, size=(n))
# Generate the response from a linear model
non_zero = rng.choice(p, k)
beta_true = np.zeros(p)
beta_true[non_zero] = effect
eps = rng.standard_normal(size=n)
prod_temp = np.dot(X, beta_true)
noise_mag = np.linalg.norm(prod_temp) / (snr * np.linalg.norm(eps))
y = prod_temp + noise_mag * eps
return X, y, beta_true, non_zero