# Synthetic Data

In [25]:
import numpy as np
from numpy import logical_or as lor
from numpy import logical_and as land
from numpy import logical_not as lnot
from numpy import logical_xor as lxor
import random
import pandas as pd
import matplotlib.pyplot as plt

In [26]:
def gen_3():
  """Return every 3-bit combination as a list of 8 three-element lists.

  Rows are in ascending binary order with the first bit varying slowest:
  [0,0,0], [0,0,1], [0,1,0], ..., [1,1,1]. A brand-new nested list is built
  on each call, so callers are free to mutate the result.
  """
  bits = (0, 1)
  return [[a, b, c] for a in bits for b in bits for c in bits]

def make_cor(y):
  """Create 2 features correlated with a binary target.

  Each feature is a copy of y in which int(0.3 * len(y)) randomly chosen
  positions are replaced by their logical negation (i.e. 30% of the values
  are flipped). The two features use different random positions.

  Side effect: Python's global `random` generator is reseeded with 0, so
  the result is the same on every call for a given len(y).

  y must be a numpy array (it is copied with .copy() and indexed with a
  list of positions). Returns an array of shape (len(y), 2).
  """
  random.seed(0)
  n_flip = int(0.3*len(y))
  columns = []
  for _ in range(2):
    picked = random.sample(range(len(y)), n_flip)
    noisy = y.copy()
    noisy[picked] = lnot(noisy[picked])
    columns.append(noisy)
  return np.column_stack(columns)

### ORAND

In [27]:
def orand(n_obs=50, n_I=92, seed=0):
  """Generate the ORAND dataset, whose target is y = x0 AND (x1 OR x2).

  Feature columns, in order:
    * 3 relevant bits  - the 8 patterns from gen_3(),
    * 3 redundant bits - the logical negation of the relevant bits,
    * 2 correlated features from make_cor(y),
    * n_I irrelevant bits drawn with np.random.randint(2).

  The 8 base patterns are each repeated n_obs // 8 times in a row, and the
  first n_obs % 8 patterns are appended once more to reach n_obs rows.

  np.random is seeded with `seed` before the irrelevant bits are drawn.
  Returns (features, y) as numpy arrays.
  """
  np.random.seed(seed)

  relevant = np.array(gen_3())
  redundant = lnot(relevant).astype(int)
  base = np.hstack([relevant, redundant])

  # replicate the base truth table so that it has n_obs rows
  n_rep, n_extra = divmod(n_obs, 8)
  expanded = np.vstack([np.repeat(base, n_rep, axis=0), base[:n_extra, :]])

  noise = np.random.randint(2, size=[n_obs, n_I])

  x0, x1, x2 = expanded[:, 0], expanded[:, 1], expanded[:, 2]
  y = land(x0, lor(x1, x2)).astype(int)

  correlated = make_cor(y)
  return np.hstack([expanded, correlated, noise]), y

# Build an ORAND dataset with seed=1; n_obs and n_I keep their defaults.
(X, y) = orand(seed=1)

# Display the feature matrix and the target vector.
X, y

(array([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 1],
        ...,
        [1, 1, 1, ..., 0, 1, 1],
        [0, 0, 0, ..., 1, 1, 0],
        [0, 0, 1, ..., 1, 0, 1]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0]))

### ANDOR

In [28]:
def gen_4():
  """Return every 4-bit combination as a list of 16 four-element lists.

  The first 8 rows are the gen_3() patterns with a trailing 0, the last 8
  rows are the same patterns with a trailing 1, so the 4th bit varies
  slowest.
  """
  return [prefix + [last] for last in (0, 1) for prefix in gen_3()]


def andor(n_obs=50, n_I=90, seed=0):
  """Generate the ANDOR dataset, whose target is y = (x0 AND x1) OR (x2 AND x3).

  Feature columns, in order:
    * 4 relevant bits  - the 16 patterns from gen_4(),
    * 4 redundant bits - the logical negation of the relevant bits,
    * 2 correlated features from make_cor(y),
    * n_I irrelevant bits drawn with np.random.randint(2).

  The 16 base patterns are each repeated n_obs // 16 times in a row, and the
  first n_obs % 16 patterns are appended once more to reach n_obs rows.

  np.random is seeded with `seed` before the irrelevant bits are drawn.
  Returns (features, y) as numpy arrays.
  """
  np.random.seed(seed)

  relevant = np.array(gen_4())
  redundant = lnot(relevant).astype(int)
  base = np.hstack([relevant, redundant])

  # replicate the base truth table so that it has n_obs rows
  n_rep, n_extra = divmod(n_obs, 16)
  expanded = np.vstack([np.repeat(base, n_rep, axis=0), base[:n_extra, :]])

  noise = np.random.randint(2, size=[n_obs, n_I])

  x0, x1, x2, x3 = (expanded[:, col] for col in range(4))
  y = lor(land(x0, x1), land(x2, x3)).astype(int)

  correlated = make_cor(y)
  return np.hstack([expanded, correlated, noise]), y

# Build an ANDOR dataset using every default argument.
X, y = andor()


# Display the feature matrix and the target vector.
X, y

(array([[0, 0, 0, ..., 1, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 1],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 1, 1],
        [0, 0, 1, ..., 0, 1, 1]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0]))

### ADDER

In [29]:
def make_cor_adv(y, n_class=4):
  """Make 2 features correlated with a multi-class target (n_class >= 2).

  This generalises make_cor, which only handles a binary target. Each
  feature is a copy of y in which int(0.3 * len(y)) randomly chosen
  positions are changed to a different class: a random offset in
  1..n_class-1 is added and the result is taken modulo n_class. For labels
  in 0..n_class-1 this changes exactly 30% of the values; with n_class=2
  it reduces to flipping the label.

  Side effect: Python's `random` and `np.random` are both reseeded with 0,
  so the result is the same on every call for a given input.

  Parameters:
    y: 1-D sequence of integer class labels (expected in 0..n_class-1).
    n_class: number of classes; must be at least 2.

  Returns an integer array of shape (len(y), 2).

  Raises ValueError if n_class < 2 (no different class exists to move to).
  """
  if n_class < 2:
    raise ValueError("n_class must be at least 2")

  y = np.asarray(y)
  n_ind = int(0.3*len(y))

  # Seed once, BEFORE the loop. Reseeding inside the loop would make both
  # iterations draw the same positions and offsets, i.e. two identical
  # features instead of two different noisy copies.
  random.seed(0)
  np.random.seed(0)

  cor_vars = []
  for _ in range(2):
    cor_i = y.copy()
    ind = random.sample(range(len(y)), n_ind)
    # offsets start at 1: an offset of 0 would leave the label unchanged
    adjust = np.random.randint(1, n_class, size=n_ind)
    cor_i[ind] = (cor_i[ind]+adjust)%n_class
    cor_vars.append(cor_i)
  return np.array(cor_vars).transpose()

In [30]:
def adder(n_obs=50, n_I=92, seed=0):
  """Generate the ADDER dataset: a 1-bit full adder over three input bits.

  With inputs x0, x1, x2 the target is sum_bit + 2 * carry_bit where
    sum_bit   = x0 XOR x1 XOR x2
    carry_bit = (x0 AND x1) OR (x2 AND (x0 XOR x1))
  so y takes values in 0..3.

  Feature columns, in order:
    * 3 relevant bits  - the 8 patterns from gen_3(),
    * 3 redundant bits - the logical negation of the relevant bits,
    * 2 correlated features from make_cor_adv(y),
    * n_I irrelevant bits drawn with np.random.randint(2).

  The 8 base patterns are each repeated n_obs // 8 times in a row, and the
  first n_obs % 8 patterns are appended once more to reach n_obs rows.

  np.random is seeded with `seed` before the irrelevant bits are drawn.
  Returns (features, y); features is a numpy array, y is a Python list.
  """
  np.random.seed(seed)

  relevant = np.array(gen_3())
  redundant = lnot(relevant).astype(int)
  base = np.hstack([relevant, redundant])

  # replicate the base truth table so that it has n_obs rows
  n_rep, n_extra = divmod(n_obs, 8)
  expanded = np.vstack([np.repeat(base, n_rep, axis=0), base[:n_extra, :]])

  noise = np.random.randint(2, size=[n_obs, n_I])

  a, b, c_in = expanded[:, 0], expanded[:, 1], expanded[:, 2]
  sum_bit = lxor(lxor(a, b), c_in).astype(int)
  carry_bit = lor(land(a, b), land(c_in, lxor(a, b))).astype(int)

  # the target is handed back as a list, not an array
  y = list(sum_bit + 2*carry_bit)

  correlated = make_cor_adv(np.array(y))
  return np.hstack([expanded, correlated, noise]), y

# Build an ADDER dataset; the arguments passed here equal adder's defaults.
X, y = adder(n_obs=50,n_I=92, seed=0)


# Display the feature matrix and the target list.
X, y

(array([[0, 0, 0, ..., 0, 1, 1],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 1, 0, 1],
        [0, 0, 0, ..., 0, 1, 1],
        [0, 0, 1, ..., 0, 0, 0]]),
 [0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  0,
  1])

### LED

In [31]:
# import the table showing which LED segments light up for each character
# (the CSV path is relative, so it is resolved against the working directory)
df = pd.read_csv('16_segment_truth_table2.csv')
# missing cells become 0 -- presumably a blank means "segment off"; confirm against the CSV
df = df.fillna(0)
# use the values of the first column as the row index
df.index = df.iloc[:,0].values
# remove the 'char' column (presumably the first column just copied into the index -- TODO confirm)
df.drop(columns='char', inplace=True)
# cast every remaining column to int
df = df.astype(int)

def led(df, n_obs=180,n_I=66, seed=0):
    """Generate the LED dataset from a segment truth table.

    Each row of `df` is one class (one character) and each column is one
    segment; the class label of a row is its position in the table
    (0 .. number_of_rows - 1).

    Feature columns, in order:
      * the relevant segment columns of `df`,
      * redundant columns - the logical negation of the relevant ones,
      * 2 correlated features from make_cor_adv(y, n_class=number_of_rows),
      * n_I irrelevant bits drawn with np.random.randint(2).

    Every table row is repeated n_obs // d times in a row (d = number of
    rows in `df`), and the first n_obs % d rows are appended once more to
    reach n_obs rows. The labels are replicated the same way.

    np.random is seeded with `seed` before the irrelevant bits are drawn.
    Returns (features, y) as numpy arrays.
    """
    np.random.seed(seed)
    rlvnt = df.values
    red = np.logical_not(rlvnt)
    rr = np.hstack([rlvnt, red])

    # number of classes == number of rows in the truth table
    d = rlvnt.shape[0]
    q, r = divmod(n_obs, d)
    rr_exp = np.vstack([np.repeat(rr, q, axis=0), rr[:r,:]])
    irlvnt = np.random.randint(2, size=[n_obs,n_I])

    # Derive the labels from the table size instead of a fixed 36, so the
    # label vector always has the same length as the replicated features.
    y = np.arange(d)
    y = np.hstack([np.repeat(y, q), y[:r]])
    cor = make_cor_adv(y, n_class=d)
    features = np.hstack([rr_exp, cor, irlvnt])
    return features, y
    
    
# Build an LED dataset from the truth table; n_I=90 overrides led's default of 66.
X, y = led(n_obs=180,n_I=90, df=df)


# Display the feature matrix and the target vector.
X, y

(array([[1, 1, 1, ..., 1, 1, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 1, 1],
        ...,
        [1, 1, 0, ..., 0, 0, 0],
        [1, 1, 0, ..., 1, 1, 1],
        [1, 1, 0, ..., 1, 1, 1]]),
 array([ 0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  3,  3,
         3,  3,  3,  4,  4,  4,  4,  4,  5,  5,  5,  5,  5,  6,  6,  6,  6,
         6,  7,  7,  7,  7,  7,  8,  8,  8,  8,  8,  9,  9,  9,  9,  9, 10,
        10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 13, 13, 13,
        13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
        17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 20, 20,
        20, 20, 20, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23,
        23, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 27,
        27, 27, 27, 27, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, 30, 30, 30,
        30, 30, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
        34, 34, 34, 34, 34,

### PRC

In [32]:
def r_total(r_array):
  """Return prod(r) / sum_k(prod of all r except r[k]).

  When every value is non-zero this equals 1 / sum(1 / r_i), i.e. the
  equivalent resistance of the values connected in parallel. The product
  form is kept so that a zero entry yields 0 rather than a division by
  zero (as long as at most one entry is zero).

  Parameters:
    r_array: 1-D sequence of numbers, of any non-zero length.

  Raises ValueError if r_array is empty.
  """
  values = np.asarray(r_array)
  if values.size == 0:
    raise ValueError("r_array must contain at least one value")

  # one term per element: the product of all the OTHER elements
  leave_one_out = sum(np.prod(np.delete(values, k)) for k in range(values.size))
  # np.prod, not np.product: the latter alias was removed in NumPy 2.0
  return np.prod(values)/leave_one_out



def prc(n_obs=50, n_I=90, seed=0):
  """Generate the PRC regression dataset.

  The target of each row is r_total() of its 5 relevant features.

  Feature columns, in order:
    * 5 relevant features  - 3 + N(0, 1) / 3,
    * 5 redundant features - the linear transform 2 * relevant + 3,
    * n_I irrelevant features - the first n_I // 2 drawn as 3 + N(0, 1) / 3,
      the remaining n_I - n_I // 2 drawn as 3 + U[0, 1).

  The defaults give 50 rows and 100 feature columns in total.

  np.random is seeded with `seed` first, so the output is reproducible.
  Returns (features, y); features is a numpy array, y is a list of floats.
  """
  np.random.seed(seed)
  rlvnt = 3 + np.random.randn(n_obs,5)/3
  red = 2*rlvnt+3   #redundant features are linear transform of relevant variables
  rr = np.hstack([rlvnt, red])

  # Give the uniform half whatever is left after the normal half, so an odd
  # n_I still produces exactly n_I irrelevant columns (not n_I - 1).
  n_normal = n_I//2
  n_uniform = n_I - n_normal
  irlvnt = np.hstack([
    3 + np.random.randn(n_obs, n_normal)/3,
    3 + np.random.rand(n_obs, n_uniform),
  ])

  features = np.hstack([rr, irlvnt])
  # the first 5 feature columns are exactly the relevant features
  y = [r_total(row) for row in rlvnt]
  return features, y

# Build a PRC dataset: 50 observations, 90 irrelevant features, seed 0.
X, y = prc(50, 90, 0)


# Display the feature matrix and the target list.
X, y

(array([[3.58801745, 3.13338574, 3.32624599, ..., 3.75249141, 3.00623777,
         3.98430173],
        [2.67424071, 3.31669614, 2.9495476 , ..., 3.35150958, 3.29399125,
         3.92450521],
        [3.04801452, 3.48475784, 3.25367924, ..., 3.96668199, 3.38958478,
         3.29338089],
        ...,
        [3.26039937, 3.49816151, 2.31000499, ..., 3.92095383, 3.21207824,
         3.37636976],
        [2.78752099, 2.86757606, 2.95570647, ..., 3.63373196, 3.66568643,
         3.50202453],
        [2.44133206, 3.38411052, 3.35987286, ..., 3.25422661, 3.8668669 ,
         3.04138781]]),
 [0.6937593826965497,
  0.5986346601351837,
  0.6373712341212849,
  0.6101978926538122,
  0.5848946315830535,
  0.6093782215007715,
  0.5581813299175287,
  0.6218208862731849,
  0.5519747942136953,
  0.5580283345660548,
  0.567797596352605,
  0.596120505328424,
  0.5515023938896366,
  0.563338260945351,
  0.6104027753015651,
  0.5673361753228572,
  0.5914285525477756,
  0.6301520501714677,
  0.629100291597