<a href="https://colab.research.google.com/github/SoumyaShreeram/Microlensing_with_NeuralNets/blob/master/dr02_Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 02. Functions for preprocessing, generation, and plotting data

Preprocessing includes: Sampling, initialization, setting the filename, and loading data

Author: Soumya Shreeram <br>
Script adapted from: Millon Martin & Kevin Müller <br>
Date: 23rd February 2020 <br>


### 1. Sampling, Initialization, loading data

In [0]:
def setSamplingParameters(v_t, data_dir, euler_sampling):
  """
  Function picks the hard-coded sampling parameters for a transverse velocity

  Inputs:
  @v_t :: transverse velocity; only the value 500 gets its own settings,
          every other value uses the fallback settings
  @data_dir :: directory with the model light-curves (not used in this function)
  @euler_sampling :: truthy to select the pixel count used for Euler sampling
                     (only consulted when v_t == 500)

  Returns:
  @sample_params :: list [n_sample, n_sample_max, n_pix], where
                    n_sample is the no. of samples in the training set,
                    n_sample_max the option to reduce the no. of samples, and
                    n_pix the no. of data-points in a light-curve
  """
  if v_t == 500:
    # same sample counts either way; only the light-curve length differs
    return [10000, 10000, 955 if euler_sampling else 486]

  # fallback for every other transverse velocity
  return [20000, 3000, 1137]

def initializer(r_0, sample_params):
  """
  Function builds the class names/categories and zero-filled data arrays

  Input:
  @r_0 :: arr with all the scale radii of the background quasar
  @sample_params :: arr containing defined values for the sampled data;
                    entry 0 (n_sample) and entry 2 (n_pix) are used here

  Returns (in this order):
  @class_cat :: list [classes, categories]; classes are the radii as strings,
                categories are the integers 0 .. len(r_0)-1
  @l_curves :: zeros of shape (n_sample*len(r_0), n_pix, 1) for the light curves
  @out_categories :: zeros of length n_sample*len(r_0) for the categories
  @out_radii :: zeros of length n_sample*len(r_0) for the radii
  """
  n_radii = len(r_0)
  # one slot per light curve: n_sample curves for every radius
  total_curves = sample_params[0] * n_radii

  # class names (strings) paired with their integer categories
  class_cat = [list(map(str, r_0)), np.arange(n_radii)]

  # data arrays to be filled later
  l_curves = np.zeros((total_curves, sample_params[2], 1))
  out_categories = np.zeros(total_curves)
  out_radii = np.zeros(total_curves)

  return class_cat, l_curves, out_categories, out_radii

def getFilename(data_dir, r, v_t, sample_params):
  """
  Function builds the light-curve file name for a given transverse velocity

  Input:
  @data_dir :: path prefix of the directory containing the data; it is
               concatenated as-is, so it must already end with a separator
  @r :: scale radius, inserted into the name via str(r)
  @v_t :: transverse velocity; 300 selects the 'v300' files, any other
          value selects the 'v500' files
  @sample_params :: arr whose entry 0 (n_sample) is written into the name

  Returns @filename :: the concatenated path of the pickle file
  """
  # velocity-dependent part: sub-directory plus start of the file name
  if v_t == 300:
    stem = 'v300/simLC_A-B_n%d_v300_R' % sample_params[0]
  else:
    stem = 'v500/simLC_A-B_n%d_v500_R' % sample_params[0]

  return data_dir + stem + str(r) + '_M0,3.pkl'

def loadData(filename, l_curves, sample_params, r):
  """
  Function loads the light curves of one scale radius from a pickle file

  Input:
  @filename :: path to the pickle file holding the light curves
  @l_curves :: initialized arr to hold light-curve info; it is filled in place
  @sample_params :: arr containing defined values for the sampled data;
                    entry 0 (n_sample) is the max no. of curves stored and
                    entry 1 (n_sample_max) the max no. of file entries read
  @r :: index of the scale radius; curves are written to the rows
        r*n_sample ... r*n_sample + n_sample - 1

  Returns @l_curves :: the same array, filled with the accepted light curves

  An entry is skipped when it is None, when it contains None/NaN values, or
  when its largest absolute value does not exceed 0.5.
  """
  # NOTE: pickle can execute arbitrary code while loading; only open trusted files
  with open(filename, 'rb') as l_curve_file:
    l_curve_data = pickle.load(l_curve_file, encoding='latin1')

  n_sample, n_sample_max = sample_params[0], sample_params[1]

  # never index past the end when the file holds fewer than n_sample_max entries
  n_available = min(n_sample_max, len(l_curve_data))

  # counts the curves accepted so far, i.e. the next free row for this radius
  count = 0

  for i in range(n_available):
    # stop once the slot reserved for this radius is full
    if count == n_sample:
      break

    curve = l_curve_data[i]

    # gets rid of corrupted data points; the identity test is needed because
    # np.any() on the entry is not guaranteed to hand back None
    if curve is None:
      continue

    # float conversion maps None values inside a curve to NaN as well
    curve = np.asarray(curve, dtype=float)
    if np.any(np.isnan(curve)):
      continue

    # keeps only curves whose largest absolute value exceeds 0.5
    if np.max(np.abs(curve)) > 0.5:
      l_curves[r*n_sample + count, :, 0] = curve
      count += 1

  return l_curves

### 2. Plot properties

In [0]:
def setLabels(ax, xlabel, ylabel, title):
    """
    Function sets the axis labels, the title, and the legend of a plot

    Input:
    @ax :: axes object providing set_xlabel, set_ylabel, set_title and legend
    @xlabel :: text for the x-axis
    @ylabel :: text for the y-axis
    @title :: text for the plot title

    Returns None; the axes object is modified in place
    """
    # each label goes to its own axis (they were previously swapped)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel, fontsize=16)
    ax.set_title(title)
    ax.legend()
    return

### 3. Generate training and testing data set

In [0]:
def generateTestTrain(out_catergories, out_radii, r_0, l_curves=None):
  """
  Function to generate test and train data sets

  Input:
  @out_catergories, out_radii :: arr with all the categories/radii of the light curves
  @r_0 :: array with all the entries for the scale radius of the bkg object;
          its length sets the number of one-hot classes
  @l_curves :: arr with the light curves, indexed along its first axis like
               out_catergories; when omitted, the module-level variable
               `l_curves` is used, as older callers rely on

  Returns:
  @train_l_curves, test_l_curves :: light curves of the train/test split
  @data_sets :: list [test_radii, train_radii, test_cat, train_cat]
  @onehot_train, onehot_test :: one-hot encoded categories of the train/test split

  Raises NameError when l_curves is neither passed in nor defined at module level
  """
  if l_curves is None:
    # backward compatibility: fall back to the global used by the 3-argument call
    try:
      l_curves = globals()['l_curves']
    except KeyError:
      raise NameError("name 'l_curves' is not defined; pass it as the "
                      "'l_curves' argument") from None

  # split the indices (80% train / 20% test) so all arrays are split consistently
  categories_idx = np.arange(len(out_catergories))
  train_idx, test_idx = train_test_split(categories_idx, test_size=0.2)

  # train data sets
  train_l_curves = l_curves[train_idx]
  train_radii = out_radii[train_idx]
  train_cat = out_catergories[train_idx]
  # encodes categorical integer features using a one-hot scheme
  onehot_train = tf.keras.utils.to_categorical(train_cat, num_classes=len(r_0))

  # test data sets
  test_l_curves = l_curves[test_idx]
  test_radii = out_radii[test_idx]
  test_cat = out_catergories[test_idx]
  onehot_test = tf.keras.utils.to_categorical(test_cat, num_classes=len(r_0))

  data_sets = [test_radii, train_radii, test_cat, train_cat]
  return train_l_curves, test_l_curves, data_sets, onehot_train, onehot_test