
# Input Generation
This notebook generates new input data limited by a maximum RatioTotalArea (RTA). For each given combination of r1, r2 and frac it computes the maximum allowed a2 value, i.e. a2 must not exceed that value for the RatioTotalArea to stay below the chosen limit. It then fills up the dataset with every admissible a2 value. Some stray values may appear; these can be removed here or when joining the samples.



Imports.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()
import math

$\mathrm{RTA} = (1-f)\,\frac{1}{r_1} + f\,\frac{a_2^2}{r_2}$. Providing r1, r2 and frac generates a full set of samples limited by RTA. The functions below handle that generation and the comparison with another dataset.

In [None]:
def new_sample_options(r1_range=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50], 
                r2_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50],
                f_range = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99],
                RTA_max = 0.0130):
  """ Compute the largest admissible a2 for every (frac, r1, r2) combination.

      Solves RTA_max = (1-f)*(1/r1) + f*(a2**2)/r2 for a2. Combinations whose
      first term already reaches RTA_max (no positive budget left for a2) are
      skipped.

      Args:
        r1_range: iterable of r1 values.
        r2_range: iterable of r2 values.
        f_range:  iterable of frac values (f == 0 divides by zero whenever a
                  positive budget remains).
        RTA_max:  upper limit on the RatioTotalArea.

      Returns:
        DataFrame with float columns [frac, r1, r2, a2<=], one row per kept
        combination, ordered by frac, then r1, then r2. Also prints the
        number of rows. """

  names = ['frac', 'r1', 'r2', 'a2<=']

  # Collect plain rows and build the frame once; growing a DataFrame with
  # .loc[len(df)] copies the whole frame on every insertion.
  rows = []
  for f in f_range:
    for r1 in r1_range:
      # RTA budget left for the second term; it does not depend on r2, so it
      # is evaluated once per (f, r1) pair.
      x = RTA_max-(1-f)*(1/r1)
      if x > 0:
        for r2 in r2_range:
          rows.append([f, r1, r2, np.sqrt(x*(r2/f))])

  new_sample_points = pd.DataFrame(rows, columns=names, dtype=float)

  print("Number of new samples:", len(new_sample_points))

  return new_sample_points

def a2_lessthan_visual(df, a2values=[0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.01, 0.005]):
  """ Draw a histogram of the 'a2<=' upper bounds stored in df.

      df must provide an 'a2<=' column (for example the frame returned by
      new_sample_options). a2values is only echoed to stdout for reference;
      it has no influence on the plot. Returns nothing. """
  print("Distinct a2 values: {}".format(a2values))
  upper_bounds = df["a2<="]
  fig, axs = plt.subplots(figsize=(12,6))
  # Target the freshly created axes explicitly instead of relying on
  # matplotlib's "current axes" state.
  sns.histplot(upper_bounds, shrink=.8, color='red', bins=100, ax=axs)

def get_all_samples(new_sample_points, 
                    a2values=[0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.01, 0.005],
                    shuffle=False):
  """ Expand every (frac, r1, r2, a2<=) row into one row per admissible a2.

      Args:
        new_sample_points: DataFrame with columns [frac, r1, r2, a2<=].
        a2values: candidate a2 values, expected in DESCENDING order. For each
                  input row the first candidate that is <= the row's bound and
                  every candidate after it in the list are used.
        shuffle:  when True, rows are shuffled (fixed seed 97) and the index
                  is renumbered from 0.

      Returns:
        DataFrame with columns [r1, 2a2, r2, frac] (2a2 is twice a2) and
        duplicates removed. Rows whose bound is below every candidate
        contribute nothing. """

  names = ['frac', 'r1', 'r2', 'a2']
  print("Distinct a2 values: {}".format(a2values))

  # Gather all rows first and build the frame once: DataFrame.append was
  # removed in pandas 2.0 and appending per row is quadratic anyway.
  records = []
  for _, point in new_sample_points.iterrows():
    a2_max = float(point["a2<="])
    # Position of the first candidate that fits under this row's bound.
    start = next((k for k, a2 in enumerate(a2values) if a2 <= a2_max), None)
    if start is None:
      continue
    for a2 in a2values[start:]:
      records.append([point["frac"], point["r1"], point["r2"], a2])

  df = pd.DataFrame(records, columns=names, dtype=float).drop_duplicates()
  df = df.assign(**{"2a2": df["a2"]*2})
  print("# of total new samples:", len(df))
  if shuffle:
    df = df.sample(frac=1, random_state=97).reset_index(drop=True)
  columns_titles = ["r1", "2a2", "r2", "frac"]
  return df.reindex(columns=columns_titles)

def compare(original_data, new_sample_inputs, only_unique=False):
  """ Merge old and new sample inputs and report duplicate statistics.

      Both arguments must be DataFrames containing the columns
      [r1, 2a2, r2, frac]; other columns are ignored. Duplicates are dropped
      from each frame separately and then from their concatenation (original
      rows first). Rows are matched on exact value equality.

      Args:
        original_data:     existing samples.
        new_sample_inputs: candidate new samples.
        only_unique:       when True return only the new rows that are absent
                           from original_data (index renumbered from 0);
                           otherwise return every unique row of both frames
                           (index labels of the inputs are kept).

      Returns:
        DataFrame with columns [r1, 2a2, r2, frac]. """
  columns_titles = ["r1", "2a2", "r2", "frac"]

  original_data = original_data[columns_titles].copy()
  new_sample_inputs = new_sample_inputs[columns_titles].copy()
  print("# of ORIGINAL samples                :", len(original_data))
  print("# of NEW samples                     :", len(new_sample_inputs))

  original_data = original_data.drop_duplicates()
  original_unique = len(original_data)
  new_sample_inputs = new_sample_inputs.drop_duplicates()
  print("# of ORIGINAL samples W/O DUPLICATES :", original_unique)
  print("# of NEW samples W/O DUPLICATES      :", len(new_sample_inputs))

  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  all_data = pd.concat([original_data, new_sample_inputs])
  print()
  print("# of ALL samples                     :", len(all_data))

  all_unique_samples = all_data.drop_duplicates()
  print("# UNIQUE samples                     :", len(all_unique_samples))
  if only_unique:
    # The de-duplicated original rows come first and are all kept (first
    # occurrence wins), so everything after them is genuinely new.
    # NOTE(review): an earlier comment reported ~10 unexpected extra rows
    # here; possibly float rounding makes equal samples compare unequal -
    # TODO confirm.
    new_unique_samples = all_unique_samples[original_unique:].reset_index(drop=True)
    print("# UNIQUE NEW samples                 :", len(new_unique_samples))
    print("Returned only unique "+ str(len(new_unique_samples))+" samples.")
    return new_unique_samples
  else:
    print("Returned all unique "+ str(len(all_unique_samples))+" samples.")
    return all_unique_samples

def augment_data(dataset):
  """ Add derived geometry columns to dataset IN PLACE and return it.

      Requires the columns [r1, 2a2, r2, frac]. Adds, in this order:
      a1 (constant 1/2), b1 = a1/r1, a2 = 2a2/2, b2 = a2/r2,
      area1 = a1*b1*pi, area2 = a2*b2*pi, TotalArea = area1 + area2 and
      RatioTotalArea = area1*(1-frac) + area2*frac. """

  dataset["a1"] = 1/2
  dataset["b1"] = dataset["a1"].div(dataset["r1"])
  dataset["a2"] = dataset["2a2"].div(2)
  dataset["b2"] = dataset["a2"].div(dataset["r2"])

  # Both areas follow the same a*b*pi rule.
  for suffix in ("1", "2"):
    dataset["area"+suffix] = dataset["a"+suffix].mul(dataset["b"+suffix]).mul(np.pi)

  dataset["TotalArea"] = dataset["area1"].add(dataset["area2"])

  # Areas weighted by the share of each component.
  share_of_second = dataset["frac"]
  weighted_first = dataset["area1"].mul(1-share_of_second)
  weighted_second = dataset["area2"].mul(share_of_second)
  dataset["RatioTotalArea"] = weighted_first.add(weighted_second)
  return dataset

These functions save the inputs to text files. They presume the Monte Carlo code will be run from the terminal with two lines per sample:

[1] Instantiator

[2] Parameters and inputs

In [None]:
def to_txt(loc):
  """ Convert a CSV of inputs into a shuffled, whitespace-separated txt file.

      Args:
        loc: path (str) of the CSV holding the columns [r1, 2a2, r2, frac].
             A leading 'Unnamed: 0' index column is ignored if present.

      The rows are shuffled with a fixed seed (97), reduced to the columns
      [r1, 2a2, r2, frac] in that order and written with '%f' formatting next
      to the CSV: a trailing '.csv' is replaced by '.txt', any other name
      simply gets '.txt' appended. Returns nothing. """
  uploaded = pd.read_csv(loc)
  # Tolerate CSVs written without an index column instead of raising KeyError.
  test = uploaded.drop(columns=["Unnamed: 0"], errors="ignore")
  test = test.sample(frac=1, random_state=97)
  columns_titles = ["r1", "2a2", "r2", "frac"]
  test = test.reindex(columns=columns_titles)

  # str.rstrip('csv') strips any trailing 'c'/'s'/'v' characters rather than
  # the suffix, which mangles names that do not end in '.csv'.
  stem = loc[:-len(".csv")] if loc.lower().endswith(".csv") else loc
  loc_txt = stem + ".txt"
  np.savetxt(loc_txt, test.values, fmt='%f')

def add_commands(inputfilename, new_file_loc, split_size=25, save_all=False, check_files=False):
  """ Turn a txt file of inputs into batches of terminal commands.

      Every non-blank line of inputfilename becomes two command lines: a g++
      compile command followed by the run command carrying that line's
      values.

      Args:
        inputfilename: path of the txt file with one sample per line.
        new_file_loc:  string prefix for the output files (e.g. a folder path
                       ending in '/'); the folder must already exist.
        split_size:    number of samples per batch file (two lines each).
        save_all:      also write every command to <prefix>AllInputs.txt.
        check_files:   re-read each batch file and print its line count.

      Batch files are named <prefix>1Inputs.txt, <prefix>2Inputs.txt, ... and
      are overwritten on each call, so re-running does not duplicate
      commands.

      Raises:
        ValueError: if split_size is smaller than 1. """
  if split_size < 1:
    raise ValueError("split_size must be at least 1")

  # Blank lines carry no sample and would produce a run command without inputs.
  with open(inputfilename) as f:
    samples = [line.rstrip('\n') for line in f if line.strip()]

  new_lines = []
  for sample in samples:
    new_lines.append('g++ -o cEP2PBCself_brent cEP2PBCself_brent.cpp\n')
    new_lines.append('./cEP2PBCself_brent 32 32 '+sample+' 10000 10 $RANDOM &\n')
  print("Length of file with added commands is {} lines.".format(len(new_lines)))

  if save_all:
    name = new_file_loc + "AllInputs.txt"
    with open(name, 'w') as the_file:
      the_file.writelines(new_lines)
    print("{} inputs saved successfully.".format(len(samples)))

  # Each sample occupies two lines (compile + run).
  lines_per_file = split_size*2
  num_files = math.ceil(len(new_lines)/lines_per_file)
  print("There should be {x} files with ~{y} samples in each.".format(x=num_files, y=split_size))
  for i in range(num_files):
    name = new_file_loc+str(i+1)+'Inputs.txt'
    # Slicing past the end is safe, so the last (shorter) batch needs no
    # special case.
    with open(name, 'w') as the_file:
      the_file.writelines(new_lines[lines_per_file*i:lines_per_file*(i+1)])

  if check_files:
    for i in range(1, num_files+1):
      i_str = str(i)
      name = new_file_loc+i_str+'Inputs.txt'
      with open(name) as f:
        written = f.readlines()
      print("File {x}  : {y} lines".format(x=i_str,y=len(written)))
  
  print("All files are saved!")

Load the data

In [None]:
uploaded = pd.read_csv("alldata.csv")

# Work on a separate frame without the saved index column, so the raw
# upload stays untouched.
dataset = uploaded.drop(columns=["Unnamed: 0"])

# Largest RatioTotalArea present in the existing data.
dataset["RatioTotalArea"].max()

Generate new sample points as per RTA equation. 

In [None]:
# r1 and r2 are scanned over the same grid of values.
ratio_grid = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 30, 40, 50]
fraction_grid = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]

x = new_sample_options(r1_range=ratio_grid,
                       r2_range=ratio_grid,
                       f_range=fraction_grid,
                       RTA_max=0.0130)
x.head()

In [None]:
# frac values that survived the RTA limit.
x["frac"].unique()

Visualise a2 possibilities.

In [None]:
# Histogram of the a2 upper bounds computed above.
a2_lessthan_visual(df=x)

Fill up with all samples.

In [None]:
# Candidate a2 values, largest first.
a2_candidates = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05, 0.025, 0.01, 0.005]

new_samples = get_all_samples(x, a2values=a2_candidates, shuffle=True)
new_samples.head()

In [None]:
# frac values present in the expanded sample set.
new_samples["frac"].unique()

Check New vs Old Data. Look for duplicates in the old, in the new and between the two. Only include new samples. 

In [None]:
# Both frames are reduced to the same four input columns before comparing.
input_columns = ["r1", "2a2", "r2", "frac"]
original_data = dataset[input_columns].copy()
new_samples = new_samples[input_columns].copy()

# Keep only samples that are not already in the original data, then shuffle
# them with a fixed seed.
new_unique_samples = compare(original_data, new_samples, only_unique=True)
new_unique_samples = new_unique_samples.sample(frac=1, random_state=97)

In [None]:
# frac values left after removing samples that already exist.
new_unique_samples["frac"].unique()

Save the data to a txt.

In [None]:
# Preview the first rows that will be written out.
new_unique_samples.head(n=5)

In [None]:
# Written as whitespace-separated text in the frame's column order.
output_path = 'MoreSamples.txt'
np.savetxt(output_path, new_unique_samples.values, fmt='%f')
# The message used to say "csv" although a txt file is written.
print("Saved {} samples to {}.".format(len(new_unique_samples), output_path))

## Executable Texts
Here we will save the inputs in batch sizes of 25. Each txt doc will have 50 lines:

[1] Instantiator

[2] Parameters and inputs

Open these txts and copy the contents into the terminal.

In [None]:
# NOTE: Store the txt files in a new folder. The folder needs to be present.
# The second argument is a plain string prefix, so it must end with '/'.

add_commands(inputfilename="MoreSamples.txt",
             new_file_loc="/MoreSamples6/",
             split_size=25,
             save_all=True,
             check_files=True)