In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
import seaborn as sns
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

# Seed NumPy's global RNG, which the np.random.normal calls in the noise
# helpers draw from. seed_val is also passed to the data-splitting call.
seed_val = 2021
np.random.seed(seed_val)

In [None]:
def split_data(dataset, seed, train_ratio=0.6, shuffle=True, highest_at_top=False, no_outliers=False):
    """Split a DataFrame into a train and a test subset.

    Three mutually exclusive modes are supported (checked in this order):

    * ``highest_at_top``: every row with ``"eta c" > 1.0`` is placed first so
      that it lands in the training set, the remaining rows are shuffled
      behind it and the first ``train_ratio`` share of that stack is the
      training set.
    * ``no_outliers``: the training set is drawn only from rows with
      ``"eta c" <= 1.15``; all rows above 1.15 go to the test set. The
      training size is ``int(len(dataset) * train_ratio)`` (capped by the
      number of eligible rows).
    * default: a plain random sample of ``train_ratio`` of the rows.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to split. The two special modes require an ``"eta c"`` column.
    seed : int
        Random state for the shuffles. NOTE: the default mode samples with a
        fixed ``random_state=0``; ``seed`` only influences it through the
        optional initial shuffle.
    train_ratio : float
        Share of rows assigned to the training set.
    shuffle : bool
        Shuffle the rows before splitting (and, in ``highest_at_top`` mode,
        shuffle both resulting subsets again).
    highest_at_top, no_outliers : bool
        Mode switches described above.

    Returns
    -------
    (train_dataset, test_dataset) : tuple of pandas.DataFrame
        Both keep the original index labels.
    """
    if shuffle:
        dataset = dataset.sample(frac=1, random_state=seed)

    if highest_at_top:
        high = dataset[dataset["eta c"] > 1.0]
        # Shuffle only the rows with eta c <= 1.0 and queue them after the
        # high-eta rows so the latter are guaranteed to be in the train slice.
        rest = dataset.drop(high.index).sample(frac=1, random_state=seed)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        stacked = pd.concat([high, rest])

        train_dataset = stacked[0:int(len(stacked) * train_ratio)]
        test_dataset = dataset.drop(train_dataset.index)

        if shuffle:
            train_dataset = train_dataset.sample(frac=1, random_state=seed)
            test_dataset = test_dataset.sample(frac=1, random_state=seed)

        return train_dataset, test_dataset

    if no_outliers:
        inliers = dataset[dataset["eta c"] <= 1.15]
        outliers = dataset[dataset["eta c"] > 1.15]
        # The training size is relative to the full dataset, not the inliers.
        num_samples = int(len(dataset) * train_ratio)
        inliers = inliers.sample(frac=1, random_state=seed)
        train_dataset = inliers[0:num_samples]
        # Every outlier plus the unused inliers form the test set.
        test_dataset = pd.concat([outliers, inliers.drop(train_dataset.index)])

        return train_dataset, test_dataset

    # Fixed random_state kept on purpose so existing splits stay reproducible.
    train_dataset = dataset.sample(frac=train_ratio, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)

    return train_dataset, test_dataset

def split_data_with_vali(dataset, seed, train_ratio=0.6, vali_ratio=0.2, shuffle=True, highest_at_top=False, no_outliers=False):
    """Randomly split a DataFrame into train, validation and test subsets.

    ``train_ratio`` of the rows are sampled for training; ``vali_ratio`` (a
    share of the *whole* dataset) is then sampled from the remaining rows and
    whatever is left becomes the test set.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to split.
    seed, shuffle, highest_at_top, no_outliers
        Accepted for signature parity with ``split_data`` but not used: both
        samples are drawn with a fixed ``random_state=0``.
    train_ratio : float
        Share of all rows used for training.
    vali_ratio : float
        Share of all rows used for validation.

    Returns
    -------
    (train_dataset, vali_dataset, test_dataset) : tuple of pandas.DataFrame
        Disjoint subsets that keep the original index labels.
    """
    train_dataset = dataset.sample(frac=train_ratio, random_state=0)
    test_and_vali = dataset.drop(train_dataset.index)

    holdout_ratio = 1 - train_ratio
    if holdout_ratio > 0:
        vali_frac = vali_ratio / holdout_ratio
        # Floating-point rounding can push an exact split slightly above 1
        # (e.g. 0.1 / (1 - 0.9)), which DataFrame.sample rejects. Snap those
        # cases to 1.0; genuinely oversized ratios still raise.
        if vali_frac > 1.0 and np.isclose(vali_frac, 1.0):
            vali_frac = 1.0
    else:
        # Nothing is left after the training sample, so nothing to validate on.
        vali_frac = 0.0

    vali_dataset = test_and_vali.sample(frac=vali_frac, random_state=0)
    test_dataset = test_and_vali.drop(vali_dataset.index)

    return train_dataset, vali_dataset, test_dataset

def add_bias(data):
    """Return a float copy of 2-D ``data`` with a trailing column of -1.

    The extra column is the constant bias input; the original columns keep
    their order in front of it.
    """
    n_rows, n_cols = np.shape(data)[0], np.shape(data)[1]
    with_bias = np.full((n_rows, n_cols + 1), -1.0)
    with_bias[:, :n_cols] = data
    return with_bias

def add_noise(dataset, label_column=4, noise_var=0.01, input_n=False, output_n=False, augment=False, aug_frac=0.5, n_c_limit=1.4):
    """Split training data into feature/label arrays, optionally with Gaussian noise.

    Only rows with ``"eta c" < n_c_limit`` are eligible for noise. Columns
    ``0 .. label_column-1`` are the features and column ``label_column`` is
    the label. ``noise_var`` is passed to ``np.random.normal`` as its scale.

    With ``augment`` a random ``aug_frac`` share of the eligible rows is
    perturbed and stacked *below* the complete, untouched input data;
    otherwise only the (possibly perturbed) eligible rows are returned.

    Returns
    -------
    (features, labels) : tuple of numpy.ndarray
        ``labels`` has shape ``(n, 1)``.
    """
    def _features_and_labels(frame):
        values = frame.to_numpy()
        return values[:, 0:label_column], values[:, label_column].reshape(-1, 1)

    untouched = dataset.copy()
    noisy_source = dataset[dataset["eta c"] < n_c_limit]
    print("# samples with restriction:", len(noisy_source))

    if augment:
        noisy_source = noisy_source.sample(frac=aug_frac, random_state=7)

    # arrays that receive the noise
    features, labels = _features_and_labels(noisy_source)

    if input_n:
        features = features + np.random.normal(0, noise_var, features.shape)

    if output_n:
        labels = labels + np.random.normal(0, noise_var, labels.shape)

    if not augment:
        return features, labels

    # augmenting: original samples first, noisy copies appended underneath
    base_features, base_labels = _features_and_labels(untouched)
    print("Train # samples:", len(base_features))
    print("# samples with noise:", len(noisy_source))
    all_features = np.vstack((base_features, features))
    all_labels = np.vstack((base_labels, labels))
    print("New # of samples:", len(all_features))
    return all_features, all_labels

def check_eta_range(dataset, nums=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4]):
    """Plot how the ``"eta c"`` values of ``dataset`` are distributed over bins.

    ``nums`` is a list of ascending bin edges. Two scatter plots are drawn
    side by side:

    * left: for every edge after the first, the percentage of rows with
      ``eta c <= edge``; points above 99% are highlighted in red.
    * right: for every pair of neighbouring edges, the percentage of rows
      with ``lower <= eta c < upper``; a bin is highlighted in red once the
      bins before it already sum to more than 99%.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain an ``"eta c"`` column and at least one row.
    nums : sequence of float
        Bin edges. The sequence is never modified.
    """
    # Work on a copy: popping from `nums` itself would shrink the shared
    # default list (and any list the caller passed) on every call.
    edges = list(nums)
    lowers, uppers = edges[:-1], edges[1:]

    eta = dataset["eta c"]
    total = len(dataset)

    less_than_percents = [(int((eta <= hi).sum()) / total) * 100 for hi in uppers]
    in_range_percents = [
        (int(((eta < hi) & (eta >= lo)).sum()) / total) * 100
        for lo, hi in zip(lowers, uppers)
    ]
    x_ranges = ["{one}<EtaC<{two}".format(one=lo, two=hi) for lo, hi in zip(lowers, uppers)]

    fig, ax = plt.subplots(ncols=2, figsize=(15,8))

    ax[0].scatter(uppers, less_than_percents)
    ax[0].set_ylabel("Percentage")
    ax[0].set_xlabel("Eta c")
    ax[0].set_title("Percentage of data with Eta C less than x")
    saturated = [k for k, pct in enumerate(less_than_percents) if pct > 99.0]
    if saturated:
        # One scatter call -> a single legend entry instead of one per point.
        ax[0].scatter([uppers[k] for k in saturated],
                      [less_than_percents[k] for k in saturated],
                      c="red", label=">99%")
        ax[0].legend()

    ax[1].scatter(x_ranges, in_range_percents)
    ax[1].set_ylabel("Percentage")
    ax[1].set_xlabel("Eta c")
    ax[1].set_title("Percentage of data with Eta C within range")
    covered = [k for k in range(len(in_range_percents)) if sum(in_range_percents[0:k]) > 99]
    if covered:
        ax[1].scatter([x_ranges[k] for k in covered],
                      [in_range_percents[k] for k in covered],
                      c="red", label="sum>99%")
        ax[1].legend()

    # The range labels are long; rotate them so they do not overlap.
    ax[1].tick_params(axis='x', labelrotation=90)



In [None]:
def unique_values(dataset, input_column, output_column="eta c", plot=False, output=False):
    """Keep the first row for each distinct value of ``input_column``.

    Parameters
    ----------
    dataset : pandas.DataFrame
    input_column : str
        Column whose duplicates are dropped.
    output_column : str
        Second column returned / plotted next to ``input_column``.
    plot : bool
        When True, return a scatter plot (matplotlib Axes) of the two columns
        instead of the data.
    output : bool
        When True, print the number of distinct values.

    Returns
    -------
    pandas.DataFrame with ``[input_column, output_column]``, or the Axes of
    the scatter plot when ``plot`` is True.
    """
    deduplicated = dataset.drop_duplicates(subset=input_column)

    if output:
        print("{col} range is: {rng}    (output is {out})".format(col=input_column, rng=len(deduplicated), out=output_column))

    if not plot:
        return deduplicated[[input_column, output_column]]

    return deduplicated.plot.scatter(x=input_column, y=output_column, c='DarkBlue')

def _noise_by_unique_value(frame, column, values, noise_perc, scale=None):
    """Rebuild ``frame`` group by group, adding Gaussian noise to ``column``.

    For each entry of ``values`` the rows whose ``column`` equals that entry
    are copied and perturbed with zero-mean noise whose standard deviation is
    ``abs(entry * noise_perc)`` (or ``scale`` when given). Rows matching none
    of ``values`` are not carried over. Groups are concatenated in the order
    of ``values``.
    """
    groups = []
    for value in values:
        group = frame[frame[column] == value].copy()
        std = abs(value * noise_perc) if scale is None else scale
        group[column] = group[column] + np.random.normal(0, std, len(group))
        if len(group):
            groups.append(group)
    if not groups:
        return frame.iloc[0:0].copy()
    return pd.concat(groups)


def add_input_noise(dataset, noise_perc=0.01):
    """Return a copy of ``dataset`` with relative Gaussian noise on its inputs.

    ``b1``, ``a2`` and ``b2`` receive noise with a standard deviation of
    ``noise_perc`` times the (unperturbed) value; ``a1`` receives noise with a
    standard deviation of ``0.5 * noise_perc``. ``frac`` is passed through the
    same grouping step but with a standard deviation of 0, i.e. it is left
    unchanged. Afterwards ``area1``, ``area2``, ``TotalArea`` and
    ``RatioTotalArea`` are recomputed from the noisy inputs.

    Noise is drawn from NumPy's global RNG (``np.random``); seed it for
    reproducible output. The input frame is not modified and no global pandas
    option is changed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Needs the columns ``a1``, ``b1``, ``a2``, ``b2`` and ``frac``.
    noise_perc : float
        Relative noise level (0.01 == 1%).

    Returns
    -------
    pandas.DataFrame
        Noisy copy sorted by index.
    """
    print("ADDING {}% INPUT NOISE.".format(noise_perc*100))

    noisy = dataset.copy()

    # Each column is grouped by its distinct values in the *input* data so the
    # noise level scales with the value it is applied to.
    for column in ("b1", "a2", "b2"):
        distinct = dataset[column].drop_duplicates().tolist()
        noisy = _noise_by_unique_value(noisy, column, distinct, noise_perc)

    # a1 uses one noise level for all rows.
    # NOTE(review): 0.5 is presumably the nominal a1 value - confirm.
    noisy["a1"] = noisy["a1"] + np.random.normal(0, abs(0.5 * noise_perc), len(noisy))

    # frac is deliberately left noise-free (std forced to 0); the draw is kept
    # so the global RNG stream is consumed the same way as before.
    distinct_frac = dataset["frac"].drop_duplicates().tolist()
    noisy = _noise_by_unique_value(noisy, "frac", distinct_frac, noise_perc, scale=0)

    # Re-derive the area features from the noisy ellipse parameters.
    noisy["area1"] = noisy["a1"]*noisy["b1"]*np.pi
    noisy["area2"] = noisy["a2"]*noisy["b2"]*np.pi
    noisy["TotalArea"] = noisy["area1"] + noisy["area2"]
    noisy["RatioTotalArea"] = noisy["area1"]*(1-noisy["frac"]) + noisy["area2"]*(noisy["frac"])

    print("# samples with input noise:", len(noisy))
    print()
    return noisy.sort_index()

def add_output_noise(dataset, ds_no_noise, noise_perc=0.01, noise_frac=1.0):
    """Add sign-constrained relative noise to the ``"eta c"`` column.

    Called on DATAFRAME training data. For every row a noise magnitude
    ``|N(0, noise_perc)|`` is drawn from NumPy's global RNG. Its sign is the
    opposite of the sign of ``RatioTotalArea(dataset) -
    RatioTotalArea(ds_no_noise)`` for the same index label, and ``eta c`` is
    multiplied by ``1 + signed_noise``. Rows whose ``RatioTotalArea`` is
    unchanged therefore keep their ``eta c``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data with noisy inputs. Modified in place and also returned.
    ds_no_noise : pandas.DataFrame
        Reference data without input noise. It is matched to ``dataset`` by
        index label, so the two frames may be ordered differently.
    noise_perc : float
        Scale of the relative noise (0.01 == 1%).
    noise_frac : float
        Currently unused.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` itself.

    Raises
    ------
    ValueError
        If ``dataset`` contains index labels missing from ``ds_no_noise``.
    """
    print("ADDING {}% OUTPUT NOISE.".format(noise_perc*100))

    clean_rta = ds_no_noise["RatioTotalArea"]
    if not dataset.index.equals(ds_no_noise.index):
        if not dataset.index.isin(ds_no_noise.index).all():
            raise ValueError("ds_no_noise is missing rows that are present in dataset")
        # Put the reference values in the row order of `dataset`; a plain
        # subtraction would align on (and reorder by) the index, so the signs
        # would end up on the wrong rows.
        clean_rta = clean_rta.reindex(dataset.index)

    rta_shift = dataset["RatioTotalArea"].to_numpy(dtype=float) - clean_rta.to_numpy(dtype=float)
    direction = -np.sign(rta_shift)
    magnitude = np.abs(np.random.normal(0, noise_perc, len(dataset)))
    dataset["eta c"] = dataset["eta c"].to_numpy(dtype=float) * (1 + magnitude * direction)

    print("# samples with output noise:", len(dataset))
    print()
    return dataset

In [None]:
def scale_data(data, label_name, scale_label=False):
  """Return a copy of ``data`` with the input columns max-abs scaled.

  The columns ``b1``, ``a2``, ``b2``, ``RatioTotalArea`` and ``frac`` are each
  transformed by their own freshly fitted ``MaxAbsScaler``; ``label_name`` is
  scaled the same way only when ``scale_label`` is True. All other columns
  are copied unchanged.
  """
  columns_to_scale = ['b1', 'a2', 'b2', 'RatioTotalArea', 'frac']
  if scale_label:
    columns_to_scale.append(label_name)

  scaled_dataset = data.copy()
  for column in columns_to_scale:
    column_values = data[column].values.reshape(-1,1)
    scaled_dataset[column] = MaxAbsScaler().fit_transform(column_values)
  return scaled_dataset

In [None]:
def change_num_samples(dataset, ratio_to_keep=0.5, condition=0.15, seed=97, label_name='Nc'):
    """Thin out the samples whose label is at or below ``condition``.

    Rows with ``label > condition`` are always kept. The remaining rows are
    grouped into consecutive label bins and a ``ratio_to_keep`` share of each
    bin is sampled; the rest is returned separately.

    Bin layout (edges inclusive, a row is assigned to the first bin it fits):

    * ``label_name == 'eta c'``: bins ``[(i-5)/100, i/100]`` for
      ``i = 0, 5, ..., < int(100*condition) + 5``; the per-bin
      ``kept / leftover`` counts are printed.
    * ``label_name == 'Nc'``: bins ``[i, i+5]`` for
      ``i = 0, 5, ..., < int(condition) + 5``.
    * any other label name: no bins, so only the rows above ``condition`` are
      returned and the leftover frame is empty.

    Every row ends up in at most one of the two outputs: rows above
    ``condition`` are never re-sampled by a bin that reaches past it, and rows
    sitting exactly on a shared bin edge are not counted twice.

    Returns
    -------
    (df_data, df_leftover) : tuple of pandas.DataFrame
        Kept rows and removed rows, both with the original index labels.
    """
    if label_name == 'eta c':
        bins = [((i - 5) / 100, i / 100) for i in range(0, int(100 * condition) + 5, 5)]
        report_counts = True
    elif label_name == 'Nc':
        bins = [(i, i + 5) for i in range(0, int(condition) + 5, 5)]
        report_counts = False
    else:
        bins = []
        report_counts = False

    kept_parts = [dataset[dataset[label_name] > condition]]
    leftover_parts = []
    # Only rows at or below the threshold are eligible for thinning.
    pool = dataset[dataset[label_name] <= condition]

    for low, high in bins:
        in_bin = pool[(pool[label_name] >= low) & (pool[label_name] <= high)]
        # Remove the rows from the pool so a later bin cannot pick them again.
        pool = pool.drop(in_bin.index)

        sampled_df = in_bin.sample(frac=ratio_to_keep, random_state=seed)
        remain_df = in_bin.drop(sampled_df.index)

        kept_parts.append(sampled_df)
        leftover_parts.append(remain_df)

        if report_counts:
            print("{x} / {y}".format(x=len(sampled_df), y=len(remain_df)))

    # pd.concat replaces DataFrame.append (removed in pandas 2.0); empty
    # pieces are skipped so they cannot influence the resulting dtypes.
    kept_parts = [part for part in kept_parts if len(part)]
    leftover_parts = [part for part in leftover_parts if len(part)]
    df_data = pd.concat(kept_parts) if kept_parts else dataset.iloc[0:0]
    df_leftover = pd.concat(leftover_parts) if leftover_parts else dataset.iloc[0:0]

    return df_data, df_leftover

In [None]:
def _plot_label_sorted(axis, frame, label_name, title):
    """Scatter ``label_name`` of ``frame`` in ascending order with midpoint guides.

    A dashed vertical line marks the middle position and a dashed horizontal
    line marks the label value found just before that position.
    """
    ordered = frame.sort_values(label_name)
    count = len(ordered)
    axis.scatter(x=np.arange(count), y=ordered[label_name], marker='.', alpha=0.4)
    if count:
        middle_value = ordered[label_name].iloc[count // 2 - 1]
        # vertical line
        axis.plot([count/2, count/2], [0, ordered[label_name].max()], '--', alpha=0.4, color="red")
        # horizontal line
        axis.plot([0, count], [middle_value, middle_value], '--', alpha=0.4, color="red")
    axis.set_title(title)


def check_representation(train_dataset, validation_dataset, test_dataset, label_name):
    """Plot the label distribution of the train, validation and test splits.

    A 2x3 grid is drawn: the top row shows ``label_name`` with the rows
    ordered by index, the bottom row shows it ordered by label value together
    with dashed guides at the middle of each split. Comparing the three
    columns shows whether the splits cover the label range similarly.

    Parameters
    ----------
    train_dataset, validation_dataset, test_dataset : pandas.DataFrame
        The three splits; each must contain ``label_name``.
    label_name : str
        Column to plot.
    """
    splits = [
        ("Training", train_dataset),
        ("Validation", validation_dataset),
        ("Testing", test_dataset),
    ]
    # The bottom-row titles historically say "Test" rather than "Testing".
    label_sorted_titles = [
        "Label Sorted Training Data",
        "Label Sorted Validation Data",
        "Label Sorted Test Data",
    ]

    fig, ax = plt.subplots(ncols=3, nrows=2, figsize=(15,12))

    for column, (split_name, frame) in enumerate(splits):
        by_index = frame.sort_index()
        ax[0][column].scatter(x=np.arange(len(by_index)), y=by_index[label_name], marker='.', alpha=0.4)
        ax[0][column].set_title("Index Sorted {} Data".format(split_name))

        _plot_label_sorted(ax[1][column], frame, label_name, label_sorted_titles[column])

    ax[0][0].set_ylabel(label_name)
    ax[1][0].set_ylabel(label_name)

In [None]:
def check_label_range(dataset, name, nums=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4], label='eta c'):
    """Plot how the ``label`` values of ``dataset`` are distributed over bins.

    ``nums`` is a list of ascending bin edges. Two scatter plots are drawn
    side by side:

    * left: for every edge after the first, the percentage of rows with
      ``label <= edge``; points above 99% are highlighted in red.
    * right: for every pair of neighbouring edges, the percentage of rows
      with ``lower <= label < upper``; a bin is highlighted in red once the
      bins before it already sum to more than 99%.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the ``label`` column and at least one row.
    name : str
        Name of the data subset, used in the plot titles.
    nums : sequence of float
        Bin edges. The sequence is never modified.
    label : str
        Column to analyse.
    """
    # Work on a copy: popping from `nums` itself would shrink the shared
    # default list (and any list the caller passed) on every call.
    edges = list(nums)
    lowers, uppers = edges[:-1], edges[1:]

    values = dataset[label]
    total = len(dataset)

    less_than_percents = [(int((values <= hi).sum()) / total) * 100 for hi in uppers]
    in_range_percents = [
        (int(((values < hi) & (values >= lo)).sum()) / total) * 100
        for lo, hi in zip(lowers, uppers)
    ]
    x_ranges = ["{one}<{label}<{two}".format(one=lo, label=label, two=hi) for lo, hi in zip(lowers, uppers)]

    fig, ax = plt.subplots(ncols=2, figsize=(15,8))

    ax[0].scatter(uppers, less_than_percents)
    ax[0].set_ylabel("Percentage")
    ax[0].set_xlabel(label)
    ax[0].set_title("Percentage of {name} data with {label} less than x".format(name=name, label=label))
    saturated = [k for k, pct in enumerate(less_than_percents) if pct > 99.0]
    if saturated:
        # One scatter call -> a single legend entry instead of one per point.
        ax[0].scatter([uppers[k] for k in saturated],
                      [less_than_percents[k] for k in saturated],
                      c="red", label=">99%")
        ax[0].legend()

    ax[1].scatter(x_ranges, in_range_percents)
    ax[1].set_ylabel("Percentage")
    ax[1].set_xlabel(label)
    ax[1].set_title("Percentage of {name} data with {label} within range".format(name=name, label=label))
    covered = [k for k in range(len(in_range_percents)) if sum(in_range_percents[0:k]) > 99]
    if covered:
        ax[1].scatter([x_ranges[k] for k in covered],
                      [in_range_percents[k] for k in covered],
                      c="red", label="sum>99%")
        ax[1].legend()

    # The range labels are long; rotate them so they do not overlap.
    ax[1].tick_params(axis='x', labelrotation=90)

In [None]:
# Load the raw samples and work on a copy; the leading "Unnamed: 0" column of
# the CSV is discarded.
uploaded = pd.read_csv("AdditionalStuff.csv")

dataset = uploaded.copy()
dataset.pop("Unnamed: 0")

label_name = 'Nc'
in_features = 5
out_nodes = 1

# split the data (80% train / 10% validation / 10% test)
train_dataset, validation_dataset, test_dataset = split_data_with_vali(dataset.copy(), seed_val, train_ratio=0.8, vali_ratio=0.1, shuffle=False, highest_at_top=False, no_outliers=False)
print("# of samples :", len(dataset))
print("# of starting train samples:", len(train_dataset))
print("# of starting vali samples:", len(validation_dataset))
print("# of starting test samples:", len(test_dataset))
print()

# If noise to be added to data.
# Currently adds NO NOISE because this was not seen as useful.
# (noise_perc=0.0 for inputs and outputs, and frac=0.0 keeps none of the
# noisy rows, so all_train holds exactly the noise-free training rows.)
train_dataset_no_noise = train_dataset.copy()
in_noisey_train = add_input_noise(train_dataset, noise_perc=0.0)
all_noisey_train = add_output_noise(in_noisey_train, train_dataset_no_noise, noise_perc=0.0, noise_frac=1.0)
# append the data without noise
all_noisey_train = all_noisey_train.sample(frac=0.0, random_state=9)
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
all_train = pd.concat([train_dataset_no_noise, all_noisey_train])

print("# of Training Samples: ", len(train_dataset))
print("# of Test Samples: ", len(test_dataset))

# select which data to use: the five input features plus the label
train_dataset = train_dataset[["b1", "a2", "b2", "RatioTotalArea", "frac", label_name]]
validation_dataset = validation_dataset[["b1", "a2", "b2", "RatioTotalArea", "frac", label_name]]
test_dataset = test_dataset[["b1", "a2", "b2", "RatioTotalArea", "frac", label_name]]

# sort the data to check representation
sorted_train = train_dataset.sort_index()
sorted_vali = validation_dataset.sort_index()
sorted_test = test_dataset.sort_index()

check_representation(train_dataset, validation_dataset, test_dataset, label_name)

In [None]:
# Plot the share of training samples at or below each Nc edge and within each Nc bin.
check_label_range(train_dataset.copy(), name='Train', nums=[1,5,10,15,20,25,50,75,100,150,200,250,300,350], label='Nc')

In [None]:
# Same Nc distribution plot for the validation split, using identical bin edges.
check_label_range(validation_dataset.copy(), name='Validation', nums=[1,5,10,15,20,25,50,75,100,150,200,250,300,350], label='Nc')

In [None]:
# Same Nc distribution plot for the test split, using identical bin edges.
check_label_range(test_dataset.copy(), name='Test', nums=[1,5,10,15,20,25,50,75,100,150,200,250,300,350], label='Nc')

In [None]:
# Persist the three splits (five input columns plus the label). The DataFrame
# index is written too (to_csv default), so the row labels carried through the
# split end up in the first, unnamed CSV column.
train_dataset.to_csv("Train.csv")
validation_dataset.to_csv("Vali.csv")
test_dataset.to_csv("Test.csv")