In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set()

In [None]:
def load_to_dataframe(loc_name):
  """ Read a space-separated text file of samples into a DataFrame.

      The fields are labelled positionally with the 19 names below (no
      header row is expected) and the six placeholder fields 'a'-'f' are
      discarded.

      Args:
        loc_name: location of the text file, passed straight to pd.read_csv.

      Returns:
        DataFrame with columns ['r1', '2a2', 'r2', 'frac', 'eta c',
        'Nc1', 'StdDev Nc1', 'Nc2', 'StdDev Nc2', 'Nc3', 'StdDev Nc3',
        'Nc4', 'StdDev Nc4'].
  """
  placeholder_cols = ['a', 'b', 'c', 'd', 'e', 'f']
  column_labels = [
      'a', 'b', 'r1', '2a2', 'r2', 'frac', 'c', 'd', 'e', 'eta c',
      'Nc1', 'StdDev Nc1', 'Nc2', 'StdDev Nc2',
      'Nc3', 'StdDev Nc3', 'Nc4', 'StdDev Nc4', 'f',
  ]

  frame = pd.read_csv(loc_name, sep=" ", names=column_labels)
  # One drop call instead of removing the placeholders one at a time.
  return frame.drop(columns=placeholder_cols)

def remove_duplicates(data):
  """ Drop samples that repeat an earlier (r1, 2a2, r2, frac) combination.

      The first occurrence of each combination is kept together with all of
      its other columns; later repeats are discarded. The sample counts
      before and after are printed. `data` itself is left unmodified.

      Args:
        data: DataFrame containing at least the columns
              'r1', '2a2', 'r2' and 'frac'.

      Returns:
        A new DataFrame without the repeated samples, indexed 0..n-1.
  """
  key_columns = ['r1', '2a2', 'r2', 'frac']
  print("# of samples:", len(data))

  # De-duplicate row-wise on the key columns. Dropping rows by index label
  # instead would silently keep duplicates whenever `data` carries repeated
  # index labels (e.g. after concatenating frames without re-indexing).
  data_wo_duplicates = data.drop_duplicates(subset=key_columns)
  print("# of samples w/o duplicates:", len(data_wo_duplicates))

  return data_wo_duplicates.reset_index(drop=True)

def select_Nc(sample_data):
  """ Keep, for every sample, the Nc estimate with the lowest std. deviation.

      Duplicated samples are removed first (see remove_duplicates). For each
      remaining row the pair (NcK, StdDev NcK), K = 1..4, with the smallest
      standard deviation is kept as ('Nc', 'Nc Std. Dev'); ties go to the
      lowest K. A NaN standard deviation never wins against a numeric one;
      if all four are NaN the Nc1 pair is used.

      Args:
        sample_data: DataFrame with columns 'r1', '2a2', 'r2', 'frac',
                     'eta c' and 'Nc1'..'Nc4', 'StdDev Nc1'..'StdDev Nc4'.

      Returns:
        DataFrame indexed 0..n-1 with headings
        ['r1', '2a2', 'r2', 'frac', 'Nc', 'Nc Std. Dev', 'eta c'],
        followed by any additional columns present in `sample_data`.
        'Nc' and 'Nc Std. Dev' are float columns.
  """
  nc_cols = ['Nc1', 'Nc2', 'Nc3', 'Nc4']
  std_cols = ['StdDev Nc1', 'StdDev Nc2', 'StdDev Nc3', 'StdDev Nc4']
  out_cols = ['r1', '2a2', 'r2', 'frac', 'Nc', 'Nc Std. Dev', 'eta c']

  data = remove_duplicates(sample_data)

  nc_vals = data[nc_cols].to_numpy(dtype=float)
  std_vals = data[std_cols].to_numpy(dtype=float)

  # argmin returns the first minimum, which reproduces the Nc1 -> Nc4
  # priority on ties. NaNs are ranked as +inf so they cannot be selected
  # ahead of a real value.
  ranked = np.where(np.isnan(std_vals), np.inf, std_vals)
  best = ranked.argmin(axis=1)
  row_pos = np.arange(len(data))

  # Build the result in one vectorised step; DataFrame.append (used for
  # row-by-row growth before) no longer exists in pandas >= 2.0.
  selected = data.drop(columns=nc_cols + std_cols)
  selected['Nc'] = nc_vals[row_pos, best]
  selected['Nc Std. Dev'] = std_vals[row_pos, best]

  extra_cols = [col for col in selected.columns if col not in out_cols]
  return selected[out_cols + extra_cols]

def augment_data(dataset):
  """ Add derived geometry columns to `dataset` and return it.

      Columns added, in this order:
      [a1, b1, a2, b2, area1, area2, TotalArea, RatioTotalArea], where
        a1 = 1/2,           b1 = a1 / r1,
        a2 = 2a2 / 2,       b2 = a2 / r2,
        areaK = aK * bK * pi,
        TotalArea = area1 + area2,
        RatioTotalArea = area1 * (1 - frac) + area2 * frac.

      The columns are written into the frame that is passed in (it is
      modified in place); the same object is returned.
  """
  dataset["a1"] = 0.5
  dataset["b1"] = dataset["a1"].div(dataset["r1"])
  dataset["a2"] = dataset["2a2"].div(2)
  dataset["b2"] = dataset["a2"].div(dataset["r2"])

  # Same a * b * pi expression for both shapes.
  for area_col, a_col, b_col in (("area1", "a1", "b1"), ("area2", "a2", "b2")):
    dataset[area_col] = dataset[a_col].mul(dataset[b_col]).mul(np.pi)

  weight = dataset["frac"]
  dataset["TotalArea"] = dataset["area1"].add(dataset["area2"])
  dataset["RatioTotalArea"] = dataset["area1"].mul(1 - weight).add(dataset["area2"].mul(weight))
  return dataset

def augment_data2(dataset):
  """ Variant of augment_data with a different RatioTotalArea definition.

      Adds, in this order,
      [a1, b1, a2, b2, area1, area2, TotalArea, RatioTotalArea] to
      `dataset` in place and returns the same object. All columns except
      the last are computed as a1 = 1/2, b1 = a1/r1, a2 = 2a2/2,
      b2 = a2/r2, areaK = aK*bK*pi, TotalArea = area1 + area2. Here

        RatioTotalArea = (1/r1) * (1 - frac) + frac * (a2 * a2 / r2)

      NOTE(review): unlike the second term, the first term carries no
      a1*a1 factor, and neither term is multiplied by pi, so this is not a
      rescaled version of augment_data's RatioTotalArea - confirm that
      this is the intended definition.
  """
  r_1, r_2, frac = dataset["r1"], dataset["r2"], dataset["frac"]

  dataset["a1"] = 1 / 2
  a_1 = dataset["a1"]
  b_1 = a_1 / r_1
  dataset["b1"] = b_1

  a_2 = dataset["2a2"] / 2
  dataset["a2"] = a_2
  b_2 = a_2 / r_2
  dataset["b2"] = b_2

  area_1 = a_1 * b_1 * np.pi
  area_2 = a_2 * b_2 * np.pi
  dataset["area1"] = area_1
  dataset["area2"] = area_2
  dataset["TotalArea"] = area_1 + area_2

  dataset["RatioTotalArea"] = (1 / r_1) * (1 - frac) + frac * (a_2 * a_2 / r_2)
  return dataset

def save_all(data1, data2, loc):
  """ Stack two sample tables, save the result as CSV and return it.

      Args:
        data1: DataFrame of the existing ("old") samples.
        data2: DataFrame of the additional ("new") samples.
        loc:   destination of the CSV file, passed to DataFrame.to_csv.

      Returns:
        The combined DataFrame (rows of data1 followed by rows of data2)
        with a fresh 0..n-1 index. Neither input is modified.
  """
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  # ignore_index=True renumbers the rows directly, so no temporary 'index'
  # column has to be created and removed again.
  all_data = pd.concat([data1, data2], ignore_index=True)

  # The row index is written as the first (unnamed) CSV column, as before.
  all_data.to_csv(loc)
  print("# Old Data Samples: ", len(data1))
  print("# New Data Samples: ", len(data2))
  print("# ALL Data Samples: ", len(all_data))
  print("Saved!")
  return all_data

In [None]:
# Existing samples. "Unnamed: 0" is presumably the row index written by an
# earlier to_csv call; it is dropped from the working copy only, so
# `uploaded` keeps the file contents as read.
uploaded = pd.read_csv("alldata.csv")
dataset = uploaded.drop(columns="Unnamed: 0")

# Additional samples in the raw space-separated text format.
new_rawdata = load_to_dataframe("AdditionalStuff.txt")

Remove duplicates from the original data.

In [None]:
# Does the original data contain duplicates? If so, remove them.
# A copy of the loaded data is kept as `original_data`; `dataset` is rebound
# to the version without repeated (r1, 2a2, r2, frac) samples.
original_data = dataset.copy()
dataset = remove_duplicates(original_data)

Check for rows containing NaN values in the original and new data, and drop them.

In [None]:
# Each pair of prints shows the total row count before and after dropna(),
# i.e. the difference is the number of rows that contained a NaN.
print("Original data with NANs: ", len(dataset))
dataset = dataset.dropna()
print("Original data w/o NANs : ", len(dataset))

# Same check for the newly loaded raw samples.
print("New data with NANs: ", len(new_rawdata))
new_rawdata = new_rawdata.dropna()
print("New data w/o NANs : ", len(new_rawdata))

In [None]:
# De-duplicate the new samples and keep, per sample, the Nc estimate with
# the lowest standard deviation.
new_data = select_Nc(new_rawdata)

In [None]:
# Inspect the last rows of the selected-Nc table.
new_data.tail()

In [None]:
# Share of the new samples whose selected Nc is at least 30.
# NOTE(review): the label starts with "# samples" but the value printed is a
# percentage of len(new_data); this divides by zero if new_data is empty.
greater_than_30 = new_data[new_data["Nc"] >= 30]
print("# samples with Nc >= 30: {}%".format(len(greater_than_30)*100/len(new_data)))

In [None]:
# Add the derived geometry columns. augment_data writes them into the frame
# it receives and returns that same frame, so `augmented_new_data` and
# `new_data` refer to one object after this cell.
augmented_new_data = augment_data(new_data)

In [None]:
# Inspect the first rows, including the newly added columns.
augmented_new_data.head()

In [None]:
# Number of new samples before the RatioTotalArea restriction below.
len(augmented_new_data)

Restrict the new samples by RatioTotalArea (RTA) in case stray samples were simulated. A check can also be performed by extending the dataset of new samples after generating the unique values and then limiting by RTA. Finally, save only the columns needed by the Monte Carlo simulation.

In [None]:
# Upper bound on RatioTotalArea for a new sample to be kept.
# (This has to be an assignment: the bare expression `x - 0.013` never
# defines x and raises NameError on a fresh kernel.)
x = 0.013
augmented_new_data_RTA_restricted = augmented_new_data[augmented_new_data["RatioTotalArea"]<=x]

In [None]:
# Number of new samples that passed the RatioTotalArea restriction.
len(augmented_new_data_RTA_restricted)

In [None]:
# Largest RatioTotalArea among the kept samples; it should not exceed the
# threshold used in the filter.
augmented_new_data_RTA_restricted["RatioTotalArea"].max()

In [None]:
# Stack the original samples and the restricted new samples, write them to
# "AdditionalStuff.csv", and keep the combined frame as `new_stuff`.
new_stuff = save_all(dataset, augmented_new_data_RTA_restricted, "AdditionalStuff.csv")

In [None]:
# Counts of samples with Nc >= 30.
# NOTE(review): `new_stuff` is the frame returned by save_all, i.e. original
# and new samples combined, so the second count covers all samples even
# though its label says "new samples" - confirm which count is intended.
print("# original samples with Nc >= 30 :", len(dataset[dataset["Nc"] >=30]))
print("# new samples with Nc >= 30      :", len(new_stuff[new_stuff["Nc"] >=30]))

In [None]:
# Inspect the last rows of the combined table.
new_stuff.tail()

In [None]:
# Count plots of how often each distinct parameter value occurs in the
# combined data. Each column is sorted numerically first and then cast to
# str, so seaborn treats the values as ordered categories (one bar per
# distinct value) instead of binning them.
temp = new_stuff.copy()

fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(20,35))

# One entry per panel: (column, bar colour, target axes, extra histplot
# keyword arguments). The bottom-right axes (axs[2][1]) is left empty.
panels = [
    ("r1",   'red',  axs[0][0], {"label": "r1"}),
    ("r2",   'blue', axs[0][1], {}),
    ("frac", 'red',  axs[1][0], {}),
    ("2a2",  'blue', axs[1][1], {}),
    ("b1",   'red',  axs[2][0], {}),
]

for column, colour, axis, extra in panels:
  catagorical_dataset = temp.sort_values(column).copy()
  catagorical_dataset[column] = catagorical_dataset[column].astype(str)
  sns.histplot(catagorical_dataset, x=column, shrink=.8, color=colour, ax=axis, **extra)
