# Input =  {r1, r2, b1, a2, RatioTotalArea, frac}
Here we examine how each of these inputs changes eta c. RatioTotalArea will always be an output because it is dependent on the others.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
sns.set()

In [None]:
def import_data(name='HybridEllipsePercolation.txt', sep1=" ", header1=None, shuffle=True, random_state=None):
    """Load the percolation results table and label its columns.

    Parameters
    ----------
    name : str
        Path of the delimited text file to read.
    sep1 : str
        Column delimiter, forwarded to ``pandas.read_csv`` as ``sep``.
    header1 : int or None
        Header row, forwarded to ``pandas.read_csv`` as ``header``.
    shuffle : bool
        When true, the rows are returned in random order with a fresh
        0..n-1 index.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` when shuffling.  The default
        ``None`` keeps the previous (unseeded) behaviour; pass an integer to
        make the shuffled order reproducible between runs.

    Returns
    -------
    pandas.DataFrame
        The table with columns ``r1``, ``2a2``, ``r2``, ``frac``, ``Nc``,
        ``Nc Std. Dev`` and ``eta c``.

    Raises
    ------
    ValueError
        Raised by pandas when the file does not contain exactly seven columns.
    """
    data = pd.read_csv(name, sep=sep1, header=header1)
    data.columns = ["r1", "2a2", "r2", "frac", "Nc", "Nc Std. Dev", "eta c"]

    if shuffle:
        # frac=1 samples every row once, i.e. a full permutation of the table.
        data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)
    return data

def single_input_vs_output(dataset, input_column, output_column="eta c", plot=False, output=False):
    """Reduce ``dataset`` to one row per distinct value of ``input_column``.

    Only the first row seen for each distinct value is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing both ``input_column`` and ``output_column``.
    input_column : str
        Column whose distinct values are of interest.
    output_column : str
        Column paired with ``input_column`` in the result or the plot.
    plot : bool
        When true, draw a scatter plot of the reduced rows and return its axes
        instead of a DataFrame.
    output : bool
        When true, print how many distinct values ``input_column`` has.

    Returns
    -------
    pandas.DataFrame or matplotlib axes
        The two selected columns of the reduced table, or the scatter-plot
        axes when ``plot`` is true.
    """
    unique_rows = dataset.drop_duplicates(subset=input_column)

    if output:
        print(f"{input_column} range is: {len(unique_rows)}    (output is {output_column})")

    if not plot:
        return unique_rows[[input_column, output_column]]

    return unique_rows.plot.scatter(x=input_column, y=output_column, c='DarkBlue')

def split_data(dataset):
    """Split ``dataset`` into train, test and validation parts.

    60 % of the rows are sampled for training; the remaining rows are divided
    in half between test and validation.  Both samples use a fixed seed
    (``random_state=0``), so the split is the same on every call.  Rows are
    removed by index label.

    Returns
    -------
    tuple
        ``(train, test, validation)`` -- note that test comes before
        validation.
    """
    train_part = dataset.sample(frac=0.6, random_state=0)
    held_out = dataset.drop(index=train_part.index)

    test_part = held_out.sample(frac=0.5, random_state=0)
    validation_part = held_out.drop(index=test_part.index)

    return train_part, test_part, validation_part

def split_features_labels(data, label_column='eta c'):
    """Separate the label column from the feature columns.

    The split is done on a copy, so the caller's DataFrame keeps its label
    column and the function can be called repeatedly on the same frame.
    (Previously ``features`` was the very same object as ``data``, so the
    ``pop`` silently removed the column from the caller's table.)

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding both the features and the label column.
    label_column : str
        Name of the column to extract as labels.

    Returns
    -------
    tuple
        ``(features, labels)``: a DataFrame without ``label_column`` and the
        Series that was removed from it.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``data``.
    """
    features = data.copy()
    labels = features.pop(label_column)
    return features, labels

In [None]:
# Load the raw table in file order and work on a copy so `rawdata` stays intact.
rawdata = import_data(shuffle=False)
dataset = rawdata.copy()

# remove irrelevant columns (currently disabled)
# dataset.pop("Nc Std. Dev")

# Report the number of missing values per column.  A bare expression in the
# middle of a cell is evaluated but never displayed, so print it explicitly.
print(dataset.isna().sum())
# drop rows with missing values
dataset = dataset.dropna()

# Ellipse 1: a1 is fixed at 1/2 and b1 follows from the ratio r1 = a1 / b1.
dataset["a1"] = 1 / 2
dataset["b1"] = dataset["a1"] / dataset["r1"]

# Ellipse 2: the file stores 2*a2, so halve it; b2 follows from r2 = a2 / b2.
dataset["a2"] = dataset["2a2"] / 2
dataset["b2"] = dataset["a2"] / dataset["r2"]

# Ellipse areas (pi * a * b).
dataset["area1"] = dataset["a1"] * dataset["b1"] * np.pi
dataset["area2"] = dataset["a2"] * dataset["b2"] * np.pi

# Plain sum of both areas, and the frac-weighted mix of the two areas.
dataset["TotalArea"] = dataset["area1"] + dataset["area2"]
dataset["RatioTotalArea"] = dataset["area1"] * (1 - dataset["frac"]) + dataset["area2"] * dataset["frac"]
dataset.head()

In [None]:
# One reduced table (first row per distinct value, paired with eta c) for each
# input column; the tuple of names matches the tuple of column labels below.
(
    r1_range,
    r2_range,
    frac_range,
    a1_range,
    b1_range,
    a2_range,
    b2_range,
    _2a2_range,
) = (
    single_input_vs_output(dataset, column)
    for column in ("r1", "r2", "frac", "a1", "b1", "a2", "b2", "2a2")
)

In [None]:
# Only b1 and a2 are sorted before printing; r1 and r2 keep their file order.
b1_range.sort_values('b1', inplace=True)
a2_range.sort_values('a2', inplace=True)
# b2_range.sort_values('b2', inplace=True)

# Print the count and the list of distinct values for each input of interest.
# ("b2", b2_range) is left out here, as it was disabled in the original cell.
for column, frame in (
    ("b1", b1_range),
    ("a2", a2_range),
    ("r1", r1_range),
    ("r2", r2_range),
):
    distinct_values = frame[column]
    print(f"{column} number of vals {len(distinct_values)}")
    print(f"{column} range {list(distinct_values)}")

## Checking eta c vs a2 and b1
Here we will look at how this affects the other variables and their relationship with eta c.

In [None]:
# a2_vals = (a2_range['a2'].to_numpy())
# # a2_val = a2_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with a2" 
# for a2_val in a2_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["a2"]==a2_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.2, label="a2={v}, {l}".format(v=a2_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_a2.png")

In [None]:
# b1_range.sort_values('b1', inplace=True)
# b1_vals = (b1_range['b1'].to_numpy())[0:20:5]
# # a2_val = a2_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with b1" 
# for b1_val in b1_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["b1"]==b1_val]
#     # select_2 = select_1[select_1['eta c']<0.4]
#     b1_str = "{:.2e}".format(b1_val)
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="b1={v}, {l}".format(v=b1_str,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_b1.png")

You can see from the data that eta c increases with a2. For b1, this is also clear. Both a2 and b1 have well-distributed values as well. a2 and b1 have more values at the lower end because smaller steps are taken in that specific range, even if the number of samples is fairly similar. This raises the question: should we leave some out, average them, select from each range, randomly select from each range, or keep the data as is?

In [None]:
# fig, axs = plt.subplots(ncols=2, figsize=(12,6))
# sns.distplot(dataset["a2"], color='red', bins=1000, ax=axs[0])
# sns.distplot(dataset["b1"], color='red', bins=1000, ax=axs[1])
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/a2_b1_distr.png")

## Checking eta c vs r1 and r2

In [None]:
# r1_vals = (r1_range['r1'].to_numpy())[::-5]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with r1" 
# for r1_val in r1_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["r1"]==r1_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="r1={v}, {l}".format(v=r1_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_r1.png")

In [None]:
# r2_vals = (r2_range['r2'].to_numpy())[1::5]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with r2" 
# for r2_val in r2_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["r2"]==r2_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="r2={v}, {l}".format(v=r2_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_r2.png")

## Checking eta c vs frac

In [None]:
# frac_range.sort_values('frac', inplace=True)
# frac_vals = (frac_range['frac'].to_numpy())[2:4]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with frac" 
# for frac_val in frac_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="frac={v}, {l}".format(v=frac_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_frac.png")

In [None]:
# frac_range.sort_values('frac', inplace=True)
# frac_vals = (frac_range['frac'].to_numpy())
# selection_dataset = dataset.copy()

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# title = "How eta c varies with frac" 
# for i in range(len(frac_range)):
#     frac_val = frac_vals[i]
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     ax[int(i/6)][int(i/2)-(int(i/6)*3)].scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.4, label="frac={v}, {l}".format(v=frac_val,l=len(select_1)), )
#     # plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_frac.png")

You can see that with increasing fraction, the eta c values become more spread. This makes sense, because this means the ratio of small:big ellipses increases. In this experiment, the small ellipse dimensions varied with both the major and minor axis, whereas the large ellipse varied with just one axis. As the output becomes more dependent on the 'small ellipse', the variation increases because, loosely, there are more variables to consider with many more different values. {b1} had just {20} possible values, while {a2,b2} had {13, 134}.