# Input =  {r1, r2, b1, a2, RatioTotalArea, frac}
Here we will try to understand how these inputs change eta c. RatioTotalArea will always be an output because it is dependent on the others.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import seaborn as sns
sns.set()

In [None]:
def import_data(name='HybridEllipsePercolation.txt', sep1=" ", header1=None, shuffle=True):
    """Load the delimited results file into a DataFrame with named columns.

    Parameters
    ----------
    name : str
        Path of the file handed to ``pd.read_csv``.
    sep1 : str
        Field separator forwarded to ``pd.read_csv`` as ``sep``.
    header1 : int or None
        Forwarded to ``pd.read_csv`` as ``header``.
    shuffle : bool
        When true, the rows are returned in a random order with a fresh
        0..n-1 index; no seed is set, so the order differs between calls.

    Returns
    -------
    pandas.DataFrame
        The file contents with the seven columns renamed to
        r1, 2a2, r2, frac, Nc, Nc Std. Dev and eta c.
    """
    column_names = ["r1", "2a2", "r2", "frac", "Nc", "Nc Std. Dev", "eta c"]

    frame = pd.read_csv(name, sep=sep1, header=header1)
    frame.columns = column_names

    if not shuffle:
        return frame

    # Sampling the full fraction is a row permutation; the old index is dropped.
    return frame.sample(frac=1).reset_index(drop=True)

def single_input_vs_output(dataset, input_column, output_column="eta c", plot=False, output=False):
    """Pair each distinct value of one input column with an output value.

    Rows are de-duplicated on ``input_column``, so only the first row seen
    for every distinct input value is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing both columns.
    input_column : str
        Column whose distinct values are kept.
    output_column : str
        Column reported alongside the input column.
    plot : bool
        When true, draw a scatter plot of the de-duplicated rows and return
        the plot object instead of a DataFrame.
    output : bool
        When true, print the number of distinct input values.

    Returns
    -------
    pandas.DataFrame or matplotlib Axes
        The two selected columns of the de-duplicated rows, or the scatter
        plot when ``plot`` is true.
    """
    unique_rows = dataset.drop_duplicates(subset=input_column)

    if output:
        message = "{col} range is: {rng}    (output is {out})"
        print(message.format(col=input_column, rng=len(unique_rows), out=output_column))

    if not plot:
        return unique_rows[[input_column, output_column]]

    return unique_rows.plot.scatter(x=input_column, y=output_column, c='DarkBlue')

def split_data(dataset):
    """Split a DataFrame into train, test and validation parts.

    60% of the rows are sampled for training; the remaining rows are halved
    into a test part (sampled) and a validation part (what is left). Both
    samples use ``random_state=0``, so the split is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, validation)`` -- note that test comes before
        validation.
    """
    train_part = dataset.sample(frac=0.6, random_state=0)

    # Everything not chosen for training is shared between test and validation.
    held_out = dataset.drop(train_part.index)
    test_part = held_out.sample(frac=0.5, random_state=0)
    validation_part = held_out.drop(test_part.index)

    return train_part, test_part, validation_part

def split_features_labels(data, label_column='eta c'):
    """Separate a table into its feature columns and its label column.

    The label column is popped from a copy of ``data``, so the DataFrame
    passed in by the caller keeps all of its columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the features and the label column.
    label_column : str
        Name of the column to split off as labels.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is a DataFrame without
        ``label_column`` and ``labels`` is that column as a Series.

    Raises
    ------
    KeyError
        If ``label_column`` is not a column of ``data``.
    """
    # Work on a copy: DataFrame.pop removes the column in place.
    features = data.copy()
    labels = features.pop(label_column)
    return features, labels

In [None]:
# Load the file in its original row order and keep `rawdata` untouched by
# working on a copy.
rawdata = import_data(shuffle=False)
dataset = rawdata.copy()

# remove irrelevant columns
# dataset.pop("Nc Std. Dev")

# Per-column count of missing values (not displayed, because it is not the
# last expression of the cell), followed by removal of incomplete rows.
dataset.isna().sum()
dataset = dataset.dropna()

# Semi-axes: a1 is fixed at 1/2, a2 is half of the "2a2" column, and each b
# is the matching a divided by the matching r.
dataset["a1"] = 0.5
dataset["b1"] = dataset["a1"].div(dataset["r1"])
dataset["a2"] = dataset["2a2"].div(2)
dataset["b2"] = dataset["a2"].div(dataset["r2"])

# Ellipse areas: a * b * pi.
dataset["area1"] = dataset["a1"].mul(dataset["b1"]).mul(np.pi)
dataset["area2"] = dataset["a2"].mul(dataset["b2"]).mul(np.pi)

# Plain sum of the two areas, and the frac-weighted combination
# area1 * (1 - frac) + area2 * frac.
dataset["TotalArea"] = dataset["area1"].add(dataset["area2"])
dataset["RatioTotalArea"] = (
    dataset["area1"].mul(1 - dataset["frac"])
    .add(dataset["area2"].mul(dataset["frac"]))
)
dataset.head()

In [None]:
# One de-duplicated (input, eta c) table per input column; the names on the
# left line up one-to-one with the column names in the tuple below.
(
    r1_range,
    r2_range,
    frac_range,
    a1_range,
    b1_range,
    a2_range,
    b2_range,
    _2a2_range,
) = (
    single_input_vs_output(dataset, column_name)
    for column_name in ("r1", "r2", "frac", "a1", "b1", "a2", "b2", "2a2")
)

In [None]:
# Sort by rebinding instead of using inplace=True: these frames are column
# selections taken from a de-duplicated frame, and sorting such a selection
# in place can raise pandas' SettingWithCopyWarning.
b1_range = b1_range.sort_values('b1')
a2_range = a2_range.sort_values('a2')
# b2_range.sort_values('b2', inplace=True)

# Number of distinct values found for each semi-axis of the first ellipse.
print("b1 number of vals {}".format(len(b1_range["b1"])))
# print("b1 range {}".format(list(b1_range["b1"])))
print("a1 number of vals {}".format(len(a1_range["a1"])))
# print("a1 range {}".format(list(b2_range["a1"])))

print()

# Number of distinct values found for each semi-axis of the second ellipse.
print("a2 number of vals {}".format(len(a2_range["a2"])))
# print("a2 range {}".format(list(a2_range["a2"])))
print("b2 number of vals {}".format(len(b2_range["b2"])))
# print("b2 range {}".format(list(b2_range["b2"])))

print()

# For r1 and r2, print the distinct values themselves as well as their count.
print("r1 number of vals {}".format(len(r1_range["r1"])))
print("r1 range {}".format(list(r1_range["r1"])))

print("r2 number of vals {}".format(len(r2_range["r2"])))
print("r2 range {}".format(list(r2_range["r2"])))

## Checking eta c vs a2 and b1
Here we will look at how this affects the other variables and their relationship with eta c.

In [None]:
# a2_vals = (a2_range['a2'].to_numpy())
# # a2_val = a2_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with a2" 
# for a2_val in a2_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["a2"]==a2_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.2, label="a2={v}, {l}".format(v=a2_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_a2.png")

In [None]:
# b1_range.sort_values('b1', inplace=True)
# b1_vals = (b1_range['b1'].to_numpy())[0:20:5]
# # a2_val = a2_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with b1" 
# for b1_val in b1_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["b1"]==b1_val]
#     # select_2 = select_1[select_1['eta c']<0.4]
#     b1_str = "{:.2e}".format(b1_val)
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="b1={v}, {l}".format(v=b1_str,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_b1.png")

![eta_c_vs_a2](exp_dataset/eta_c_vs_a2.png)
![eta_c_vs_b1](exp_dataset/eta_c_vs_b1.png)


You can see from the data that eta c increases with a2. For b1, this is also clear. Both a2 and b1 have well-distributed values as well. a2 and b1 have more values at the lower end because smaller steps are taken in that specific range, even if the number of samples is fairly similar. This raises the question: should we leave some out, average them, select from each range, randomly select from each range, or keep the data as is?

In [None]:
# fig, axs = plt.subplots(ncols=2, figsize=(12,6))
# sns.distplot(dataset["a2"], color='red', bins=1000, ax=axs[0])
# sns.distplot(dataset["b1"], color='red', bins=1000, ax=axs[1])
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/a2_b1_distr.png")

![a2_b1_distr](exp_dataset/a2_b1_distr.png)

## Checking eta c vs r1 and r2

In [None]:
# r1_vals = (r1_range['r1'].to_numpy())[::-5]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with r1" 
# for r1_val in r1_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["r1"]==r1_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="r1={v}, {l}".format(v=r1_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_r1.png")

In [None]:
# r2_vals = (r2_range['r2'].to_numpy())[1::5]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with r2" 
# for r2_val in r2_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["r2"]==r2_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="r2={v}, {l}".format(v=r2_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_r2.png")

![eta_c_vs_r1](exp_dataset/eta_c_vs_r1.png)
![eta_c_vs_r2](exp_dataset/eta_c_vs_r2.png)

## Checking eta c vs frac

In [None]:
# frac_range.sort_values('frac', inplace=True)
# frac_vals = (frac_range['frac'].to_numpy())[2:4]
# # r1_val = r1_vals[0]

# fig = plt.figure(figsize=(10,10))
# yaxis_label = "eta c"
# title = "How eta c varies with frac" 
# for frac_val in frac_vals:
#     selection_dataset = dataset.copy()
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     plt.scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.9, label="frac={v}, {l}".format(v=frac_val,l=len(select_1)))
#     plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_frac.png")

In [None]:
# frac_range.sort_values('frac', inplace=True)
# frac_vals = (frac_range['frac'].to_numpy())
# selection_dataset = dataset.copy()

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# title = "How eta c varies with frac" 
# for i in range(len(frac_range)):
#     frac_val = frac_vals[i]
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     ax[int(i/6)][int(i/2)-(int(i/6)*3)].scatter(x=np.arange(0,len(select_1)), y=select_1['eta c'], marker='.', alpha=0.4, label="frac={v}, {l}".format(v=frac_val,l=len(select_1)), )
#     # plt.legend()
# plt.ylabel(yaxis_label)
# plt.title(title)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/eta_c_vs_frac.png")

![eta_c_vs_frac](exp_dataset/eta_c_vs_frac.png)

You can see that with increasing fraction, the eta c values become more spread out. This makes sense, because it means the ratio of small:big ellipses increases. In this experiment, the small ellipse dimensions varied along both the major and minor axes, whereas the large ellipse varied along just one axis. As the output becomes more dependent on the 'small ellipse', the variation increases because, loosely, there are more variables to consider, with many more different values. {a1} had just {20} possible values, while {a2,b2} had {13, 134}.

## Check RatioTotalArea and frac vs Eta C

![RTA_diff_frac](exp_dataset/RTA_diff_frac.png)

You can tell that for small fractions, the input-output relationship (RatioTotalArea vs. eta c) looks pretty logarithmic. At higher fracs, however, this falls away. Maybe we can try to figure out why by taking frac=0.9 and instead varying r1 or r2.

We can vary it for r=[1,5,10,100,200,500,1000]

In [None]:
# selection_dataset = dataset.copy()
# r1_vals = [1,10,100,200,500,1000]
# frac_val = 0.9

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# for i in range(len(r1_vals)):
#     r1_val = r1_vals[i]
#     title = "Eta c vs RTA for r1={}".format(r1_val) 
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     select_2 = select_1[select_1["r1"]==r1_val]

#     # ax[int(i/6)][int(i/2)-(int(i/6)*3)].scatter(x=select_2['RatioTotalArea'], y=select_2['eta c'], marker='.', alpha=0.4)
#     # ax[int(i/6)][int(i/2)-(int(i/6)*3)].set_title(title)
    
#     ax[int(i/3)][i%3].scatter(x=select_2['RatioTotalArea'], y=select_2['eta c'], marker='.', alpha=0.4)
#     ax[int(i/3)][i%3].set_title(title)

# xdata = "RatioTotalArea"
# ydata = "Eta c"
# plt.ylabel(yaxis_label)
# ax[0][0].set_ylabel(ydata)
# ax[1][0].set_ylabel(ydata)
# ax[1][0].set_xlabel(xdata)
# ax[1][1].set_xlabel(xdata)
# ax[1][2].set_xlabel(xdata)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_RCA_r1const.png")
# plt.close()

In [None]:
# selection_dataset = dataset.copy()
# r2_vals = [1,10,100,200,500,1000]
# frac_val = 0.9

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# for i in range(len(r2_vals)):
#     r2_val = r2_vals[i]
#     title = "Eta c vs RTA for r2={}".format(r2_val) 
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     select_2 = select_1[select_1["r2"]==r2_val]
    
#     ax[int(i/3)][i%3].scatter(x=select_2['RatioTotalArea'], y=select_2['eta c'], marker='.', alpha=0.4)
#     ax[int(i/3)][i%3].set_title(title)

# xdata = "RatioTotalArea"
# ydata = "Eta c"
# plt.ylabel(yaxis_label)
# ax[0][0].set_ylabel(ydata)
# ax[1][0].set_ylabel(ydata)
# ax[1][0].set_xlabel(xdata)
# ax[1][1].set_xlabel(xdata)
# ax[1][2].set_xlabel(xdata)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_RCA_r2const.png")
# plt.close()

![etac_RCA_r1const](exp_dataset/etac_RCA_r1const.png)
![etac_RCA_r2const](exp_dataset/etac_RCA_r2const.png)

Keeping frac constant and plotting eta c vs RatioTotalArea for varying r values shows similar variation for the two, but with more spread for r2. The data shows what is typical for a slightly underdamped system at lower values and an overdamped one at higher values. This at least gives us some way of viewing how r affects the effect of RatioTotalArea on eta c.

Another useful thing to notice is that the RatioTotalArea has a couple of 'distinct' values (with some small deviations).

However, we may use b1 and a2, so how do they cause changes?

In [None]:
# selection_dataset = dataset.copy()
# b1s = b1_range['b1'].to_numpy()
# b1_vals = [b1s[0], b1s[5], b1s[8], b1s[12], b1s[16], b1s[19] ]
# frac_val = 0.9

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# for i in range(len(b1_vals)):
#     b1_val = b1_vals[i]
#     title = "Eta c vs RTA for b1={}".format(b1_val) 
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     select_2 = select_1[select_1["b1"]==b1_val]
    
#     ax[int(i/3)][i%3].scatter(x=select_2['RatioTotalArea'], y=select_2['eta c'], marker='.', alpha=0.4)
#     ax[int(i/3)][i%3].set_title(title)

# xdata = "RatioTotalArea"
# ydata = "Eta c"
# plt.ylabel(yaxis_label)
# ax[0][0].set_ylabel(ydata)
# ax[1][0].set_ylabel(ydata)
# ax[1][0].set_xlabel(xdata)
# ax[1][1].set_xlabel(xdata)
# ax[1][2].set_xlabel(xdata)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_RCA_b1const.png")
# plt.close()

In [None]:
# selection_dataset = dataset.copy()
# a2s = a2_range['a2'].to_numpy()
# a2_vals = [a2s[0], a2s[4], a2s[6], a2s[8], a2s[10], a2s[12] ]
# frac_val = 0.9

# fig = plt.figure()
# fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
# yaxis_label = "eta c"
# for i in range(len(b1_vals)):
#     a2_val = a2_vals[i]
#     title = "Eta c vs RTA for a2={}".format(a2_val) 
#     select_1 = selection_dataset[selection_dataset["frac"]==frac_val]
#     select_2 = select_1[select_1["a2"]==a2_val]
    
#     ax[int(i/3)][i%3].scatter(x=select_2['RatioTotalArea'], y=select_2['eta c'], marker='.', alpha=0.4)
#     ax[int(i/3)][i%3].set_title(title)

# xdata = "RatioTotalArea"
# ydata = "Eta c"
# plt.ylabel(yaxis_label)
# ax[0][0].set_ylabel(ydata)
# ax[1][0].set_ylabel(ydata)
# ax[1][0].set_xlabel(xdata)
# ax[1][1].set_xlabel(xdata)
# ax[1][2].set_xlabel(xdata)
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_RCA_a2const.png")
# plt.close()

![etac_RCA_b1const](exp_dataset/etac_RCA_b1const.png)
![etac_RCA_a2const](exp_dataset/etac_RCA_a2const.png)

b1 shows similar plots to r1, which was expected, as r1 is inversely proportional to b1. a2, however, is different. a2 shows logarithmic curves that would only have to be learnt for varying fractions, with simpler approximations for the subset of a chosen fraction. This is interesting (and the same holds for 2a2).

## How things Change and Standardizing
Here we will look at when inputs changed. This will hopefully provide some clarity to the changes in eta c?

In [None]:
selection_dataset = dataset.copy()

# Columns to standardize, each with its own freshly fitted StandardScaler.
# The original measurement columns come first, then the added features.
# "Nc Std. Dev" is not in the list and therefore stays unscaled.
columns_to_scale = (
    'r1', 'r2', 'frac', '2a2', 'eta c', 'Nc',
    'a1', 'a2', 'b1', 'b2', 'area1', 'area2', 'TotalArea', 'RatioTotalArea',
)

for column_name in columns_to_scale:
    # The scaler expects a 2-D array, hence the reshape to a single column.
    column_values = selection_dataset[column_name].values.reshape(-1, 1)
    selection_dataset[column_name] = StandardScaler().fit_transform(column_values)

In [None]:
# sel = "r1"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "r2"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "frac"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "a2"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "b1"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "b2"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

#_______________________________________________________________________________________________________________________________________________________________________________________________________________________
# sel = "RatioTotalArea"

# fig = plt.figure()
# fig, ax = plt.subplots(figsize=(15,10))

# y_data = (selection_dataset[sel].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, label="{}".format(sel))

# y_data = (selection_dataset["eta c"].to_numpy())[::2]
# ax.scatter(x=np.arange(len(y_data)), y=y_data, alpha=0.2, label="eta c")

# ax.legend()
# ax.set_title("Eta c and {}".format(sel))
# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/etac_and_{}.png".format(sel))
# plt.close()

These could have been inferred by the correlation and earlier plots, but it was nice to visualise. 

![etac_and_r1](exp_dataset/etac_and_r1.png)
![etac_and_r2](exp_dataset/etac_and_r2.png)
![etac_and_frac](exp_dataset/etac_and_frac.png)
![etac_and_a2](exp_dataset/etac_and_a2.png)
![etac_and_b2](exp_dataset/etac_and_b2.png)
![etac_and_b1](exp_dataset/etac_and_b1.png)


## Check b1 and RatioTotalArea distr

Here we will check the distributions for the normalized b1 and RatioTotalArea. 

In [None]:
# fig = plt.figure()
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,12))

# sns.distplot(selection_dataset["b1"], kde=True, color='red', bins=1000, ax=axes[0][0])
# axes[0][0].set_title("Scaled b1")
# sns.distplot(selection_dataset["RatioTotalArea"], kde=True, color='red', bins=1000, ax=axes[0][1])
# axes[0][1].set_title("Scaled RTA")
# sns.distplot(dataset["b1"], kde=True, color='red', bins=1000, ax=axes[1][0])
# axes[1][0].set_title("Unscaled b1")
# sns.distplot(dataset["RatioTotalArea"], kde=True, color='red', bins=1000, ax=axes[1][1])
# axes[1][1].set_title("Unscaled RTA")

# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/b1_RTA_distr.png")
# plt.close()

![b1_RTA_distr](exp_dataset/b1_RTA_distr.png)

You can see here, that using the sklearn StandardScaler is not advised for b1 and RTA. From the previous work, scaling frac and a2 is good especially if we are using r1 and r2, because I think r1 and r2 need to be scaled to reduce the impact of the much larger r values. However, if {a2, b1, RatioTotalArea, frac} are used, scaling is not necessary as all of the values are: 0<x<1.

We can also limit the dataset to the 42016 values where b1<0.1 and RatioTotalArea<0.2.

In [None]:
# Keep only the rows with b1 < 0.1 and RatioTotalArea < 0.2, as an independent
# copy of the full dataset, then show per-column summary statistics.
lim_dataset = dataset.loc[
    (dataset["b1"] < 0.1) & (dataset["RatioTotalArea"] < 0.2)
].copy()
lim_dataset.describe().T

In [None]:
# fig = plt.figure()
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,12))

# sns.distplot(lim_dataset["b1"], kde=True, color='red', bins=100, ax=axes[0][0])
# axes[0][0].set_title("b1")
# sns.distplot(lim_dataset["RatioTotalArea"], kde=True, color='red', bins=1000, ax=axes[0][1])
# axes[0][1].set_title("RatioTotalArea")
# sns.distplot(lim_dataset["a2"], kde=True, color='red', bins=100, ax=axes[1][0])
# axes[1][0].set_title("a2")
# sns.distplot(lim_dataset["frac"], kde=True, color='red', bins=100, ax=axes[1][1])
# axes[1][1].set_title("frac")

# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/allunscaleddistr.png")
# plt.close()

![allunscaleddistr](exp_dataset/allunscaleddistr.png)

However, doing this, we will need to scale the data. 

In [None]:
scaled_dataset = lim_dataset.copy()

# Standardize the four chosen inputs and the target. Each column gets its own
# scaler, fitted on the limited (unscaled) data and written into the copy.
for column_name in ('b1', 'a2', 'RatioTotalArea', 'frac', 'eta c'):
    unscaled_values = lim_dataset[column_name].values.reshape(-1, 1)
    scaled_dataset[column_name] = StandardScaler().fit_transform(unscaled_values)


In [None]:
# fig = plt.figure()
# fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,12))

# sns.distplot(scaled_dataset["b1"], kde=True, color='red', bins=100, ax=axes[0][0])
# axes[0][0].set_title("b1")
# sns.distplot(scaled_dataset["RatioTotalArea"], kde=True, color='red', bins=1000, ax=axes[0][1])
# axes[0][1].set_title("RatioTotalArea")
# sns.distplot(scaled_dataset["a2"], kde=True, color='red', bins=100, ax=axes[1][0])
# axes[1][0].set_title("a2")
# sns.distplot(scaled_dataset["frac"], kde=True, color='red', bins=100, ax=axes[1][1])
# axes[1][1].set_title("frac")

# fig.savefig("/media/nirav/34E0-F309/KTH/Thesis/edited_data/exp_dataset/allscaleddistr.png")
# plt.close()

![allscaleddistr](exp_dataset/allscaleddistr.png)

In [None]:
# The four inputs plus the target, taken from the full, the limited and the
# limited-and-scaled tables.
final_columns = ["b1", "a2", "RatioTotalArea", "frac", "eta c"]
fin_ds = dataset[final_columns]
fin_us_ds = lim_dataset[final_columns]
fin_s_ds = scaled_dataset[final_columns]

# Show the last rows and the row count of each table, separated by blank lines.
for position, final_frame in enumerate((fin_ds, fin_us_ds, fin_s_ds)):
    if position > 0:
        print()
    print(final_frame.tail())
    print("Length:", len(final_frame))

In [None]:
# Write the three final tables to CSV files in the current working directory
# (the DataFrame index is written as well, as with a plain to_csv call).
csv_outputs = (
    ("data.csv", fin_ds),
    ("LIM_unscaled.csv", fin_us_ds),
    ("LIM_scaled.csv", fin_s_ds),
)
for csv_name, final_frame in csv_outputs:
    final_frame.to_csv(csv_name)

## My Choice
I think conceptually it was easier to use b1 and a2, but r1 and r2 give the same patterns to learn as b1 and a2. However, the algorithm may find it easier to use b1 and a2 instead because they both show positive correlation.


            r1          r2	        2a2         frac	    b1	        a2	        RatioTotalArea
eta c	-0.372151	0.019714	-1.167498e-01	0.047311	1.348565	0.009857	0.762428


In [None]:
# seaborn has no `figure` function; figures are created through matplotlib.
fig = plt.figure()