# E-coli 8 synthetic dataset test with CGA

In [1]:
using CSV, DataFrames
using Random
using Revise, GADNF

## Load data

In [2]:
# Name of the target whose processed dataset is loaded; it is also the
# prefix of the input file name.
target = "glnX"
data_dir = "./processed_dataset/"
# File name first, then the full path, so the cell still evaluates to the path.
tsv_name = "$(target)_processed_dataset.tsv"
tsv_file = joinpath(data_dir, tsv_name)

"./processed_dataset/glnX_processed_dataset.tsv"

In [3]:
# Read the file at `tsv_file` into a DataFrame.
data = CSV.read(tsv_file, DataFrame)
# Preview the first five rows.
first(data, 5)

Row,glnV,pheV,apaG,lysW,lysV,glnX
Unnamed: 0_level_1,Int64,Int64,Int64,Int64,Int64,Int64
1,0,1,0,0,1,0
2,1,0,1,0,0,0
3,0,0,0,0,0,1
4,0,0,0,1,0,1
5,0,1,1,1,0,0


In [4]:
# The last column of `data` is the target; every column before it is a feature.
target = last(names(data))
features = first(names(data), ncol(data) - 1)

5-element Vector{String}:
 "glnV"
 "pheV"
 "apaG"
 "lysW"
 "lysV"

In [5]:
# Feature matrix: every column of `data` except the last, transposed so that
# each row of X is one feature column and each column of X is one data row.
X = BitMatrix(transpose(Matrix(data[!, 1:end-1])))
# Target vector: the last column of `data`, stored as bits.
y = BitVector(data[!, end])
;

## Run GA

In [13]:
# Configuration passed to both the standard GA and the CGA runs below.
# Keywords not listed here are left at whatever GAConfig defaults to.
cfg = GAConfig(;
    # run length and population
    population_size=200,
    num_generations=500,
    allowed_stagnation_generations=50,
    num_elites=1,
    num_conjunctions=3,
    # crossover rates
    CN_subtree_cx_rate=0.1,
    edge_cx_rate=0.05,
    # mutation rates
    mut_rate=0.3,
    mut_rate_min=0.3,
    mut_rate_max=0.7,
    # replacement
    probabilistic_replacement=false,
)

GAConfig(500, 200, 1, 3, 2, 0.1, 0.05, 0.1, 0.1, 0.3, 0.7, 0.3, 50, false)

### Standard GA

In [30]:
# Run the standard GA on (X, y) with `cfg`, timing the call and keeping the
# result in `final_pop`. The trailing semicolon suppresses its display.
@time final_pop = run_GA(X, y; cfg=cfg, target=target, features=features);

Running GA for glnX ...
Gen   Error    Complexity
1     (0.3100, 0.2000)
2     (0.3100, 0.2000)
3     (0.3100, 0.2000)
4     (0.3100, 0.2000)
5     (0.3100, 0.2000)
6     (0.3100, 0.2000)
7     (0.3100, 0.2000)
8     (0.3100, 0.2000)
9     (0.3100, 0.2000)
10    (0.3100, 0.2000)
11    (0.3100, 0.2000)
12    (0.3100, 0.2000)
13    (0.3100, 0.2000)
14    (0.3100, 0.2000)
15    (0.3100, 0.2000)
16    (0.4600, 0.0000)
17    (0.3100, 0.2000)
18    (0.3100, 0.2000)
19    (0.3100, 0.2000)
20    (0.3100, 0.2000)
21    (0.3100, 0.2000)
22    (0.3100, 0.2000)
23    (0.3100, 0.2000)
24    (0.3100, 0.2000)
25    (0.3100, 0.2000)
26    (0.3100, 0.2000)
27    (0.3100, 0.2000)
28    (0.3100, 0.2000)
29    (0.3100, 0.2000)
30    (0.3100, 0.2000)
31    (0.3100, 0.2000)
32    (0.3100, 0.2000)
33    (0.3100, 0.2000)
34    (0.3100, 0.2000)
35    (0.3100, 0.2000)
36    (0.3100, 0.2000)
37    (0.3100, 0.2000)
38    (0.3100, 0.2000)
39    (0.3100, 0.2000)
40    (0.3100, 0.2000)
41    (0.3100, 0.2000)
42    (

In [31]:
# Smallest individual under the ordering that `minimum` applies to the population.
# NOTE(review): this ordering is defined outside this notebook — confirm it
# matches the intended "best" criterion (e.g. lowest fitting error first).
best = minimum(final_pop)
# Render it as an expression string using the feature names.
to_expression(best, features)

"false"

In [32]:
# Select the individual with the smallest (fitting_error_rate, complexity)
# tuple: tuples compare lexicographically, so complexity only breaks ties.
best_res, best_idx = findmin(final_pop) do ind
    (ind.fitting_error_rate, ind.complexity)
end
@show best_res
# Render the selected individual as an expression string.
to_expression(final_pop[best_idx], features)

best_res = (0.24, 0.4)


"~pheV & lysW"

In [10]:
# Sort the population in place, using the ordering defined for its elements.
sort!(final_pop)
# Distance between each individual and its successor in sorted order.
ds = [compute_distance(a, b) for (a, b) in zip(final_pop, Iterators.drop(final_pop, 1))]
@show extrema(ds)
# The last individual has no successor; pad with 0 so `ds` has one entry each.
push!(ds, 0)

# Per-individual summary: error, complexity and distance to the next individual.
final_df = DataFrame(;
    error=map(ind -> ind.fitting_error_rate, final_pop),
    complexity=map(ind -> ind.complexity, final_pop),
    distance=ds,
)

extrema(ds) = (0, 7)


Row,error,complexity,distance
Unnamed: 0_level_1,Float64,Float64,Int64
1,0.24,0.866667,0
2,0.24,0.866667,0
3,0.24,0.866667,0
4,0.24,0.866667,0
5,0.24,0.866667,0
6,0.24,0.866667,0
7,0.24,0.866667,0
8,0.24,0.866667,0
9,0.24,0.866667,0
10,0.24,0.866667,0


In [11]:
# Expression string of the minimum individual in the population.
to_expression(minimum(final_pop), features)

"~pheV & lysW"

In [12]:
# Index of the first nonzero distance in `ds` (`nothing` if every entry is zero).
findfirst(!=(0), ds)

160

### CGA

In [None]:
# Run the CGA on (X, y) with the same `cfg`, timing the call. This overwrites
# `final_pop`; the trailing semicolon suppresses its display.
@time final_pop = run_CGA(X, y; cfg=cfg, target=target, features=features);

In [None]:
# Sort the population in place, using the ordering defined for its elements.
sort!(final_pop)
# Distance between each individual and its successor in sorted order.
ds = [compute_distance(a, b) for (a, b) in zip(final_pop, Iterators.drop(final_pop, 1))]
@show extrema(ds)
# The last individual has no successor; pad with 0 so `ds` has one entry each.
push!(ds, 0)
;

In [None]:
# Per-individual summary: error, complexity and distance to the next individual.
final_df = DataFrame(;
    error=map(ind -> ind.fitting_error_rate, final_pop),
    complexity=map(ind -> ind.complexity, final_pop),
    distance=ds,
)

In [None]:
# Expression string of the minimum individual in the population.
to_expression(minimum(final_pop), features)

In [None]:
# Index of the first nonzero distance in `ds` (`nothing` if every entry is zero).
findfirst(!=(0), ds)

In [None]:
# Preview the first 20 rows of the summary table.
first(final_df, 20)