# Project 1

## 1: Demo specific functions

In [1]:
# Import from separate .py files
from ga import *
from p1 import *

In [2]:
# GA configuration used by the small function-level demos below

params_1 = dict(
    indiv_len=10,
    pop_size=8,               # Has to be even
    num_parents=8,            # Has to be <= pop_size
    p_m=0.1,
    p_c=0.6,
    max_sine_exp=7,           # 2^7 -> [0,128]
    max_gen=10,
    sine_constraint=False,
)

In [3]:
# DEMO init_pop()
# Build a GA instance from params_1 with sine_fitness as the fitness
# function, ask it for an initial population and print that population.
# The recorded output below shows a list of bit strings.

algorithm_1 = GA(params_1, fitness=sine_fitness)
pop = algorithm_1.init_pop()
print(pop)

['1110011100', '0010000010', '0111010101', '0101101101', '1111110000', '1100100000', '1100110011', '0110001010']


In [4]:
# DEMO select_parents()
# Pass the population from the previous cell to select_parents() and print
# what it returns.
# NOTE(review): the recorded output also contains an array printed before the
# parents. No print in this cell produces it, so it presumably comes from
# inside select_parents() -- confirm in ga.py.

parents = algorithm_1.select_parents(pop)
print('\nParents selcted to make offsprings:\n', parents)

[0.17379322 0.03809101 0.19677415 0.21075088 0.13460095 0.03917677
 0.20681302 0.        ]

Parents selcted to make offsprings:
 ['1110011100', '1110011100', '1100110011', '1100110011', '1100110011', '0101101101', '1110011100', '1100110011']


In [5]:
# DEMO make_offsprings()
# Alternating all-ones / all-zeros parents make it easy to spot in the
# printed result which bits each call changed.

dummy_parents = ['11111', '00000'] * 4

# crossover() on the dummy parents
crossover_offsprings = algorithm_1.crossover(dummy_parents)
print('Crossover:\n', crossover_offsprings)

# mutate() on the same dummy parents
mutation_offsprings = algorithm_1.mutate(dummy_parents)
print('\nMutation:\n', mutation_offsprings)

# make_offsprings() on the same dummy parents
offsprings = algorithm_1.make_offsprings(dummy_parents)
print('\n\nA new generation:\n', offsprings)

Crossover:
 ['10000', '01111', '10000', '01111', '10000', '01111', '10000', '01111']

Mutation:
 ['11111', '00001', '11111', '00010', '11110', '00000', '11111', '00000']


A new generation:
 ['10000', '01111', '10000', '01111', '10000', '00111', '10000', '01111']


## 2: Demo GA - Sine [0, 128]

In [6]:
# DEMO run(): the complete GA driven by the sine fitness function

params_2 = dict(
    indiv_len=15,
    pop_size=20,              # Has to be even
    num_parents=20,           # Has to be <= pop_size
    p_m=0.05,
    p_c=0.6,
    max_sine_exp=7,           # 2^7 -> [0,128]
    max_gen=100,
    sine_constraint=False,
)
algorithm_2 = GA(params_2, fitness=sine_fitness)
# run() returns the log that the following cells iterate over
eval_log_2 = algorithm_2.run()

[0.01345241 0.00082054 0.00455451 0.06903413 0.09536301 0.03720317
 0.04272079 0.09812776 0.         0.06879811 0.10666058 0.00068105
 0.0212832  0.0009961  0.09076451 0.07369944 0.06591218 0.07587029
 0.05065152 0.08340672]
[0.07729798 0.05038883 0.07000773 0.03482749 0.05229308 0.05611074
 0.06032179 0.0892268  0.0863696  0.04632758 0.04898511 0.03237361
 0.02507193 0.05352393 0.00904418 0.         0.06032179 0.05645643
 0.00980959 0.08124178]
[0.00279002 0.07721474 0.0669614  0.0239789  0.07743289 0.07487296
 0.07447242 0.07721474 0.04484907 0.         0.01202381 0.07536745
 0.03622111 0.02158512 0.04433923 0.077242   0.04566335 0.07721474
 0.02348113 0.06707493]
[0.07750477 0.07749854 0.06700667 0.04525903 0.01888084 0.06700667
 0.07422396 0.04762535 0.06773651 0.02318838 0.0266089  0.07575504
 0.01850563 0.         0.03885836 0.06623358 0.05192496 0.06411046
 0.03173452 0.06033781]
[0.02186756 0.00074353 0.03295054 0.08393582 0.02153542 0.08484823
 0.05655006 0.00082045 0.09381274

In [7]:
# Print a snapshot of every gen_print-th generation stored in eval_log_2
gen_print = 10

for generation, data in eval_log_2.items():
    if generation % gen_print != 0:
        continue  # skip generations that are not a multiple of gen_print
    # Layout of one log entry -> 0: pop, 1: weights, 2: x-values, 3: fitness
    print('Generation:', generation, '\n')
    print('Population:', data[0], '\n')
    print('Population weights:', data[1], '\n')
    print('Population sine fitness value:', [f'{item:.2f}' for item in data[3]], '\n\n')
        

Generation: 0 

Population: ['010001101101001', '111110001000101', '010101011010011', '001010110111001', '100110110110010', '101101101101110', '100101100101110', '111100110011001', '111010111000111', '111011110100101', '011101001101100', '100001101100010', '100111100110010', '101000001001110', '101010000011001', '000100100010001', '111101001010000', '011101101001010', '110000101110110', '110101101011111'] 

Population weights: [0.01345241 0.00082054 0.00455451 0.06903413 0.09536301 0.03720317
 0.04272079 0.09812776 0.         0.06879811 0.10666058 0.00068105
 0.0212832  0.0009961  0.09076451 0.07369944 0.06591218 0.07587029
 0.05065152 0.08340672] 

Population sine fitness value: ['-0.75', '-0.98', '-0.92', '0.27', '0.75', '-0.32', '-0.22', '0.80', '-1.00', '0.26', '0.95', '-0.99', '-0.61', '-0.98', '0.66', '0.35', '0.21', '0.39', '-0.07', '0.53'] 


Generation: 10 

Population: ['101001100100100', '001111011011010', '001111100100101', '011101110011010', '001101001011010', '11110100101

In [8]:
# Plot the population through the generations

%matplotlib notebook
from ipywidgets import *
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider, Button


DATA = eval_log_2

x_sine = np.linspace(0, 128, 1000)
y_sine = np.sin(x_sine)

fig, axs = plt.subplots(figsize=(5,3))
plt.subplots_adjust(bottom=0.35)
plt.title("Population plot")
plt.xlabel("x")
plt.ylabel("sin(x)");
plt.xlim(-1, 129)
plt.ylim(-1.5, 1.5)
line, = axs.plot(x_sine, y_sine)

i = 1
x = DATA[i][2]
y = DATA[i][3]

dots = axs.scatter(x, y, marker='o', color='orange')

ax = plt.axes([0.25, 0.1, 0.55, 0.05])
generation = Slider(ax, label='Generation', valmin=0, valmax=params_2['max_gen'], valstep=1, valinit=i)

def update(val):
    gen = generation.val
    dots.set_offsets(np.c_[DATA[gen][2], DATA[gen][3]])

generation.on_changed(update)


    


<IPython.core.display.Javascript object>

0

In [9]:
# Summed fitness of the whole population, one value per logged generation
# (index 3 of a log entry holds the fitness values)
gen_fitness = [np.sum(entry[3]) for _, entry in eval_log_2.items()]

# Draw the curve using the explicit Axes interface
fig, axs = plt.subplots(figsize=(5,3))
axs.set_title("Fitness Plot")
axs.set_xlabel("Generation")
axs.set_ylabel("Sum of Population Fitness")
axs.plot(gen_fitness)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fe9902a6f50>]

## 3: Demo GA - Sine [0, 128] with [5, 10]-constraint

In [10]:
# Same sine setup as before, but with the constraint flag switched on
params_3 = dict(
    indiv_len=15,
    pop_size=20,              # Has to be even
    num_parents=20,           # Has to be <= pop_size
    p_m=0.05,
    p_c=0.6,
    max_sine_exp=7,           # 2^7 -> [0,128]
    max_gen=10,
    sine_constraint=True,     # THIS ONE IS TRUE NOW
)

algorithm_3 = GA(params_3, fitness=sine_fitness)
# run() returns the log that the following cells iterate over
eval_log_3 = algorithm_3.run()


[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05 0.05 0.05]
[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05 0.05 0.05]
[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05 0.05 0.05]
[0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05 0.05 0.05]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0.07113638 0.06733039 0.         0.06733039 0.         0.06733039
 0.06709357 0.05261855 0.06709357 0.         0.06709357 0.06733039
 0.06827903 0.06733039 0.06733039 0.         0.06733039 0.
 0.06827903 0.06709357]
[0.05563616 0.08060826 0.05387963 0.04347954 0.05563616 0.05407426
 0.10123107 0.05642004 0.         0.05563616 0.05544047 0.05563616
 0.05602787 0.         0.05021258 0.0562239  0.05878111 0.
 0.05544047 0.05563616]
[0.05316909 0.05298209 0.         0.0773894  0.         0.07685571
 0.05167645 0.05167645 0.

In [11]:
# Print a snapshot of every gen_print-th generation stored in eval_log_3
gen_print = 10

for generation, data in eval_log_3.items():
    if generation % gen_print != 0:
        continue  # skip generations that are not a multiple of gen_print
    # Layout of one log entry -> 0: pop, 1: weights, 2: x-values, 3: fitness
    print('Generation:', generation, '\n')
    print('Population:', data[0], '\n')
    print('Population weights:', data[1], '\n')
    print('Population sine fitness value:', [f'{item:.2f}' for item in data[3]], '\n\n')

Generation: 0 

Population: ['011110000010001', '001010000111000', '101100100001010', '000110011111101', '011111000000001', '010100000111111', '111010011011101', '110100110101100', '110110010101101', '101010100111000', '101100110010100', '100111011010000', '010011001100110', '010001100100011', '110110110001010', '100000000010010', '010001101001100', '010000011100110', '110010010110111', '110100011010011'] 

Population weights: [0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05
 0.05 0.05 0.05 0.05 0.05 0.05] 

Population sine fitness value: ['-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25', '-1.25'] 


Generation: 10 

Population: ['000100010010001', '000100010010000', '000100010010000', '000100010010100', '000100010000100', '000000110001100', '000100010010000', '000100010110001', '000100100001101', '000000010010000', '000100010000000', '000100100

In [20]:
# Plot the generations
#
# Draws sin(x) on [0, 128], overlays the population of one generation of the
# constrained run as orange dots and adds a slider that switches the
# displayed generation. Relies on `plt`, `Slider` and `np` already being in
# scope from earlier cells.

DATA = eval_log_3

x_sine = np.linspace(0, 128, 1000)
y_sine = np.sin(x_sine)

fig, axs = plt.subplots(figsize=(5,3))
plt.subplots_adjust(bottom=0.35)      # leave room below the plot for the slider
plt.title("Population plot")
plt.xlabel("x")
plt.ylabel("sin(x)")
plt.xlim(-1, 129)
plt.ylim(-1.5, 1.5)
line, = axs.plot(x_sine, y_sine)

# Generation shown when the figure is first drawn
i = 1
x = DATA[i][2]      # log entry index 2: x-values
y = DATA[i][3]      # log entry index 3: fitness

dots = axs.scatter(x, y, marker='o', color='orange')

ax = plt.axes([0.25, 0.1, 0.55, 0.05])
# The Slider must stay referenced (here by the name `generation`) to remain responsive
generation = Slider(ax, label='Generation', valmin=0, valmax=params_3['max_gen'], valstep=1, valinit=i)

def update(val, log=DATA, points=dots, canvas=fig.canvas):
    """Move the scatter points to the generation selected on the slider.

    `val` is the slider value passed in by matplotlib. The log, the scatter
    artist and the canvas are bound as default arguments when this cell runs,
    so the callback keeps working on *this* figure even after other cells
    rebind the module-level names `DATA`, `dots`, `fig` or `generation`.
    """
    gen = int(round(val))   # slider values arrive as floats; log keys are generation numbers
    points.set_offsets(np.c_[log[gen][2], log[gen][3]])
    canvas.draw_idle()      # request a redraw of this figure

generation.on_changed(update)


<IPython.core.display.Javascript object>

0

In [13]:
# Summed fitness of the whole population, one value per logged generation
# (index 3 of a log entry holds the fitness values)
gen_fitness = [np.sum(entry[3]) for _, entry in eval_log_3.items()]

# Draw the curve using the explicit Axes interface
fig, axs = plt.subplots(figsize=(5,3))
axs.set_title("Fitness Plot")
axs.set_xlabel("Generation")
axs.set_ylabel("Sum of Population Fitness")
axs.plot(gen_fitness)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fe9902b6c10>]

## 4: Demo GA - Feature selection

In [14]:
# Load both CSV files (first column used as the index) and show them

data_df, values_df = [
    pd.read_csv(csv_path, index_col=[0])
    for csv_path in ('data/data.csv', 'data/values.csv')
]
display(data_df, values_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,8.0,1.0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,0.20
1,53.0,1.0,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,0.67
2,24.0,1.0,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,0.43
3,34.0,1.0,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,0.12
4,42.0,1.0,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1988,28.0,10.0,0.02,0.41,1.00,0.21,0.02,0.01,0.54,0.57,...,0.01,0.75,0.57,0.74,0.71,0.03,0.17,0.02,0.00,0.19
1989,12.0,10.0,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,...,0.22,0.28,0.34,0.48,0.39,0.01,0.28,0.05,0.00,0.09
1990,6.0,10.0,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,...,0.53,0.25,0.17,0.10,0.00,0.02,0.37,0.20,0.00,0.45
1991,9.0,10.0,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,...,0.25,0.68,0.61,0.79,0.76,0.08,0.32,0.18,0.91,0.23


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,6.0,10.0,0.2,0.78,0.14,0.46,0.24,0.77,0.5,0.62,...,0.68,0.5,0.34,0.35,0.68,0.11,0.3,0.05,1.0,0.48


In [15]:
# GA configuration for the feature-selection demo
params_4 = dict(
    indiv_len=102,
    pop_size=10,              # Has to be even
    num_parents=10,           # Has to be <= pop_size
    p_m=0.05,
    p_c=0.6,
    max_sine_exp=7,           # 2^7 -> [0,128]
    max_gen=10,
    sine_constraint=False,
)

# Same GA class as before, now scored with feature_fitness
algorithm_4 = GA(params_4, fitness=feature_fitness)

In [16]:
# Test regression class
# Scores every individual of a freshly initialised population with LinReg,
# outside the GA loop. The results are collected in `errors`; nothing is
# printed or displayed by this cell.
import LinReg

test_pop = algorithm_4.init_pop()
#print(test_pop)

linreg = LinReg.LinReg()
#feats = values_df.to_numpy().shape[1]
#x = data_df.to_numpy().reshape(feats, data_df.to_numpy().shape[0])
#y = values_df.to_numpy().reshape(feats, 1)
#linreg.train(x, y)
errors = []  # one get_fitness() result per individual
for indiv in test_pop:
    # presumably selects the columns flagged by the individual's bits --
    # confirm against LinReg.get_columns
    x = linreg.get_columns(data_df, indiv)
    y = linreg.get_columns(values_df, indiv)
    #print(x.shape, y.shape)
    feats = y.shape[1]
    # NOTE(review): if x is a NumPy array, reshape() keeps the element order
    # and is not a transpose -- confirm this layout is what get_fitness expects
    x = x.reshape(feats, x.shape[0])
    y = y.reshape(feats, 1)
    error = linreg.get_fitness(x, y)
    errors.append(error)
    #print(error)
    

In [17]:
# Run the GA configured in algorithm_4; the returned log is iterated by the cells below
eval_log_4 = algorithm_4.run()

[0.05576174 0.21077    0.22941256 0.02865446 0.14024098 0.05419982
 0.06639693 0.13813513 0.07642836 0.        ]
[0.16192649 0.16579005 0.1634714  0.0435557  0.         0.16142596
 0.02037307 0.13968014 0.09933556 0.04444162]
[0.11514343 0.         0.11150187 0.1113706  0.10316697 0.11176064
 0.10978287 0.11169146 0.11102206 0.11456009]
[0.07396415 0.         0.11631476 0.11499253 0.11589237 0.11444271
 0.11533103 0.1173808  0.11672993 0.11495173]
[0.12173405 0.14922422 0.         0.05644902 0.14697418 0.08349386
 0.14988532 0.08947523 0.09776692 0.1049972 ]
[0.04488659 0.03710351 0.         0.17263177 0.11503168 0.09817565
 0.08853521 0.10109395 0.06644724 0.2760944 ]
[0.13219033 0.0752436  0.07822839 0.11302473 0.07844662 0.
 0.15646834 0.12162571 0.13757605 0.10719623]
[0.15545865 0.         0.08842593 0.1654885  0.05704821 0.14365294
 0.08174013 0.1322263  0.08393322 0.09202613]
[0.         0.13642755 0.13437324 0.14401589 0.00656735 0.10318977
 0.09020372 0.20748312 0.08141763 0.0

In [22]:
# Print a snapshot of every gen_print-th generation stored in eval_log_4
gen_print = 10

for generation, data in eval_log_4.items():
    if generation % gen_print != 0:
        continue  # skip generations that are not a multiple of gen_print
    # Layout of one log entry -> 0: pop, 1: weights, 2: x-values, 3: fitness
    print('Generation:', generation, '\n')
    print('Population:', data[0], '\n')
    print('Population weights:', data[1], '\n')
    print('Population RMSE error:', [f'{item:.2f}' for item in data[3]], '\n\n')

Generation: 0 

Population: ['011000100101101001100111001011110001000101111011100111111000011000001101101100011110110101011110101011', '111110110011100011100000110010110100011111110100111001110000101010011000111100110100100100110001110111', '001011011110110011000000100111000001001000011000111110010000100011101100110000001111100111111101001000', '010000001001110000111000101011110001101010100000100111001011001111010111111110110100100001010011001011', '110100011001011011000111111100011110011001000011010001010111010011101101111110111100001100100111000011', '011011001011101110101111101000101111111100010110011111101101111001000100101110101101000001001010011110', '011100111001111111101100001101101000011101010111101011100111110001001011010000000010010010111101110000', '100001110100110111010111010000110001110100111011000111110100111001111011000010011011110111001110110001', '011000110011010010010100001001001010001111101000100110010011101000100010100011101010110100111011110011', '0100000010011111

In [24]:
# Sum of the values stored at index 3 of each log entry, one per logged generation
gen_fitness = [np.sum(entry[3]) for _, entry in eval_log_4.items()]

# Draw the curve using the explicit Axes interface
fig, axs = plt.subplots(figsize=(5,3))
axs.set_title("Fitness Plot")
axs.set_xlabel("Generation")
axs.set_ylabel("Sum of Population Fitness")
axs.plot(gen_fitness)



<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7fe9301c1950>]

In [26]:
# Print key measures
# Only log entries from position 10 of the key list onward are included.
# Each printed number is the mean, over those entries, of the per-generation
# min / mean / median of the values stored at index 3.
print('Per generation: min, mean, medidan:')
late_values = [eval_log_4[gen][3] for gen in list(eval_log_4.keys())[10:]]
min_list = [min(values) for values in late_values]
mean_list = [np.mean(values) for values in late_values]
median_list = [np.median(values) for values in late_values]

print(np.mean(min_list), np.mean(mean_list), np.mean(median_list))

Per generation: min, mean, medidan:
0.25053499124970763 0.2911873631836649 0.28150731353056624
