## Testing LinReg functionality

This notebook contains a small showcase of how to use the functions inside the **LinReg.py** file.

### Testing

In [6]:
import numpy as np
import pandas as pd
import LinReg

Read the data:

In [7]:
# header=None tells pandas not to use the first row as column names,
# so the columns are labelled by integer position.
data = pd.read_csv('dataset.txt', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
0,8.0,1.0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,0.20
1,53.0,1.0,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,0.67
2,24.0,1.0,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,0.43
3,34.0,1.0,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,0.12
4,42.0,1.0,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,0.03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1989,12.0,10.0,0.01,0.40,0.10,0.87,0.12,0.16,0.43,0.51,...,0.22,0.28,0.34,0.48,0.39,0.01,0.28,0.05,0.00,0.09
1990,6.0,10.0,0.05,0.96,0.46,0.28,0.83,0.32,0.69,0.86,...,0.53,0.25,0.17,0.10,0.00,0.02,0.37,0.20,0.00,0.45
1991,9.0,10.0,0.16,0.37,0.25,0.69,0.04,0.25,0.35,0.50,...,0.25,0.68,0.61,0.79,0.76,0.08,0.32,0.18,0.91,0.23
1992,25.0,10.0,0.08,0.51,0.06,0.87,0.22,0.10,0.58,0.74,...,0.45,0.64,0.54,0.59,0.52,0.03,0.38,0.33,0.22,0.19


Instantiate the regressor:

In [8]:
regressor = LinReg.LinReg()

Now create a random number generator and use it to generate a dummy binary array with one entry per column of **data**:

In [9]:
# Unseeded generator: the mask drawn below differs on every fresh run.
# NOTE(review): pass a fixed seed to default_rng if the outputs should be reproducible.
myRNG = np.random.default_rng()
# endpoint=True makes the upper bound inclusive, so every entry is 0 or 1;
# one entry is drawn per column of `data`.
rand_ind = myRNG.integers(0, 1, size=data.shape[1], endpoint=True)
rand_ind

array([0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1])

We can use the **get_columns** method of the regressor to select the columns of **data** that are marked with a 1 and store them in the matrix **X**:

In [10]:
# Presumably keeps the columns of `data` whose entry in `rand_ind` is 1
# (see help(regressor.get_columns) to confirm).
X = regressor.get_columns(data.values, rand_ind)

In [11]:
X.shape

(1994, 48)

Finally, we use the **get_fitness** method to train, test, and calculate the root mean square error of our prediction, where:
* Observations are taken from **X**: all rows, and all columns except the last one
* Targets are taken from the last column of **X** (i.e. the last column selected by the mask)

In [12]:
# Features: every selected column except the last; target: the last selected column.
# NOTE(review): the last column of X is the last column of `data` only when
# rand_ind[-1] == 1 — confirm this is the intended regression target.
regressor.get_fitness(X[:,:-1], X[:,-1])

0.14289772005668763

### Documentation
All methods are documented via docstrings, which can be read both by humans and by Python tooling. For example, we can use the **help** function:

In [13]:
help(regressor.get_fitness)

Help on method get_fitness in module LinReg:

get_fitness(x, y, rng=None) method of LinReg.LinReg instance
    Return the error of the trained model
    
    Parameters
    ----------
    x : an `n x m` matrix of
        Data that should be used for training the model
    y : a vector of length `n`
        Regression values of observarions
    rng : int, optional
        Random seed, by default None
    
    Returns
    -------
    float
        The square root of the MSE of the model



In [14]:
def generate_initial_population(population_size, num_features, rng=None):
    """Create a random binary population.

    Parameters
    ----------
    population_size : int
        Number of individuals (rows of the result).
    num_features : int
        Number of bits per individual (columns of the result).
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` (the default) uses the notebook-level
        ``myRNG`` generator, an int seeds a new generator, and an existing
        generator is used as-is.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(population_size, num_features)`` whose
        entries are 0 or 1.
    """
    # default_rng returns a Generator argument unaltered and builds a new,
    # seeded generator from an int, so both forms are accepted here.
    generator = myRNG if rng is None else np.random.default_rng(rng)
    # endpoint=True makes the upper bound inclusive, so draws come from {0, 1}.
    return generator.integers(
        0, 1, size=(population_size, num_features), endpoint=True
    )

In [15]:
# Small example population: 10 individuals with 5 bits each.
a = generate_initial_population(10, 5)

In [16]:
a

array([[0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 0, 0, 0]])

In [48]:
bit = np.array([1, 1, 1, 1, 0])
bit

array([1, 1, 1, 1, 0])

In [49]:
# `a == bit` compares `bit` against every row of `a`; all(axis=1) then flags
# the rows that match in every position.
np.all(a == bit, axis=1)

array([False,  True, False, False, False, False, False, False, False,
       False])

In [50]:
# True when at least one row of `a` is equal to `bit`.
np.all(a == bit, axis=1).any()

True

In [44]:
# Index of the first row of `a` equal to `bit`.
# The trailing [0][0] raises IndexError when no row matches, so check
# np.all(a == bit, axis=1).any() first.
np.where(np.all(a == bit, axis=1))[0][0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [30]:
# Returns a copy of `a` without the first row equal to `bit`; `a` itself is not modified.
# NOTE(review): the recorded output drops the row [1, 1, 0, 0, 0], not the `bit`
# shown above — it appears to come from an earlier run with a different `bit`.
np.delete(a, np.where(np.all(a == bit, axis=1))[0][0], axis=0)

array([[0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       [1, 1, 1, 0, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 0, 0, 0]])

In [42]:
b = np.array([1, 1, 0, 0, 0])
# Wrap `b` in a list so it is appended as one new row; np.append returns a
# new array and leaves `a` unchanged.
np.append(a, [b], axis=0)

array([[0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 0, 0, 0],
       [1, 1, 0, 0, 0]])

In [39]:
test = np.array([np.array([1, 2, 3]), np.array([1, 2, 5])])
test

array([[1, 2, 3],
       [1, 2, 5]])

In [40]:
test[0]

array([1, 2, 3])

In [51]:
def calculate_entropy(population):
    """Return the summed per-position binary entropy of a population.

    Parameters
    ----------
    population : array-like of 0/1 values
        One individual per row, one bit position per column.

    Returns
    -------
    float
        Sum over bit positions of ``-(p*log2(p) + (1-p)*log2(1-p))``, where
        ``p`` is the fraction of individuals with a 1 in that position.
        A position where every individual agrees contributes exactly 0.
    """
    # Calculate the probability of a bit being 1 in each position
    p_i = np.mean(population, axis=0)
    # Treat "bit is 1" and "bit is 0" uniformly: row 0 holds p, row 1 holds 1 - p
    probs = np.stack([p_i, 1.0 - p_i])
    # log2(0) is undefined, but the entropy convention is 0 * log2(0) = 0.
    # Replacing zero probabilities by 1 inside the log yields exactly that
    # (log2(1) = 0) without biasing the non-zero terms the way an additive
    # epsilon would.
    safe = np.where(probs > 0, probs, 1.0)
    # Calculate the entropy using the formula
    H = -np.sum(probs * np.log2(safe))
    return H

In [53]:
a

array([[0, 0, 0, 0, 1],
       [1, 1, 1, 1, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1],
       [0, 1, 1, 1, 1],
       [1, 1, 0, 0, 0]])

In [55]:
calculate_entropy(a)

4.545120181584697

In [56]:
p_i = np.mean(a, axis=0)

In [57]:
p_i

array([0.6, 0.8, 0.4, 0.3, 0.5])