## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [1]:
import numpy as np

In [237]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with an optional out-of-bag (OOB) score.

    Every bag is a bootstrap sample (drawn with replacement) of the training set with
    the same number of objects as the training set. One model instance is fitted per bag
    and the ensemble prediction is the mean of the individual model predictions.

    Parameters:
        num_bags: number of bootstrap samples (and therefore models) to build.
        oob: if True, `fit` keeps the training data so that `OOB_score` can be computed.
    '''

    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store them in self.indices_list.

        Each entry is an integer array of `len(data)` indices drawn uniformly
        with replacement from `range(len(data))`.
        '''
        self.indices_list = []
        data_length = len(data)
        for _ in range(self.num_bags):
            self.indices_list.append(np.random.randint(0, data_length, data_length))

    def fit(self, model_constructor, data, target):
        '''
        Fit one model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)

        Parameters:
            model_constructor: callable returning a fresh model with `fit(X, y)` and `predict(X)`.
            data: training objects, indexable along the first axis (converted with np.asarray).
            target: training targets aligned with `data` (converted with np.asarray).
        '''
        # Plain Python sequences do not support indexing by an index array.
        data = np.asarray(data)
        target = np.asarray(target)

        self.data = None
        self.target = None
        self._generate_splits(data)
        bag_sizes = {len(indices) for indices in self.indices_list}
        assert len(bag_sizes) == 1, 'All bags should be of the same length!'
        assert bag_sizes == {len(data)}, 'All bags should contain `len(data)` number of elements!'

        self.models_list = []
        for indices in self.indices_list:
            model = model_constructor()
            # Store the model object itself: `fit` is not guaranteed to return it.
            model.fit(data[indices], target[indices])
            self.models_list.append(model)

        if self.oob:
            # The training set is only needed afterwards for OOB estimation.
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset.
        '''
        predictions = [model.predict(data) for model in self.models_list]
        return np.mean(predictions, axis=0)

    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase.

        The result is stored in self.list_of_predictions_lists as a 1-D object array
        of Python lists (possibly empty), ordered by bag index inside every list.

        Raises:
            ValueError: if no training data is stored (not fitted, or fitted with oob=False).
        '''
        if getattr(self, 'data', None) is None or getattr(self, 'target', None) is None:
            raise ValueError(
                'OOB predictions are unavailable: call `fit` on an instance created with oob=True first.'
            )

        num_objects = len(self.data)
        predictions_per_object = [[] for _ in range(num_objects)]

        for model, indices in zip(self.models_list, self.indices_list):
            # Boolean mask of the objects this model has never seen.
            is_oob = np.ones(num_objects, dtype=bool)
            is_oob[indices] = False
            oob_indices = np.flatnonzero(is_oob)
            if oob_indices.size == 0:
                continue
            # One batched call per model; reshape enforces exactly one value per object.
            bag_predictions = np.asarray(
                model.predict(self.data[oob_indices]), dtype=float
            ).reshape(oob_indices.size)
            for object_index, prediction in zip(oob_indices.tolist(), bag_predictions.tolist()):
                predictions_per_object[object_index].append(prediction)

        # Fill element by element so that equally sized lists are not merged into a 2-D array.
        packed = np.empty(num_objects, dtype=object)
        for object_index, object_predictions in enumerate(predictions_per_object):
            packed[object_index] = object_predictions
        self.list_of_predictions_lists = packed

    def _get_averaged_oob_predictions(self):
        '''
        Compute average OOB prediction for every object from training set.
        If object has been used in all bags on training phase, its entry is NaN.
        The result is stored in self.oob_predictions.
        '''
        self._get_oob_predictions_from_every_model()
        self.oob_predictions = np.array([
            np.mean(object_predictions) if len(object_predictions) != 0 else np.nan
            for object_predictions in self.list_of_predictions_lists
        ])

    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one OOB prediction.
        Objects without OOB predictions (NaN) are ignored by np.nanmean.
        '''
        self._get_averaged_oob_predictions()
        return np.nanmean((self.target - self.oob_predictions) ** 2)

### Local tests:

In [11]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

#### Simple tests:

In [235]:
# Simple tests: the target is an exact linear function of the features, so both
# the training error and the OOB error of the ensemble must be (almost) zero.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    train_mse = np.mean((predictions - y)**2)
    assert train_mse < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'

    # Share of bags in which an object is out-of-bag, averaged over all objects;
    # the assertion compares it with 1/e.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  0%|          | 0/100 [00:00<?, ?it/s]

[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
[-0.15246758 -0.18925358  0.18645673 ...  0.22912763  0.07794862
  0.67919318]
0.012546071611806452
0.0020294411714423255
[list([-0.1524675842660809, -0.15246758426608042, -0.15246758426608117])
 list([-0.18925358218366328, -0.18925358218366348, -0.18925358218366395, -0.1892535821836636, 

#### Medium tests

In [236]:
# Medium tests: pure-noise targets with 150 features for 200 objects, so the
# OOB error is asserted to exceed the training error.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    squared_train_errors = (predictions - y)**2
    average_train_error = np.mean(squared_train_errors)
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'

    # Share of bags in which an object is out-of-bag, averaged over all objects.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

[ 1.01856435e+00 -8.29897361e-01  1.01156147e+00 -4.62029113e-01
 -9.85399265e-01  3.57138321e-01  2.25808605e+00 -3.00435748e-01
  1.73823456e-01  2.97008089e-01 -2.18313049e+00 -1.00250775e+00
  6.33385314e-01 -9.91608659e-01  1.15665022e-02 -1.16833895e+00
 -2.41392103e-01  2.67620590e+00  1.04826195e+00 -2.11163908e-01
 -1.03457549e+00 -2.63244462e-01  9.62545405e-01 -4.60725986e-01
 -1.99154698e+00  1.51652144e-02 -7.69475314e-01  5.56447842e-01
  4.03146030e-01  6.27772097e-01  2.55696198e-01 -1.76618524e+00
 -1.77777983e-01 -5.82128947e-02 -1.07308424e+00 -6.25673135e-01
 -9.71761197e-01  9.65036016e-01 -8.48901636e-01 -1.41646855e+00
 -3.06281743e-01 -1.10525105e+00  1.78624005e+00 -1.97559948e-01
  1.01594607e+00 -8.94480133e-01  5.63879248e-01  1.59685056e+00
 -7.33440822e-01 -1.24558711e-01 -7.64831751e-01  5.52085141e-01
 -2.45921290e-01 -4.26148467e-01 -1.66882626e+00 -4.52435402e-01
 -1.97524175e-01 -1.07986265e+00  5.43651887e-01  8.31632650e-01
 -3.25439557e-01  1.53640

#### Complex tests:

In [238]:
# Complex tests: with 100 bags the observed out-of-bag share is checked against
# 1/e with a tight tolerance (1e-2).
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()

    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Complex tests done!


In [None]:
# Signed deviation of the observed out-of-bag share from 1/e for the most
# recently fitted regressor (kept as the last expression so the cell displays it).
oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
np.mean(oob_counts) / bagging_regressor.num_bags - 1/np.exp(1)

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!