## Home assignment 05: Bagging and OOB score

Please, fill the lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please, notice, that `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as model class with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [1]:
import numpy as np

In [100]:
class SimplifiedBaggingRegressor:
    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob
        
    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store in self.indices_list list
        '''
        self.indices_list = []
        data_length = len(data)
        for bag in range(self.num_bags):
                indices = np.random.randint(0, data_length, data_length)
                self.indices_list.append(indices)
        
    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.
        
        example:
        
        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        self.models_list = []
        for bag in range(self.num_bags):
            model = model_constructor()
            data_bag, target_bag = data[self.indices_list[bag]], target[self.indices_list[bag]] # Your Code Here
            self.models_list.append(model.fit(data_bag, target_bag)) # store fitted models here
        if self.oob:
            self.data = data
            self.target = target
        
    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset
        '''
        return np.mean(np.array([x.predict(data) for x in self.models_list]), axis=0)
    
    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase
        '''
        list_of_predictions_lists = [[model.predict(self.data[i].reshape(1, -1)) for j, model in enumerate(self.models_list) if i not in self.indices_list[j]] for i in range(len(self.data))]  
        self.list_of_predictions_lists = np.array(list_of_predictions_lists, dtype=object)
    
    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, return None instead of prediction
        '''
        self._get_oob_predictions_from_every_model()
        print('___TEST___1')
        self.oob_predictions = [np.mean(i) if len(i) != len(self.models_list) else None for i in self.list_of_predictions_lists] # Your Code Here
        print('___TEST___2')
        
        
    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction
        '''
        self._get_averaged_oob_predictions()
        #print(len(self.oob_predictions))
        print(np.array([(self.target[i] - pred)**2 for i, pred in enumerate(self.oob_predictions) if pred is not None]))
        return np.array([(self.target[i] - pred)**2 for i, pred in enumerate(self.oob_predictions) if pred is not None]).mean()

### Local tests:

In [2]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


#### Simple tests:

In [101]:
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    assert np.mean((predictions - y)**2) < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    print(oob_score)
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'
    
print('Simple tests done!')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  0%|          | 0/100 [00:00<?, ?it/s]

___TEST___1
___TEST___2
[2.35926418e-33 1.23259516e-32 5.00771878e-32 ... 4.93038066e-32
 1.10933565e-31 0.00000000e+00]
nan





AssertionError: OOB error for linear dependency should be also close to zero!

#### Medium tests

In [54]:
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean((predictions - y)**2)
    assert bagging_regressor.oob, 'OOB feature must be turned on'
    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'
    
print('Medium tests done!')

  0%|          | 0/10 [00:18<?, ?it/s]

___TEST___





ValueError: Expected 2D array, got 1D array instead:
array=[ 0.94221337 -1.31426736 -0.93999077  0.8371804   0.78146644 -0.84766328
 -1.60175761  0.35428879  1.5697252   0.31568524 -0.99343914  1.32248738
 -0.29665033 -0.28022558  0.41648008  0.14461974  0.01194447 -1.32871934
 -0.43192878 -1.35665484 -0.03271855 -1.97899938 -0.46833181 -0.44827314
 -0.63787513 -1.49685677 -1.26057531  0.21075326 -0.91733084  0.94847847
 -0.45081447  1.25023298 -0.44454211 -0.35564658  0.98424418  0.95343567
  1.38476089 -0.17780701 -0.35288811 -0.50729358  0.62875137  0.97060981
 -1.69310573  1.19027289  0.17289211  1.71734327  0.56615305  0.49476668
  2.13510913 -0.07636466  0.31053292  1.51526788  1.06811197  1.92947664
 -1.97059253 -0.43253432 -1.73130045  0.87761368 -1.39816388  0.62329479
 -0.12922227 -1.774643   -0.259362    1.13987294  1.09064907  1.20680021
 -2.93108193  1.22849686  1.50986374  1.15266877 -2.05533638  0.68562031
 -0.5792697   0.53092564 -1.51788178  0.40992031 -0.57320292  0.74400182
  1.04157972  0.49740212 -0.48013196  1.37820373  0.37715567  0.84474093
  1.21157708 -0.85965067  0.76246701  0.53703321 -0.64660579  0.24535677
 -0.65673026 -0.42297744  0.83044027  1.22683374  1.06905578 -0.39303624
 -1.05881735 -0.82125171 -0.14911593  2.33128104  0.18519309 -0.09774862
 -0.30406359  0.19444748 -0.36554974  0.4254357  -2.07357721 -0.99587901
 -0.40243212  0.0730388  -0.33062421 -0.68790448  0.43558571 -0.03404269
 -0.58482728 -0.13522619  0.86932975  1.55609493  0.89040439  1.0271693
  0.96431541  1.335879    2.55411455  1.41863256 -0.38125278  0.89260091
  0.95399758 -0.75506552  0.63132756 -1.46234585  0.36591418  0.00548879
 -0.6887606  -0.20880602  0.51572608  0.42683528  0.18633926 -1.20257502
  1.83165438 -0.01437741  1.57478134 -0.28520705  1.02181793  1.78538584
  1.05159706 -0.80820755 -1.14063515  1.00062413  1.37669442  0.26569727].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

#### Complex tests:

In [None]:
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))
    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)
    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()
    assert abs(
        np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'
    
print('Complex tests done!')

In [None]:
np.mean(
            list(map(len, bagging_regressor.list_of_predictions_lists))
        ) / bagging_regressor.num_bags - 1/np.exp(1)

Great job! Please, save `SimplifiedBaggingRegressor` to  `bagging.py` and submit your solution to the grading system!