# Bagging (Bootstrap Aggregating)

Bagging is an ensemble learning technique where multiple models (typically decision trees) are trained independently on different random subsets of the data, and their predictions are aggregated to make a final decision. 

## How It Works:
- **Bootstrap Sampling**: Random subsets of data are sampled with replacement.
- **Parallel Training**: Each model is trained independently on its own subset of data.
- **Aggregation**: The results from all models are combined, often by majority voting (classification) or averaging (regression).
- **Reduces Variance**: Bagging helps make the model more stable and less sensitive to fluctuations in the data.

## Advantages:
✅ **Reduces Overfitting**: Combining multiple models reduces the likelihood of overfitting.  
✅ **Improves Accuracy**: Bagging tends to improve accuracy compared to individual models.  
✅ **Stable Predictions**: By aggregating predictions, the model becomes more robust.  

## Disadvantages:
❌ **Increased Computational Cost**: Bagging requires training multiple models, which can be computationally expensive.  
❌ **Less Interpretability**: As it uses an ensemble of models, bagging may be less interpretable compared to single decision trees.  


In [1]:
# Importing all the necessary dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import cross_val_score, KFold

In [2]:
%run ../Data/Data_Formatting.ipynb

In [3]:
%run ../Data/Ultimate_Hyperparameters.ipynb

In [4]:
%run ../Data/Parameters.ipynb

In [5]:
%run Classification_Tree.ipynb




In [6]:
pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Paths to the team-level training and testing CSV files
train_path = Path("../Data/premierleague_team_data.csv")
test_path = Path("../Data/premierleague_test_team_data.csv")

# Read both CSV files into DataFrames
matches = pd.read_csv(train_path)
test_matches = pd.read_csv(test_path)

In [8]:
# Paths to the training and testing CSV files that include rank
train_path = Path("../Data/premierleague_rank_team_data.csv")
test_path = Path("../Data/premierleague_rank_test_team_data.csv")

# Read both rank-augmented CSV files into DataFrames
new_matches = pd.read_csv(train_path)
new_test_matches = pd.read_csv(test_path)

In [9]:
# Pass the train/test DataFrames loaded without rank to process_data.
# NOTE(review): process_data is not defined in this notebook (presumably it comes
# from one of the notebooks loaded via %run), and its return value is discarded
# here, so it presumably modifies both DataFrames in place — confirm.
process_data(matches, test_matches)

In [10]:
# Pass the rank-augmented train/test DataFrames to process_data.
# NOTE(review): process_data is not defined in this notebook (presumably it comes
# from one of the notebooks loaded via %run), and its return value is discarded
# here, so it presumably modifies both DataFrames in place — confirm.
process_data(new_matches, new_test_matches)

### Bagging using Baseline Predictors (refer to /Data/Data_Formatting.ipynb)

In [11]:
def make_yearly_predictions_bagging(Train, Test):
    """Fit a bagged decision-tree classifier on Train and score it per year of Test.

    Parameters
    ----------
    Train : pandas.DataFrame
        Training rows. Must contain a "Target" column and every column named
        by ``parameters_base(Train, Test)``.
    Test : pandas.DataFrame
        Evaluation rows. Must contain "Target", the same predictor columns,
        and a datetime-like "Date" column (the ``.dt`` accessor is used).

    Returns
    -------
    pandas.DataFrame
        One row per calendar year that has at least one row in Test, with the
        columns "Model", "Year", "Precision" (weighted average) and "Accuracy".
    """
    # Both helpers are defined outside this cell.
    # NOTE(review): presumably best_alpha is a tuned cost-complexity pruning
    # value and static_predictors a list of column names — confirm in the
    # notebooks loaded via %run.
    best_alpha = find_optimal_alpha_base(Train)
    static_predictors = parameters_base(Train, Test)

    # Train Bagging model on training data
    base_tree = DecisionTreeClassifier(max_depth=10, min_samples_split=10, ccp_alpha=best_alpha, random_state=1)
    bagging_clf = BaggingClassifier(estimator=base_tree, n_estimators=50, random_state=1, n_jobs=-1)
    bagging_clf.fit(Train[static_predictors], Train["Target"])

    # The training-set accuracy that used to be computed here was never used
    # or returned, so that extra ensemble prediction pass has been dropped.

    # Extract the year of every test row once instead of on each iteration.
    test_years = Test['Date'].dt.year

    results = []
    for year in range(test_years.min(), test_years.max() + 1):
        test_year = Test[test_years == year]
        if test_year.empty:
            # No test rows fall in this calendar year; nothing to score.
            continue

        # Predict on this year's test data
        preds = bagging_clf.predict(test_year[static_predictors])

        results.append({
            "Model": "Bagging",
            "Year": year,
            "Precision": precision_score(test_year["Target"], preds, average="weighted"),
            "Accuracy": accuracy_score(test_year["Target"], preds)
        })

    # Convert results to DataFrame
    return pd.DataFrame(results)


### Bagging using Baseline Predictors + Rolling Predictors (refer to /Data/Data_Formatting.ipynb)

In [12]:
def make_yearly_predictions_bagging_rolling(Train, Test):
    """Train a bagging ensemble on the rolled data and evaluate it year by year.

    The pruning alpha and the predictor list are obtained from
    ``find_optimal_alpha_roll`` and ``parameters_roll`` before either frame is
    passed through ``roll``. All three helpers are defined outside this cell.

    Parameters
    ----------
    Train, Test : pandas.DataFrame
        Frames that, after ``roll``, contain "Target", the predictor columns
        and (for Test) a datetime-like "Date" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Model", "Year", "Precision", "Accuracy"; one row for each
        year in the Test date range that has at least one row.
    """
    alpha = find_optimal_alpha_roll(Train)
    feature_cols = parameters_roll(Train, Test)

    rolled_train = roll(Train)
    rolled_test = roll(Test)

    # Bagging ensemble of 50 pruned decision trees
    tree = DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=10,
        ccp_alpha=alpha,
        random_state=1,
    )
    ensemble = BaggingClassifier(
        estimator=tree,
        n_estimators=50,
        random_state=1,
        n_jobs=-1,
    )
    ensemble.fit(rolled_train[feature_cols], rolled_train["Target"])

    year_of_row = rolled_test['Date'].dt.year
    first_year = year_of_row.min()
    last_year = year_of_row.max()

    rows = []
    for year in range(first_year, last_year + 1):
        subset = rolled_test[year_of_row == year]
        if subset.empty:
            continue

        predicted = ensemble.predict(subset[feature_cols])
        actual = subset["Target"]

        precision = precision_score(actual, predicted, average="weighted")
        accuracy = accuracy_score(actual, predicted)

        rows.append({
            "Model": "Bagging",
            "Year": year,
            "Precision": precision,
            "Accuracy": accuracy
        })

    return pd.DataFrame(rows)

### Bagging using Full Feature Set (refer to /Data/Data_Formatting.ipynb)

In [14]:
def make_yearly_predictions_bagging_full(Train, Test):
    """Train a bagging ensemble on the full feature set and evaluate it year by year.

    The pruning alpha and the predictor list are obtained from
    ``find_optimal_alpha_full`` and ``parameters_full`` before either frame is
    passed through ``roll``. All three helpers are defined outside this cell.

    Parameters
    ----------
    Train, Test : pandas.DataFrame
        Frames that, after ``roll``, contain "Target", the predictor columns
        and (for Test) a datetime-like "Date" column.

    Returns
    -------
    pandas.DataFrame
        Columns "Model", "Year", "Precision", "Accuracy"; one row for each
        year in the Test date range that has at least one row.
    """
    alpha = find_optimal_alpha_full(Train)
    feature_cols = parameters_full(Train, Test)

    rolled_train = roll(Train)
    rolled_test = roll(Test)

    # Bagging ensemble of 50 pruned decision trees
    tree = DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=10,
        ccp_alpha=alpha,
        random_state=1,
    )
    ensemble = BaggingClassifier(
        estimator=tree,
        n_estimators=50,
        random_state=1,
        n_jobs=-1,
    )
    ensemble.fit(rolled_train[feature_cols], rolled_train["Target"])

    year_of_row = rolled_test['Date'].dt.year
    first_year = year_of_row.min()
    last_year = year_of_row.max()

    rows = []
    for year in range(first_year, last_year + 1):
        subset = rolled_test[year_of_row == year]
        if subset.empty:
            continue

        predicted = ensemble.predict(subset[feature_cols])
        actual = subset["Target"]

        precision = precision_score(actual, predicted, average="weighted")
        accuracy = accuracy_score(actual, predicted)

        rows.append({
            "Model": "Bagging",
            "Year": year,
            "Precision": precision,
            "Accuracy": accuracy
        })

    return pd.DataFrame(rows)