# Task: To implement Bagging technique using 100 decision trees

In [6]:
# importing necessary libraries
import pandas as pd
import numpy as np

In [7]:
# importing data
data = pd.read_csv('churn_prediction_simple.csv')
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,0,0.0,0,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,6,2531,42,0,2.0,0,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1
2,7,263,42,1,0.0,0,1096.0,2,1666,60.0,...,16059.34,15211.29,13798.82,0.36,0.36,857.5,286.07,15719.44,15349.75,0
3,8,5922,72,0,0.0,1,1020.0,1,1,98.0,...,7714.19,7859.74,11232.37,0.64,0.64,1299.64,439.26,7076.06,7755.98,0
4,9,1145,46,0,0.0,0,623.0,2,317,172.0,...,8519.53,6511.82,16314.17,0.27,0.27,443.13,5688.44,8563.84,5317.04,0


In [8]:
predictors = data.drop(columns=['churn','customer_id'])
target = data['churn']

## Steps to implement bagging technique with "n" number of trees
1. Determine the (n_trees) number of trees
2. Make (n_trees) number of bootstrap samples
3. For each bootstrap samples build a decision tree model and generate predictions
4. For every observation in test set, calculate the model of predictions made by n_trees
5. Calculate F1 score for the final predictions

In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

# Function to implement bagging technique
def bagging_with_n_trees(data, n_trees, test_size=0.2):
    # Split data into training and test sets
    train, test = data.sample(frac=1-test_size, replace=False), data.sample(frac=test_size, replace=False)
    
    # List to store predictions of each decision tree
    predictions_list = []
    
    for i in range(n_trees):
        # Create bootstrap sample
        bootstrap_sample = train.sample(frac=1, replace=True)
        
        # Splitting bootstrap sample into predictors and target
        predictors = bootstrap_sample.drop(columns=['churn', 'customer_id'])
        target = bootstrap_sample['churn']
        
        # Create and train decision tree model
        decision_tree = DecisionTreeClassifier()
        decision_tree.fit(predictors, target)
        
        # Generate predictions for test set
        test_predictors = test.drop(columns=['churn', 'customer_id'])
        test_predictions = decision_tree.predict(test_predictors)
        
        # Store predictions in the list
        predictions_list.append(test_predictions)
    
    # Calculate final predictions by averaging the predictions of all trees
    final_predictions = pd.DataFrame(predictions_list).mean()
    final_predictions = final_predictions.apply(lambda x: 1 if x >= 0.5 else 0)  # Convert probabilities to binary
    
    # Calculate F1 score for the final predictions
    f1 = f1_score(test['churn'], final_predictions)
    
    return f1

# Example usage
n_trees = 100  # Number of decision trees
f1_score = bagging_with_n_trees(data, n_trees)
print("F1 Score:", f1_score)

F1 Score: 0.9088743299583086


In [11]:
# step1 : Setting n_trees_trees
n_trees = 100

In [12]:
import pandas as pd

def BootStrap(data, n_samples, fraction=1):
    '''
    Function to generate bootstrap samples.
    
    Parameters:
    data : DataFrame, data to generate bootstrap samples from
    n_samples : int, number of samples to create
    fraction : float, what fraction of data should be each sample (default=1)
    
    Returns:
    bootstrapped_data : list of DataFrames, where each DataFrame represents a bootstrap sample
    '''
    bootstrapped_data = []
    
    for i in range(n_samples):
        # Create a bootstrap sample
        bootstrap_sample = data.sample(frac=fraction, replace=True)
        bootstrapped_data.append(bootstrap_sample)
    
    return bootstrapped_data

# Example usage
n_samples = 100  # Number of bootstrap samples
bootstrap_samples = BootStrap(data, n_samples)


In [13]:
# Testing function
bootstrap_samples_full = BootStrap(train, n_trees, fraction=1)
bootstrap_samples_75 = BootStrap(train, n_trees, fraction=0.75)
bootstrap_samples_50 = BootStrap(train, n_trees, fraction=0.5)

print('Number of observations and columns for full fraction:')
print(bootstrap_samples_full[0].shape)  # Printing shape of the first sample only, assuming all samples have the same shape

print('Number of observations and columns for 75% fraction:')
print(bootstrap_samples_75[0].shape)  # Printing shape of the first sample only, assuming all samples have the same shape

print('Number of observations and columns for 50% fraction:')
print(bootstrap_samples_50[0].shape)  # Printing shape of the first sample only, assuming all samples have the same shape


Number of observations and columns for full fraction:
(16550, 21)
Number of observations and columns for 75% fraction:
(12412, 21)
Number of observations and columns for 50% fraction:
(8275, 21)


### Expected Outcome

<img src="images/image1.png">

In [14]:
from sklearn.tree import DecisionTreeClassifier

def generate_predictions(train_x, train_y, test_x):
    '''
    Function to train a decision tree model over one bootstrap sample and return its predictions on the test set.
    
    Parameters:
    train_x : DataFrame, independent variables of the sample
    train_y : Series, target variable of the sample
    test_x : DataFrame, independent variables of the test data
    
    Returns:
    predictions : array-like, predictions for the test_x
    '''
    # Create and train a decision tree model
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(train_x, train_y)
    
    # Generate predictions for the test set
    predictions = decision_tree.predict(test_x)
    
    return predictions

# Example usage
# Assuming train_x, train_y, and test_x are defined elsewhere
# predictions = generate_predictions(train_x, train_y, test_x)


In [19]:
# Testing function
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)
expected_length = test.shape[0]
generated_predictions = generate_predictions(train.drop(columns=['churn']), train['churn'], test.drop(columns=['churn']))

print('Expected length of predictions:', expected_length)
print("Generated predictions length:", len(generated_predictions))


Expected length of predictions: 4414
Generated predictions length: 4414


In [20]:
from scipy.stats import mode

def Bagging(n_trees, train_data, test_data, sample_fraction):
    '''
    Function to use the generate_predictions and bootstrap functions to generate predictions for all the bootstrap samples
    and calculate the mode of predictions for each sample.

    Parameters:
    n_trees : int, number of trees
    train_data : DataFrame, training data
    test_data : DataFrame, test data
    sample_fraction : float, fraction of data to be included in each sample
    
    Returns:
    final_predictions : array-like, final predictions of overall bagging technique
    '''
    # List to store predictions of each bootstrap sample
    predictions_list = []

    for i in range(n_trees):
        # Create bootstrap sample
        bootstrap_sample = BootStrap(train_data, 1, fraction=sample_fraction)[0]
        
        # Splitting bootstrap sample into predictors and target
        train_x = bootstrap_sample.drop(columns=['churn'])
        train_y = bootstrap_sample['churn']
        
        # Generate predictions for test set using the bootstrap sample
        predictions = generate_predictions(train_x, train_y, test_data.drop(columns=['churn']))
        
        # Store predictions in the list
        predictions_list.append(predictions)
    
    # Calculate the mode of predictions for each observation across all trees
    final_predictions, _ = mode(predictions_list)
    
    return final_predictions

# Example usage
# final_predictions = Bagging(n_trees, train, test, sample_fraction)


In [21]:
(Bagging(n_trees, train, test, 1))

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)