# Task: To implement Bagging technique using 100 decision trees

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Load the churn dataset from disk and preview its first rows
csv_path = 'churn_prediction_simple.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,0,0.0,0,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,6,2531,42,0,2.0,0,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1
2,7,263,42,1,0.0,0,1096.0,2,1666,60.0,...,16059.34,15211.29,13798.82,0.36,0.36,857.5,286.07,15719.44,15349.75,0
3,8,5922,72,0,0.0,1,1020.0,1,1,98.0,...,7714.19,7859.74,11232.37,0.64,0.64,1299.64,439.26,7076.06,7755.98,0
4,9,1145,46,0,0.0,0,623.0,2,317,172.0,...,8519.53,6511.82,16314.17,0.27,0.27,443.13,5688.44,8563.84,5317.04,0


In [3]:
# Label column first, then the feature frame (label and customer identifier removed).
# NOTE(review): the later cells shown in this notebook build their features from
# `train` / `test` directly and do not reference `predictors` / `target`.
target = data['churn']
predictors = data.drop(columns=['churn', 'customer_id'])

## Steps to implement bagging technique with "n" number of trees
1. Determine the (n_trees) number of trees
2. Make (n_trees) number of bootstrap samples
3. For each bootstrap sample, build a decision tree model and generate predictions
4. For every observation in the test set, calculate the mode of the predictions made by the n_trees trees

In [4]:
# splitting data: 75% of the rows for training, the remaining rows for testing
# A fixed seed keeps the split (and everything downstream) reproducible across re-runs.
train = data.sample(frac=0.75, replace=False, random_state=42)

# The test set is the complement of the training rows, selected by index label.
# This replaces data.append(train).drop_duplicates(keep=False): DataFrame.append
# was removed in pandas 2.0, and that trick also discarded any rows that happened
# to be duplicated in the source data.
test = data.drop(index=train.index)

#check
train.shape, test.shape

((16550, 21), (5517, 21))

In [5]:
# step1 : Setting n_trees, the number of decision trees (one bootstrap sample is drawn per tree)
n_trees = 100

In [6]:
# step2 : Making n_trees bootstrap samples (hint: see the pandas DataFrame.sample() method)

def BootStrap(data, n_samples, fraction = 1, random_state = None):
    '''
    function to generate bootstrap samples (rows drawn with replacement).

    data : pandas DataFrame to generate bootstrap samples from; its values must be
           convertible to float because the samples are stored in a float numpy array
    n_samples : number of samples to create
    fraction : what fraction of the rows of data should be in each sample (default=1)
    random_state : optional int seed or numpy RandomState for reproducible samples
                   (default=None, which uses numpy's global random state)

    Return: a 3-D numpy array of shape (rows_per_sample, n_columns, n_samples), where
            rows_per_sample = int(data.shape[0] * fraction) and bootstraps[:, :, i]
            holds the i-th bootstrap sample.
    '''
    # Decide the row count once and sample by count, not by fraction:
    # DataFrame.sample(frac=...) rounds the row count whereas the buffer is sized by
    # truncation, so sampling by fraction could produce a shape mismatch.
    rows_per_sample = int(data.shape[0] * fraction)
    bootstraps = np.zeros(shape = (rows_per_sample, data.shape[1], n_samples))

    # A single generator is shared by all draws; re-using an int seed for every
    # draw would make all bootstrap samples identical.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for i in range(n_samples):
        bootstraps[:,:,i] = data.sample(n=rows_per_sample, replace=True, random_state=rng).values

    return bootstraps

In [7]:
# Testing function (do not change)
# Prints the shape of the array returned by BootStrap for three sample fractions.
# The dimensions are (observations per sample, columns, number of samples), so only
# the first dimension should shrink as `fraction` decreases.
print('observations, columns, samples')
print(BootStrap(train, n_trees, fraction=1).shape)
print(BootStrap(train, n_trees, fraction=0.75).shape)
print(BootStrap(train, n_trees, fraction=0.5).shape)

observations, columns, samples
(16550, 21, 100)
(12412, 21, 100)
(8275, 21, 100)


### Expected Outcome

<img src="images/image1.png">

In [8]:
# Step3.1 : create a function called generate_predictions which trains a decision tree model on one bootstrap sample and returns its predictions on the test set

def generate_predictions(train_x, train_y, test_x):
    '''
    Fit a single decision tree on one sample and predict the test observations.

    train_x: independent variables of the sample used for fitting
    train_y: target variable of the sample used for fitting
    test_x: independent variables of the test data

    Return: the fitted tree's predictions for test_x
    '''
    from sklearn.tree import DecisionTreeClassifier

    # Depth-limited tree with class weights balanced by the classifier itself
    tree = DecisionTreeClassifier(class_weight='balanced', max_depth=7)
    return tree.fit(train_x, train_y).predict(test_x)

In [9]:
## testing function
# One tree fitted on the whole training split must return exactly one prediction per test row.
print('Expected length of predictions:', test.shape[0])
sanity_predictions = generate_predictions(
    train.drop(columns=['churn']),
    train['churn'],
    test.drop(columns=['churn']),
)
print("generated predictions shape: ", sanity_predictions.size)
# output numbers should be same

Expected length of predictions: 5517
generated predictions shape:  5517


In [10]:
#Step3.2 : Create a function called Bagging which uses the generate_predictions and BootStrap functions to generate predictions for all the bootstrap samples and calculate the mode of predictions for each test observation.

def Bagging(n_trees, train_data, test_data, sample_fraction = 1):
    '''
    Bagging with decision trees: fit one tree per bootstrap sample of the training
    data and combine the trees' test-set predictions by majority vote (mode).

    n_trees : number of trees (one bootstrap sample is drawn per tree)
    train_data : training DataFrame; the target must be its LAST column, because
                 every column but the last is used as a feature
    test_data : test DataFrame containing a 'churn' column, which is dropped before
                predicting; the remaining columns must be in the same order as the
                feature columns of train_data
    sample_fraction : size of each bootstrap sample as a fraction of the rows of
                      train_data (default=1)

    Return: numpy array of shape (number of test rows, 1) holding the final
            prediction of the overall bagging technique for each test observation
    '''
    from scipy.stats import mode

    bootstrap_samples = BootStrap(train_data, n_samples = n_trees, fraction = sample_fraction)

    # Loop-invariant, so built once. Converted to a plain array because the trees are
    # fitted on numpy arrays; predicting on a DataFrame would trigger a feature-name
    # mismatch warning for every tree.
    test_features = test_data.drop(columns=['churn']).values

    # One column of predictions per tree
    multi_predictions = np.zeros(shape = (test_features.shape[0], n_trees))

    for i in range(n_trees):
        multi_predictions[:,i] = generate_predictions(bootstrap_samples[:,:-1,i], bootstrap_samples[:,-1,i], test_features)

    final_prediction = mode(multi_predictions, axis=1)

    # Newer scipy releases drop the reduced axis by default; reshape so callers always
    # receive the same (n_test_rows, 1) column regardless of the installed version.
    return np.asarray(final_prediction[0]).reshape(-1, 1)

In [11]:
# Run the full ensemble, drawing bootstrap samples as large as the training set
Bagging(n_trees, train, test, sample_fraction=1)

array([[0.],
       [1.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])