## Load packages

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Function to load data

In [2]:
def parse_telecom_data(filename_train,filename_test):
    '''
    Takes filename and returns X and Y after applying label encoding and OneHotEncoding

    Input:
        filename: name of CSV file to read
    Output:
        X: nparray of X data
        Y: nparray of labels
    '''
    X_train_in = pd.read_csv(filename_train)
    X_test_in = pd.read_csv(filename_test)
    ID_test = X_test_in[['customerID']].to_numpy()

    # get X with categorical data
    X_train_cat = X_train_in.drop(columns=['customerID','tenure','MonthlyCharges','TotalCharges','Discontinued'])
    X_test_cat = X_test_in.drop(columns=['customerID','tenure','MonthlyCharges','TotalCharges'])

    # get X with numeric data
    X_train_num = X_train_in[['tenure','MonthlyCharges']].to_numpy()
    X_test_num = X_test_in[['tenure','MonthlyCharges']].to_numpy()

    # get Y
    Y_train_cat = X_train_in.filter(['Discontinued'])

    # convert labels to numeric using LabelEncoder
    le = preprocessing.LabelEncoder()
    colList = X_train_cat.columns
    ncol_le = len(colList)
    m_train_rows = X_train_cat.shape[0]
    m_test_rows = X_test_cat.shape[0]
    X_train_le = np.zeros((m_train_rows,ncol_le))
    X_test_le = np.zeros((m_test_rows,ncol_le))
    for jdx in range(0,ncol_le):
        le.fit(X_train_cat[colList[jdx]])
        X_train_le[:,jdx] = le.transform(X_train_cat[colList[jdx]])
        X_test_le[:,jdx] = le.transform(X_test_cat[colList[jdx]])
    
    # get labels for training data
    Y_train = Y_train_cat.apply(le.fit_transform)
    Y_train = Y_train.to_numpy()

    # use OneHotEncoder (applied to encoded labels)
    enc = preprocessing.OneHotEncoder()
    enc.fit(X_train_le)
    X_train_ohl = enc.transform(X_train_le).toarray()
    X_test_ohl = enc.transform(X_test_le).toarray()

    # combine data
    n_ohl = X_train_ohl.shape[1]
    n_num = X_train_num.shape[1]
    X_train = np.zeros((m_train_rows,n_ohl+n_num))
    X_test = np.zeros((m_test_rows,n_ohl+n_num))
    X_train[:,0:n_ohl] = X_train_ohl
    X_test[:,0:n_ohl] = X_test_ohl
    X_train[:,n_ohl:] = X_train_num
    X_test[:,n_ohl:] = X_test_num

    return X_train,Y_train,X_test,ID_test

## Load Data

In [3]:
# get data
[X_train,Y_train,X_test,ID_test] = parse_telecom_data(filename_train='train.csv',filename_test='test.csv');

# check for NaN
if np.isnan(X_train).any():
    print('NaN in training data');
if np.isnan(X_test).any():
    print('NaN in test data');

## Train Decision Tree Classifier 

In [4]:
#clf = RandomForestClassifier(n_estimators = 1000,criterion='gini');
clf = DecisionTreeClassifier(criterion='gini');
clf.min_samples_leaf = 25;
clf.fit(X_train,Y_train.flatten());

## Compute metric for training data

In [5]:
y_train_prob = clf.predict_proba(X_train);
thisMetric = metrics.roc_auc_score(Y_train, y_train_prob[:,1]);
print("Training metric: ",thisMetric)

Training metric:  0.8802695275895379


## Train Bagging Classifier

In [6]:
estimator_range = [2,4,6,8,10,12,14,16];
models = [];
scores = [];

for n_estimators in estimator_range:

    # Create bagging classifier
    clf = BaggingClassifier(n_estimators = n_estimators)

    # Fit the model
    clf.fit(X_train,Y_train.flatten())

    # compute metric
    y_train_prob = clf.predict_proba(X_train);
    thisMetric = metrics.roc_auc_score(Y_train, y_train_prob[:,1]);
    print("Training metric: ",thisMetric)

    models.append(clf);
    scores.append(thisMetric);

Training metric:  0.949417039628642
Training metric:  0.9873716944473591
Training metric:  0.9955812598818341
Training metric:  0.9972643827042125
Training metric:  0.9983688868306132
Training metric:  0.9990512534778628
Training metric:  0.999338256310806
Training metric:  0.9995658324619833


## Save test prediction to CSV

In [7]:
y_test_prob = clf.predict_proba(X_test);
thisData = np.concatenate((ID_test, np.reshape(y_test_prob[:,1],(-1,1))), axis=1);
thisLabel = ['ID','TARGET'];
y_test_prob_pd = pd.DataFrame(data=thisData,columns=thisLabel)
fname_submit = 'test_submission.csv';
y_test_prob_pd.to_csv(fname_submit,index=False);