# Churn Prediction☎️📊🔖📉

In this notebook:

- EDA for Iranian Churn Dataset
- Grid search for hyperparameter tuning, validated with both cross-validation and a single hold-out split
- Decision Tree
- Naive Bayes
- SVM
- Neural Networks
- Bagging ensemble method
- Boosting ensemble method

In [None]:
import csv
import pandas as pd
import numpy as np
import seaborn as sns
import time

In [None]:
# Load the raw churn table from the Kaggle input directory, then preview it.
csv_path = '../input/customer-churn/Customer Churn.csv'
df = pd.read_csv(csv_path)
df

In [None]:
df.info()

In [None]:
# Map the raw CSV headers (some contain a double space) to the snake_case
# names used in the rest of the notebook.
column_renames = {
    "Call  Failure": "call_failure",
    "Complains": "complains",
    "Subscription  Length": "subs_len",
    "Charge  Amount": "charge_amount",
    "Seconds of Use": "total_sec_calls",
    "Frequency of use": "total_num_calls",
    "Frequency of SMS": "total_num_sms",
    "Distinct Called Numbers": "distinct_call_nums",
    "Age Group": "age_group",
    "Tariff Plan": "tariff_plan",
    "Status": "status",
    "Age": "age",
    "Customer Value": "customer_value",
}
df = df.rename(columns=column_renames)

In [None]:
df

In [None]:
#see how many unique values for each col
df.nunique()

In [None]:
#there is no Nan values
df.isnull().sum()

# EDA

In [None]:
# Bar chart of the target classes, to check how imbalanced Churn is.
sns.set_style("dark")
sns.set(rc={'figure.figsize': (4, 4)})
churn_palette = sns.color_palette("Paired", 7)
sns.countplot(data=df, x="Churn", palette=churn_palette, saturation=10)

In [None]:
# Number of customers in each age_group.
sns.set_style("dark")
age_group_palette = sns.color_palette("husl", 8)
sns.countplot(data=df, x="age_group", palette=age_group_palette, saturation=10)

In [None]:
# Number of customers at each charge_amount level.
sns.set_style("dark")
charge_palette = sns.color_palette("husl", 8)
sns.countplot(data=df, x="charge_amount", palette=charge_palette, saturation=10)

In [None]:
# Age-group counts again, this time split into churned / non-churned bars.
sns.set_style("dark")
age_churn_palette = sns.color_palette("husl", 8)
sns.countplot(data=df, x="age_group", hue="Churn", palette=age_churn_palette, saturation=10)

In [None]:
# Global plot styling followed by an outlined count plot of charge_amount.
# Requested rc overrides: black text/tick/label colours, no grid, a 12x6
# figure and larger label/tick fonts.
sns.set(rc={"font.style":"normal",
            "text.color":"black",
            "xtick.color":"black",
            "ytick.color":"black",
            "axes.labelcolor":"black",
            "axes.grid":False,
            'axes.labelsize':30,
            'figure.figsize':(12.0, 6),
            'xtick.labelsize':25,
            'ytick.labelsize':20})

# NOTE(review): this second sns.set() call (and the set_style() below) re-applies
# seaborn style/context settings after the rc overrides above, so some of those
# overrides may not survive — confirm which values are meant to take effect.
sns.set(style="white",font_scale=1)


sns.set_style("dark")
# Bars get a black (0,0,0) edge of width 2.
sns.countplot(x="charge_amount", data=df, palette=sns.color_palette("husl", 8), 
              saturation=10, edgecolor=(0,0,0), linewidth=2)

In [None]:
# library
import matplotlib.pyplot as plt
from palettable.colorbrewer.qualitative import Pastel1_7

# Donut chart: percentage of customers per distinct `age` value.
names = list(df["age"].unique())
# Count once instead of recomputing value_counts() for every slice.
age_counts = df["age"].value_counts()
sizes = [age_counts[unique_class] * 100 / len(df["age"]) for unique_class in names]
colors = Pastel1_7.hex_colors
# One offset per slice (0 = not exploded); sized from the data so the chart
# keeps working if the number of distinct ages changes.
explode = (0,) * len(names)

plt.pie(sizes, explode=explode, labels=names, colors=colors,
        autopct='%1.1f%%', shadow=True)

# Draw a white circle at the centre of the pie to make it look like a donut.
centre_circle = plt.Circle((0, 0), 0.50, color='black', fc='white', linewidth=0.80)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

# Equal aspect ratio so the pie is drawn as a circle.
plt.axis('equal')
plt.show()

In [None]:
# For each categorical column (and the target) print its distinct values
# followed by the frequency of each value.
cat_feature_col = ["complains", "charge_amount", "tariff_plan", "status", "age", "Churn"]
separator = "-------------------------------------------"
for col_name in cat_feature_col:
    distinct_values = df[col_name].unique()
    print(f"{col_name} : {distinct_values}")
    print(df[col_name].value_counts())
    print(separator)

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
df_corr = df.corr()

sns.set(font_scale=0.8)
plt.figure(figsize=(16, 12))
corr_cmap = sns.diverging_palette(145, 300, s=60, as_cmap=True)
sns.heatmap(
    df_corr,
    annot=True,
    fmt=".4f",
    vmin=-1,
    vmax=1,
    linewidths=.5,
    cmap=corr_cmap,
)
plt.show()

In [None]:
# Correlation of every feature with the Churn target, as horizontal bars.
features_only = df.drop('Churn', axis=1)
churn_corr = features_only.corrwith(df.Churn)
churn_corr.plot(kind='barh', figsize=(8, 6), color='skyblue', title="Churn vs all features")

In [None]:
!pip install ppscore

In [None]:
import seaborn as sns
import ppscore as pps

# Predictive Power Score of every column pair, reshaped into a y-by-x grid
# (rows = target column, columns = predictor column) and drawn as a heatmap.
pps_long = pps.matrix(df)
matrix_df = pps_long.pivot(index='y', columns='x', values='ppscore')

sns.heatmap(matrix_df, annot=True)

In [None]:
#for cat data distribution
import matplotlib

# One subplot per categorical feature: the histogram of the non-churned
# customers is drawn first, the churned customers on top of it.
plt.figure(figsize=(32, 32))
matplotlib.rc('axes', titlesize=24)  # subplot title size

cat_feature_col = ["complains", "charge_amount", "age_group", "tariff_plan", "status", "age"]
churn_groups = [
    (0, 'pink', 'churn = 0(non-churn)'),
    (1, 'tomato', 'churn = 1(churn)'),
]
for position, column in enumerate(cat_feature_col, 1):
    plt.subplot(4, 4, position)
    for churn_value, bar_color, legend_label in churn_groups:
        group_values = df[df["Churn"] == churn_value][column]
        group_values.hist(bins=20, color=bar_color, label=legend_label, alpha=1)
    plt.legend(fontsize='medium')
    plt.title(column)

In [None]:
# age_group is dropped because it is highly correlated with age; the FN and
# FP columns are removed in the same step.
redundant_cols = ["age_group", "FN", "FP"]
df = df.drop(redundant_cols, axis=1)

In [None]:
df

In [None]:
# Scatter-plot matrix (lower triangle only) of the continuous features,
# coloured by Churn.
cont_feature_col = [
    "call_failure",
    "subs_len",
    "total_sec_calls",
    "total_num_calls",
    "total_num_sms",
    "distinct_call_nums",
    "customer_value",
]

sns.set(style="ticks")

pairplot_cols = cont_feature_col + ['Churn']
sns.pairplot(df[pairplot_cols], hue='Churn', palette="husl", corner=True)

In [None]:
# Box plots of the continuous features, used to look for outliers.
sns.set(style="whitegrid", font_scale=1)
plt.figure(figsize=(10, 8))
cont_values = df[cont_feature_col]
sns.boxplot(data=cont_values)
plt.xticks(rotation=80)
plt.title("Box plot ")
plt.show()

In [None]:
df.describe()

In [None]:
# Flag IQR outliers in the continuous features and summarise them per column.
# A value counts as an outlier when it lies more than 1.5 * IQR below the
# first quartile or above the third quartile of its column.
q1 = df[cont_feature_col].quantile(.25)
q3 = df[cont_feature_col].quantile(.75)
IQR = q3 - q1

lower_fence = q1 - 1.5 * IQR
upper_fence = q3 + 1.5 * IQR

# Boolean frame with the same shape as df[cont_feature_col]; True = outlier.
outliers_df = (df[cont_feature_col] < lower_fence) | (df[cont_feature_col] > upper_fence)

# Summing booleans counts the True cells. A column without any outlier simply
# sums to 0, so no try/except is needed to handle that case.
n_rows = len(outliers_df)
total_outlier = [int(count) for count in outliers_df.sum()]
outlier_list = [(count / n_rows) * 100 if n_rows else 0 for count in total_outlier]

# Per-column summary: absolute number and percentage of outliers.
outlier_df = pd.DataFrame(
    {'total': total_outlier, 'outlier(%)': outlier_list},
    index=pd.Index(list(outliers_df.columns), name='name of the column'),
)
outlier_df

In [None]:
outliers_df

In [None]:
# Keep only the continuous columns and turn every cell flagged in
# outliers_df into NaN; all other cells keep their value.
df_cont = df[cont_feature_col]
out_nan_df = df_cont.where(~outliers_df)
out_nan_df

In [None]:
# Replace the NaN placeholders (the former outliers) with the mean of their
# column. The means come from df, i.e. from the values before masking.
column_means = {col: df[col].mean() for col in cont_feature_col}
out_nan_df = out_nan_df.fillna(value=column_means)

In [None]:
out_nan_df

In [None]:
# Everything except the continuous features (cont_feature_col): the
# categorical columns plus the Churn target.
deneme = df.drop(columns=cont_feature_col)

In [None]:
# Put the outlier-cleaned continuous columns back next to the remaining
# columns (side by side, aligned on the row index).
cleaned_parts = [out_nan_df, deneme]
df = pd.concat(cleaned_parts, axis="columns")

In [None]:
df

# CLASSIFICATION

In [None]:
#import sklearn methods
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, CategoricalNB
from sklearn.svm import LinearSVC,SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import learning_curve
import sys 
import os

In [None]:
# Separate the target (as a numpy array) from the feature frame.
y = df.loc[:, 'Churn'].values
X = df.drop('Churn', axis=1)

# Stratified 80/20 train/test split. random_state is fixed (same seed as the
# splitters below) so the split, and every score computed from it, is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=88
)

# Validation scheme 1: stratified shuffle-split cross-validation with 5 splits.
cv = StratifiedShuffleSplit(n_splits=5, random_state=88)

# Validation scheme 2: a single stratified hold-out split (25% for validation).
hold_out = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=88)

In [None]:
X

In [None]:
y

In [None]:
# Row counts of the train and test partitions.
print(f"X_train size is {len(X_train)}")
print(f"y_train size is {len(y_train)}")
print("-" * 20)
print(f"X_test size is {len(X_test)}")
print(f"y_test size is {len(y_test)}")

# Normalization

In [None]:
#normalization(make all values bet. 0-1)

from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: the scaler is fitted on the training split only and the
# same fitted scaler is then applied to the test split.
scaler = MinMaxScaler()
feature_names = list(X.columns)

X_train_normalized_arr = scaler.fit_transform(X_train)
X_train_normalized_df = pd.DataFrame(X_train_normalized_arr, columns=feature_names)

X_test_normalized_arr = scaler.transform(X_test)
X_test_normalized_df = pd.DataFrame(X_test_normalized_arr, columns=feature_names)

In [None]:
X_train_normalized_df

In [None]:
X_test_normalized_df

In [None]:
# Row counts of the scaled train and test frames.
print(f"X_train_normalized_df size is {len(X_train_normalized_df)}")
print("-" * 34)
print(f"X_test_normalized_df size is {len(X_test_normalized_df)}")

# feature importances

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the scaled training data and plot its feature
# importances. random_state is fixed so the importances (and therefore the
# chart) are the same on every run.
rf = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=88)
rf.fit(X_train_normalized_df, y_train)
rf_y_pred = rf.predict(X_test_normalized_df)

# Up to 15 largest importances, labelled with the feature names.
importances = pd.Series(rf.feature_importances_, index=X_train_normalized_df.columns)
importances.nlargest(15).plot(kind='pie',
                              figsize=(8, 8),
                              title='Feature importance from RandomForest',
                              colormap='magma')

The helper below formats the test scores (accuracy, AUC, confusion matrix and classification report) for a set of predictions.

In [None]:
def display_test_scores(test, pred):
    """Build a text report of classification scores for a set of predictions.

    Args:
        test: true labels.
        pred: predicted labels, aligned with ``test``.

    Returns:
        A ``(report, false_indexes)`` pair. ``report`` is the formatted string
        (accuracy, AUC, confusion matrix, classification report);
        ``false_indexes`` is the result of ``np.where(test != pred)``, i.e.
        the positions where prediction and truth disagree.
    """
    # Compute every metric first, then assemble the report in one go.
    accuracy = accuracy_score(test, pred)
    auc = roc_auc_score(test, pred)
    conf_mat = confusion_matrix(test, pred)
    report = classification_report(test, pred)

    sections = [
        "\n",
        "#####  TEST SCORES  #####\n--------------------",
        "\n",
        "ACCURACY: {:.4f}\n".format(accuracy),
        "\n",
        "AUC: {:.4f}\n".format(auc),
        "\n",
        "CONFUSION MATRIX:\n--------------------\n",
        "{}".format(conf_mat),
        "\n",
        "\n--------------------\n",
        "{}".format(report),
    ]
    str_out = "".join(sections)

    false_indexes = np.where(test != pred)
    return str_out, false_indexes

# **Classifier #1: Decision Tree (Gini criterion)**

In [None]:
# Decision tree split on Gini impurity, tuned by grid search with the
# 5-split cross-validation scheme.
dt_1 = DecisionTreeClassifier(criterion="gini", random_state=0)

# Hyperparameter grid (also reused by the hold-out search in the next cell).
parameters = dict(
    splitter=["best", "random"],
    class_weight=[None, "balanced"],
    max_depth=[9, 11, 13, 15, 17, None],
)

# The timer covers both the search and the prediction on the test set.
start_time = time.time()

grid_1 = GridSearchCV(dt_1, parameters, cv=cv, n_jobs=-1)
grid_1.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_1.best_params_, grid_1.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_1.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test cv (DT-gini): ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

### Hold-out: DT1

In [None]:
# Same Gini tree and the `parameters` grid from the previous cell, but the
# candidates are validated on a single hold-out split instead of 5-split CV.
start_time = time.time()

grid_1_h = GridSearchCV(dt_1, parameters, cv=hold_out, n_jobs=-1)
grid_1_h.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_1_h.best_params_, grid_1_h.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_1_h.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test hold_out (DT-gini): ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)


# **Classifier #2: Decision Tree (entropy criterion)**

In [None]:
# Decision tree using the "entropy" split criterion, tuned by grid search
# with the 5-split cross-validation scheme.
dt_2 = DecisionTreeClassifier(criterion="entropy", random_state=0)

# Hyperparameter grid (also reused by the hold-out search in the next cell).
parameters = dict(
    splitter=["best", "random"],
    class_weight=[None, "balanced"],
    max_depth=[11, 13, 15, 17, 19, 21, None],
)

# The timer covers both the search and the prediction on the test set.
start_time = time.time()

grid_2 = GridSearchCV(dt_2, parameters, cv=cv, n_jobs=-1)
grid_2.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_2.best_params_, grid_2.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_2.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test cv (DT-gain ratio): ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

### Hold-out: DT2

In [None]:
# Same entropy tree and the `parameters` grid from the previous cell, but the
# candidates are validated on a single hold-out split instead of 5-split CV.
start_time = time.time()

grid_2_h = GridSearchCV(dt_2, parameters, cv=hold_out, n_jobs=-1)
grid_2_h.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_2_h.best_params_, grid_2_h.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_2_h.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test hold_out (DT-gain ratio): ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

# **Classifier #3: Naive Bayes**

In [None]:
# Compare three Naive Bayes variants under the 5-split cross-validation.
# After the loop, `grid_3` refers to the search of the LAST variant in the list.
nb_list = [GaussianNB(), MultinomialNB(), ComplementNB()]

for nb in nb_list:
    print("*********", str(nb), "**********")
    # No hyperparameters are tuned: the empty grid evaluates the estimator
    # as-is with the chosen validation scheme.
    parameters = {}

    start_time = time.time()
    grid_3 = GridSearchCV(estimator=nb, param_grid=parameters, cv=cv, n_jobs=-1)
    grid_3.fit(X_train_normalized_df, y_train)

    # print best scores
    print("The best parameters are %s with a score of %0.4f\n"
          % (grid_3.best_params_, grid_3.best_score_))

    # Predictions of THIS Naive Bayes variant on the test set.
    y_ord_pred = grid_3.predict(X_test_normalized_df)

    end_time = time.time()
    print("\nRun time for train&test cv{}: ".format(str(nb)), end_time - start_time)

    # Score the predictions just computed. Scoring `y_pred` here would report
    # the metrics of whichever model an earlier cell evaluated last.
    results, false = display_test_scores(y_test, y_ord_pred)
    print("\n>>>>>>>>><<<<<<<<<<>>>>>>>>>>><<<<<<<<<<<<<>>>>>>>>><<<<<<<<\n")
    print(results)
    

### Hold-out: NB

In [None]:
# Same three Naive Bayes variants, validated on a single hold-out split.
for nb in nb_list:
    print("*********", str(nb), "**********")
    # No hyperparameters are tuned: the empty grid evaluates the estimator
    # as-is with the chosen validation scheme.
    parameters = {}

    start_time = time.time()
    grid_3_h = GridSearchCV(estimator=nb, param_grid=parameters, cv=hold_out, n_jobs=-1)
    grid_3_h.fit(X_train_normalized_df, y_train)

    # print best scores
    print("The best parameters are %s with a score of %0.4f\n"
          % (grid_3_h.best_params_, grid_3_h.best_score_))

    # Predictions of THIS Naive Bayes variant on the test set.
    y_ord_pred = grid_3_h.predict(X_test_normalized_df)

    end_time = time.time()
    print("\nRun time for train&test hold_out{}: ".format(str(nb)), end_time - start_time)

    # Score the predictions just computed. Scoring `y_pred` here would report
    # the metrics of whichever model an earlier cell evaluated last.
    results, false = display_test_scores(y_test, y_ord_pred)
    print("\n>>>>>>>>><<<<<<<<<<>>>>>>>>>>><<<<<<<<<<<<<>>>>>>>>><<<<<<<<\n")
    print(results)
    

# Classifier #4: ANN - 1 layer

In [None]:
# MLP with a single hidden layer; layer width and L2 penalty (alpha) are
# tuned by grid search with the 5-split cross-validation scheme.
ann_1 = MLPClassifier(
    activation='tanh',
    solver='adam',
    batch_size=256,
    max_iter=1000,
    tol=1e-5,
    random_state=0,
)

# Hyperparameter grid (also reused by the hold-out search in the next cell).
parameters = dict(
    hidden_layer_sizes=[(10,), (50,), (100,)],
    alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

# The timer covers both the search and the prediction on the test set.
start_time = time.time()

grid_4 = GridSearchCV(ann_1, parameters, cv=cv, n_jobs=-1)
grid_4.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}\n".format(
    grid_4.best_params_, grid_4.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_4.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test cv NN-1 layer: ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

### Hold-out: NN-1

In [None]:
# Same one-layer MLP and the `parameters` grid from the previous cell, but
# the candidates are validated on a single hold-out split.
start_time = time.time()

grid_4_h = GridSearchCV(ann_1, parameters, cv=hold_out, n_jobs=-1)
grid_4_h.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_4_h.best_params_, grid_4_h.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_4_h.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test hold_out NN-1 layer: ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)


# Classifier #5: ANN - 2 layer

In [None]:
# MLP with two hidden layers of equal width; width and L2 penalty (alpha)
# are tuned by grid search with the 5-split cross-validation scheme.
nn_2 = MLPClassifier(
    activation='tanh',
    solver='adam',
    batch_size=256,
    max_iter=1000,
    tol=1e-5,
    random_state=0,
)

# Hyperparameter grid (also reused by the hold-out search in the next cell).
parameters = dict(
    hidden_layer_sizes=[(10, 10), (50, 50), (100, 100)],
    alpha=[0.0001, 0.001, 0.01, 0.1, 1, 10],
)

# The timer covers both the search and the prediction on the test set.
start_time = time.time()

grid_5 = GridSearchCV(nn_2, parameters, cv=cv, n_jobs=-1)
grid_5.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}\n".format(
    grid_5.best_params_, grid_5.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_5.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test cv NN-2 layer: ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

### Hold-out: NN-2

In [None]:
# Same two-layer MLP and the `parameters` grid from the previous cell, but
# the candidates are validated on a single hold-out split.
start_time = time.time()

grid_5_h = GridSearchCV(nn_2, parameters, cv=hold_out, n_jobs=-1)
grid_5_h.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}".format(
    grid_5_h.best_params_, grid_5_h.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_5_h.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test hold_out NN-2 layer: ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)


# **Classifier #6: SVM**

In [None]:
# Support vector classifier; kernel, C, iteration cap and class weighting
# are tuned by grid search with the 5-split cross-validation scheme.
svm = SVC(tol=1e-5)

# Hyperparameter grid (also reused by the hold-out search in the next cell).
parameters = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    max_iter=[100, 300, 800, 1000, 1200],
    class_weight=[None, 'balanced'],
)

# The timer covers both the search and the prediction on the test set.
start_time = time.time()

grid_6 = GridSearchCV(svm, parameters, cv=cv, n_jobs=-1)
grid_6.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}\n".format(
    grid_6.best_params_, grid_6.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_6.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test cv SVM : ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

### Hold-out: SVM

In [None]:
# Same SVC and the `parameters` grid from the previous cell, but the
# candidates are validated on a single hold-out split.
start_time = time.time()

grid_6_h = GridSearchCV(svm, parameters, cv=hold_out, n_jobs=-1)
grid_6_h.fit(X_train_normalized_df, y_train)

print("The best parameters are {} with a score of {:0.4f}\n".format(
    grid_6_h.best_params_, grid_6_h.best_score_))

# Predict the test set with the best model found by the search.
y_pred = grid_6_h.predict(X_test_normalized_df)

end_time = time.time()
print("\nRun time for train&test hold_out SVM : ", end_time - start_time)

# Test-set metrics.
results, false = display_test_scores(y_test, y_pred)
print(results)

# Classifier #7: Bagging

In [None]:
# Bagging timer; it is read again in the aggregation cell further down.
start_time = time.time()

# Draw 6 bootstrap samples of training-row index LABELS: each sample has the
# size of the training set and is drawn with replacement. The generator is
# seeded so the resamples are reproducible.
indexes = X_train.index.values
boot_rng = np.random.RandomState(88)
rep = np.array([boot_rng.choice(indexes, len(indexes), replace=True) for _ in range(6)])

# `indexes` holds index labels, so rows are selected with .loc (label-based)
# and the target is addressed by name instead of "last column". This stays
# correct even if df is re-indexed or its columns are reordered.
rep_x_train = [df.loc[arr].drop(columns='Churn') for arr in rep]
rep_y_train = [df.loc[arr, 'Churn'] for arr in rep]

# Displayed as the cell output: duplicated-value flags for the first
# bootstrap target sample.
rep_y_train[0].duplicated()

In [None]:
indexes

In [None]:
rep

In [None]:
from sklearn.base import clone

# Bagging members: one tuned model per bootstrap sample, in this order:
# Gini tree (cv), entropy tree (cv), Naive Bayes (cv), 1-layer MLP (cv),
# 2-layer MLP (cv), SVM (hold-out).
# NOTE(review): grid_3 is whatever the Naive Bayes loop assigned last.
# NOTE(review): the grid searches were run on the MinMax-scaled features,
# whereas the bootstrap samples and X_test used here are unscaled — confirm
# that this is intended.
tuned_searches = [grid_1, grid_2, grid_3, grid_4, grid_5, grid_6_h]

bagged_models = []
preds = []
for search, boot_X, boot_y in zip(tuned_searches, rep_x_train, rep_y_train):
    # clone() returns an unfitted copy carrying the tuned hyperparameters, so
    # the fitted search.best_estimator_ is not overwritten by this refit.
    member = clone(search.best_estimator_)
    member.fit(boot_X, boot_y)
    bagged_models.append(member)
    preds.append(member.predict(X_test))

# Keep the per-model names available for later cells.
model_1, model_2, model_3, model_4, model_5, model_6 = bagged_models
pred_1, pred_2, pred_3, pred_4, pred_5, pred_6 = preds


In [None]:
# Combine the bagged models: average their predictions per test sample, then
# round the average to obtain the ensemble's class label.
arr_preds = np.array(preds)
arr_preds_mean = np.mean(arr_preds, axis=0)

end_time = time.time()
print("\nRun time for train&test bagging: ", end_time - start_time)

# Sanity check: one averaged prediction per test sample.
print(len(arr_preds_mean))

# Test-set metrics of the ensemble.
ensemble_labels = arr_preds_mean.round()
results, false = display_test_scores(y_test, ensemble_labels)
print(results)

# Classifier #8: Boosting

- https://www.analyticsvidhya.com/blog/2018/06/comprehensive-guide-for-ensemble-models/
- https://www.python-course.eu/Boosting.php#Boosting-Pseudocode

In [None]:
def boosting_step(grid, initial_weight, X_train, y_train):
    """Run one AdaBoost-style stage with the best estimator of a grid search.

    The estimator ``grid.best_estimator_`` is refit (in place) on the training
    data using ``initial_weight`` as per-sample weights. Its weighted training
    error gives the stage weight ``alpha = log((1 - err) / err)``; samples the
    stage misclassifies get their weight multiplied by ``exp(alpha)``.

    Args:
        grid: object exposing ``best_estimator_``; the estimator's ``fit``
            must accept a ``sample_weight`` argument.
        initial_weight: scalar (same weight for all samples) or a
            per-sample sequence/Series of weights for this stage.
        X_train: training features.
        y_train: training labels (1-D array-like).

    Returns:
        ``(prediction, weights)``: ``prediction`` is the stage's training-set
        predictions multiplied by ``alpha``; ``weights`` is the updated
        per-sample weight Series to pass to the next stage.
    """
    # Keeps err strictly inside (0, 1). Without it a stage that classifies the
    # training set perfectly (err == 0) yields alpha == inf, and inf * 0 then
    # turns the weights and predictions into NaN.
    eps = 1e-10

    # Working table: one row per training sample.
    df_eval = pd.DataFrame(y_train, columns=["target"])
    df_eval['weights'] = initial_weight

    model = grid.best_estimator_
    model.fit(X_train, y_train, sample_weight=np.array(df_eval['weights']))

    df_eval['predictions'] = model.predict(X_train)
    df_eval['misclassified'] = np.where(df_eval['predictions'] != df_eval['target'], 1, 0)

    # Weighted misclassification rate of this stage.
    err = np.sum(df_eval['weights'] * df_eval['misclassified']) / np.sum(df_eval['weights'])
    err = np.clip(err, eps, 1 - eps)

    # Stage weight: the lower the error, the larger alpha.
    alpha = np.log((1 - err) / err)

    # Up-weight the misclassified samples for the next stage; correctly
    # classified samples keep their weight (exp(0) == 1).
    df_eval['weights'] *= np.exp(alpha * df_eval['misclassified'])

    prediction = alpha * df_eval["predictions"]

    return prediction, df_eval['weights']

In [None]:
# Chain the boosting stages: the first stage starts from uniform weights
# w = 1/N, every later stage from the weights returned by the previous one.
start_time = time.time()

uniform_weight = 1 / len(y_train)
pred_1, w_1 = boosting_step(grid_1, uniform_weight, X_train, y_train)
pred_2, w_2 = boosting_step(grid_2, w_1, X_train, y_train)
pred_3, w_3 = boosting_step(grid_3, w_2, X_train, y_train)
# The two MLP searches (grid_4, grid_5) are not part of the chain, so the
# SVM stage continues from w_3.
pred_6, w_6 = boosting_step(grid_6_h, w_3, X_train, y_train)

# Average the four alpha-scaled stage predictions and threshold at 0.5.
stage_preds = [pred_1, pred_2, pred_3, pred_6]
pred_final = sum(stage_preds) / 4
pred_final = np.where(pred_final >= 0.5, 1, 0)

end_time = time.time()
print("\nRun time for train&test boosting: ", end_time - start_time)

# NOTE(review): boosting_step predicts on X_train and the comparison below is
# against y_train, so these are training-set scores — confirm this is intended.
results, false = display_test_scores(y_train, pred_final)
print(results)