In [2]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-win_amd64.whl (125.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.1


In [3]:
%matplotlib inline  
# To make data visualisations display in Jupyter Notebooks 
import numpy as np   # linear algebra
import pandas as pd  # Data processing, Input & Output load
import matplotlib.pyplot as plt # Visuvalization & plotting
import seaborn as sns
import datetime  

import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier  #Ada algorithm

from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from xgboost.sklearn import XGBClassifier # Extrame GB
from xgboost import plot_importance ## Plotting Importance Variables 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
                                    # GridSearchCV - Implements a “fit” and a “score” method
                                    # train_test_split - Split arrays or matrices into random train and test subsets
                                    # cross_val_score - Evaluate a score by cross-validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, make_scorer, accuracy_score, roc_curve, confusion_matrix, classification_report
                                    # Different metrics to evaluate the model
#import pandas_profiling as pp   # simple and fast exploratory data analysis of a Pandas Datafram

import warnings   # To avoid warning messages in the code run
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder # Labeling the columns with 0 & 1

In [None]:
# Load the churn dataset from a CSV file in the working directory and preview the first rows.
Tdata = pd.read_csv(r"churn.csv")
Tdata.head()

In [None]:
# Report the dimensions of the loaded frame.
n_rows, n_cols = Tdata.shape
print(f"Rows     :  {n_rows}")
print(f"Columns  :  {n_cols}")


In [None]:
def df_summary(df):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``index`` (the column
        name), ``Missing Data`` (null count), ``Unique Data`` (distinct
        non-null values) and ``Data Types`` (dtype).
    """
    # Each statistic is a Series indexed by column name, paired with its label.
    labelled_stats = [
        (df.isnull().sum(), 'Missing Data'),
        (df.nunique(), 'Unique Data'),
        (df.dtypes, 'Data Types'),
    ]

    # Turn every Series into a two-column frame: 'index' plus the labelled value.
    frames = [
        stat.to_frame().reset_index().rename(columns={0: label})
        for stat, label in labelled_stats
    ]

    # Join the pieces left-to-right on the shared 'index' column.
    summary = frames[0]
    for frame in frames[1:]:
        summary = pd.merge(summary, frame, on='index')
    return summary

In [None]:
# Show missing-value counts, distinct-value counts and dtypes for every column.
df_summary(Tdata)

In [None]:
# Preview the first few TotalCharges values.
Tdata.TotalCharges.head()

* The `TotalCharges` variable appears to be wrongly classified as an object data type.
* We will convert it to a numeric type.

In [None]:
# Parse TotalCharges as numbers; values that cannot be parsed become NaN.
total_charges = pd.to_numeric(Tdata["TotalCharges"], errors="coerce")
Tdata["TotalCharges"] = total_charges
Tdata["TotalCharges"].describe()

In [None]:
# Show the per-column summary again to check dtypes and missing-value counts.
df_summary(Tdata)

In [None]:
# Fill the missing TotalCharges values with the column mean.
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and, under pandas
# copy-on-write, leaves Tdata unchanged.
Tdata['TotalCharges'] = Tdata['TotalCharges'].fillna(Tdata['TotalCharges'].mean())
Tdata['TotalCharges'].isnull().sum()

* As discussed above, we will drop the customer ID column since it does not carry much information for prediction.

In [None]:
# Drop the identifier column. Re-assigning (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run once the column is gone;
# the in-place version raised KeyError on a second execution.
Tdata = Tdata.drop(columns='customerID', errors='ignore')
Tdata.columns

# Let's separate the numerical columns and the object columns

In [None]:
# Partition the column names by dtype: float64/int64 columns vs. object columns.
Num_cols = list(Tdata.select_dtypes(include=['float64', 'int64']).columns)
Cat_cols = list(Tdata.select_dtypes(include=['object']).columns)
print("Number columns : ", Num_cols, "Catogarical columns :", Cat_cols, sep="\n")

In [None]:
# Count distinct values once per categorical column, then split the column
# names into those with exactly two levels and those with more than two.
cat_levels = Tdata[Cat_cols].nunique()
Binary_class = cat_levels[cat_levels == 2].keys().tolist()
Multi_class = cat_levels[cat_levels > 2].keys().tolist()
print(Binary_class)
print(Multi_class)

# Transforming the data

In [None]:
# Label-encode every two-level column, replacing it in Tdata with integer codes.
# The same encoder object is re-fitted for each column in turn.
le = LabelEncoder()
for col in Binary_class:
    Tdata[col] = le.fit_transform(Tdata[col])

In [None]:
# Check the dimensions of the binary-column subset.
Tdata[Binary_class].shape

In [None]:
# One-hot encode the multi-class categorical columns into dummy indicator columns.
Tdata_Dummy = pd.get_dummies(Tdata[Multi_class])
Tdata_Dummy.head()

In [None]:
# Assemble the modelling frame column-wise from the numeric columns, the
# label-encoded binary columns and the dummy columns.
feature_parts = [Tdata[Num_cols], Tdata[Binary_class], Tdata_Dummy]
New_df = pd.concat(feature_parts, axis='columns')
New_df.shape

In [None]:
import plotly.express as px

# 'Churn' is one of the label-encoded columns, so it holds integer codes by
# the time it reaches New_df. A colour map keyed on 'Yes'/'No' therefore never
# matched and the intended red/green colours were not applied.
# Translate the codes back to labels for plotting only. LabelEncoder assigns
# codes in sorted class order, so 0 -> 'No' and 1 -> 'Yes' assumes the raw
# labels were 'No'/'Yes' (TODO confirm against the raw data). replace() leaves
# any value that is not 0 or 1 untouched.
churn_labels = New_df['Churn'].replace({0: 'No', 1: 'Yes'}).to_frame()

fig = px.pie(churn_labels, names='Churn', color='Churn',
             color_discrete_map={'Yes': 'red',
                                 'No': 'green'})
fig.show()

# Data Partition

In [None]:
# Separate the predictors from the target, then hold out 20% of the rows for testing.
y = New_df[["Churn"]]              # kept as a one-column DataFrame
X = New_df.drop(columns="Churn")

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [None]:
# Report how many rows landed in each partition.
print(f'The number of samples into the Train data is {x_train.shape[0]}.')
print(f'The number of samples into the test data is {x_test.shape[0]}.')

# Model 3 - XGBOOST

In [None]:
# Hyper-parameter grid for the XGBoost search.
# 'min_samples_leaf' is a scikit-learn tree parameter that XGBoost does not
# recognise, so it tripled the search cost without changing any model.
# 'min_child_weight' is XGBoost's own control on how small a child node may be.
model_parameters = {
    'n_estimators': [50, 100, 200, 500],   # number of boosting rounds (trees)
    'max_depth': [3, 5, 10],               # maximum depth of each tree
    'min_child_weight': [1, 5, 10],        # minimum sum of instance weight (hessian) needed in a child
}

In [None]:
# Grid-search the hyper-parameters with 5-fold cross-validation, scored on accuracy.
# Note: reg_lambda is XGBoost's L2 regularisation weight, so reg_lambda=0
# switches the L2 penalty off rather than applying it.
model = XGBClassifier(reg_lambda=0)   # L2 regularisation disabled (lambda = 0)
gscv = GridSearchCV(estimator=model,
                    param_grid=model_parameters,
                    cv=5,               # number of cross-validation folds
                    verbose=1,
                    n_jobs=-1,          # use all available CPU cores
                    scoring='accuracy')

gscv.fit(x_train, y_train)  # run the search on the training split

In [None]:
# Show the parameter combination that the grid search selected as best.
print('The best parameter are -', gscv.best_params_)

In [None]:
# Take the model that GridSearchCV has already refit on the whole training
# split with the best parameters (refit=True is the default).
# Rebuilding it as XGBClassifier(**gscv.best_params_) silently dropped the
# reg_lambda=0 setting of the searched estimator, so the final model was not
# the configuration that the search actually evaluated.
final_mod = gscv.best_estimator_
final_mod

# Prediction

In [None]:
# Predict class labels for the training and test partitions with the final model.
train_pred = final_mod.predict(x_train)
test_pred = final_mod.predict(x_test)

In [None]:
# Put the actual target, the features and the model's prediction side by side for the training rows.
train=pd.concat([y_train,x_train],axis=1)
train['Predicted']=final_mod.predict(x_train)  # predictions from the final XGBoost model
train.head()

# Tree Plot

In [None]:
# Graphviz styling for the tree plot: one attribute set for condition (split)
# nodes and one for leaf nodes.
node_params = dict(shape='box', style='filled,rounded', fillcolor='#78cbec')
leaf_params = dict(shape='box', style='filled', fillcolor='#e48038')

In [None]:
# Render the tree at index 0 of the final model as a Graphviz diagram,
# styling condition nodes with node_params and leaf nodes with leaf_params.
xgb.to_graphviz(final_mod,num_trees=0,size="5,5",
               condition_node_params=node_params,
               leaf_node_params=leaf_params)

# Confusion Matrix Train Data (Model 3)

In [None]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name. Assigning it to `confusion_matrix`
# replaced the imported function with an array, so any later call to
# confusion_matrix(...) only worked after importing the function again.
cm_train = confusion_matrix(y_train, train_pred)
print(cm_train)

In [None]:
# Training accuracy in percent, computed from the predictions rather than from
# hand-copied confusion-matrix counts, which go stale whenever the data, the
# split or the model changes. Named *_train because it scores the training
# predictions (the previous name, Accuracy_test, was misleading).
Accuracy_train = accuracy_score(y_train, train_pred) * 100
Accuracy_train

# Confusion Matrix Test Data (Model 3)

In [None]:
# Importing here keeps the cell working even if the name `confusion_matrix`
# was rebound to an array earlier in the kernel session.
from sklearn.metrics import confusion_matrix

# Store the result under its own name so the imported function is not shadowed.
cm_test = confusion_matrix(y_test, test_pred)
print(cm_test)

In [None]:
# Test accuracy in percent, computed from the predictions rather than from
# hand-copied confusion-matrix counts, which go stale whenever the data, the
# split or the model changes.
Accuracy_test = accuracy_score(y_test, test_pred) * 100
Accuracy_test

In [None]:
# Print scikit-learn's classification report for the training predictions and for the test predictions.
print('Classification report for train data is : \n',
      classification_report(y_train, train_pred))
print('Classification report for test data is : \n',
      classification_report(y_test, test_pred))

# Finished