In [2]:
import pandas as pd  # to load and manipulate data for One-Hot-Encoding
import numpy as np  # calculate mean & std
import xgboost as xgb 
from sklearn.model_selection import train_test_split  # to split data in training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import plot_confusion_matrix 
import matplotlib.pyplot as plt 

## import data

In [10]:
# Load the raw Telco customer churn export.
df = pd.read_csv("Telco_customer_churn.csv")

# These columns should not be used in the model.
df = df.drop(columns=["Churn Label", "Churn Score", "CLTV", "Churn Reason"])

# Useless columns:
#   - Count, Country, State contain only a single value
#   - CustomerID is a random value
#   - Lat Long is a combination of two other columns (Latitude, Longitude)
df = df.drop(columns=["Count", "Country", "State", "CustomerID", "Lat Long"])

# Replace whitespace in city names with underscores. XGBoost itself does not
# require this (one-hot encoding takes care of it), but whitespace-free names
# are needed to draw the decision tree.
# NOTE: assign the result back instead of calling
# `df["City"].replace(..., inplace=True)`; the chained in-place form operates
# on a temporary under pandas Copy-on-Write and would leave `df` unchanged.
df["City"] = df["City"].replace(" ", "_", regex=True)

# Remove whitespace from column names as well.
df.columns = df.columns.str.replace(" ", "_")

df.head()

Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
0,Los_Angeles,90003,33.964131,-118.272783,Male,No,No,No,2,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
1,Los_Angeles,90005,34.059281,-118.30742,Female,No,No,Yes,2,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1
2,Los_Angeles,90006,34.048013,-118.293953,Female,No,No,Yes,8,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,1
3,Los_Angeles,90010,34.062125,-118.315709,Female,No,Yes,Yes,28,Yes,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,1
4,Los_Angeles,90015,34.039224,-118.266293,Male,No,No,Yes,49,Yes,...,Yes,No,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,1


## identifying missing data
* ### XGBoost has built-in default behavior for missing data, so all we need to do is identify the missing values and set them to 0

In [4]:
# Inspect the dtype of every column to spot any that were not parsed as expected.
df.dtypes

City                  object
Zip_Code               int64
Latitude             float64
Longitude            float64
Gender                object
Senior_Citizen        object
Partner               object
Dependents            object
Tenure_Months          int64
Phone_Service         object
Multiple_Lines        object
Internet_Service      object
Online_Security       object
Online_Backup         object
Device_Protection     object
Tech_Support          object
Streaming_TV          object
Streaming_Movies      object
Contract              object
Paperless_Billing     object
Payment_Method        object
Monthly_Charges      float64
Total_Charges         object
Churn_Value            int64
dtype: object

* ### Long story short: every column looks fine except Total_Charges, which is `object` although it should be numeric

In [5]:
# There are too many unique values to inspect by eye; casting the column to a
# numeric dtype is a quicker way to check whether it contains only numbers.
df["Total_Charges"].unique()

# The cast below raises an error. The stack trace leads to the conclusion that
# the column contains blank-space strings that cannot be parsed as numbers.
# df["Total_Charges"] = pd.to_numeric(df["Total_Charges"])

array(['108.15', '151.65', '820.5', ..., '7362.9', '346.45', '6844.5'],
      dtype=object)

## dealing with missing data
* ### XGBoost style: convert missing data to 0's

In [8]:
# A Total_Charges value of exactly one blank character marks a missing entry.
# Select those rows once and reuse the selection for the counts and the display.
blank_total_charges = df.loc[df["Total_Charges"] == " "]
n_blank = len(blank_total_charges)

print("missing data: ", n_blank)
print("missing data in %: ", n_blank / len(df) * 100)
blank_total_charges

missing data:  11
missing data in %:  0.1561834445548772


Unnamed: 0,City,Zip_Code,Latitude,Longitude,Gender,Senior_Citizen,Partner,Dependents,Tenure_Months,Phone_Service,...,Device_Protection,Tech_Support,Streaming_TV,Streaming_Movies,Contract,Paperless_Billing,Payment_Method,Monthly_Charges,Total_Charges,Churn_Value
2234,San_Bernardino,92408,34.084909,-117.258107,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,0
2438,Independence,93526,36.869584,-118.189241,Male,No,No,No,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,0
2568,San_Mateo,94401,37.590421,-122.306467,Female,No,Yes,No,0,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,0
2667,Cupertino,95014,37.306612,-122.080621,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,0
2856,Redcrest,95569,40.363446,-123.835041,Female,No,Yes,No,0,No,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,0
4331,Los_Angeles,90029,34.089953,-118.294824,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,0
4687,Sun_City,92585,33.739412,-117.173334,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,0
5104,Ben_Lomond,95005,37.078873,-122.090386,Female,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,0
5719,La_Verne,91750,34.144703,-117.770299,Male,No,Yes,Yes,0,Yes,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,0
6772,Bell,90201,33.970343,-118.171368,Female,No,Yes,Yes,0,Yes,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,0


In [12]:
# Overwrite the blank-string placeholders with 0.
is_blank = df["Total_Charges"] == " "
df.loc[is_blank, "Total_Charges"] = 0

# The column is still of object dtype at this point; XGBoost only accepts
# int, float or boolean columns, so cast it to a numeric dtype.
total_charges_numeric = pd.to_numeric(df["Total_Charges"])
df["Total_Charges"] = total_charges_numeric

# The blanks occurred where Tenure_Months is 0; display those rows to confirm
# that the data is fine now.
zero_tenure = df["Tenure_Months"] == 0
df.loc[zero_tenure, ["Tenure_Months", "Total_Charges"]]

Unnamed: 0,Tenure_Months,Total_Charges
2234,0,0
2438,0,0
2568,0,0
2667,0,0
2856,0,0
4331,0,0
4687,0,0
5104,0,0
5719,0,0
6772,0,0


In [None]:
# Swap every blank character inside string values for an underscore,
# across all columns of the dataframe (modifies df in place).
df.replace(to_replace=" ", value="_", regex=True, inplace=True)

## split data