In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the churn dataset from Google Drive into `df` and preview the first 10 rows.
#
# The commented-out lines below show how a direct-download URL is derived from a
# Drive share link: the file id is the second-to-last "/"-separated segment of the
# share URL and is appended to the `uc?export=download&id=` endpoint.
# NOTE(review): the id in that example share link differs from the id actually
# loaded below - confirm which file is the intended dataset.
# orig_url = "https://drive.google.com/file/d/12fFZ9k8wsmWBVUhcsVxmKsqHxaVzAzqt/view?usp=sharing"
# file_id = orig_url.split('/')[-2]
# data_path='https://drive.google.com/uc?export=download&id=' + file_id

data_path='https://drive.google.com/uc?export=download&id=1A3MUldrs0z09DlYR6Y1utfySwKNO9Qsz'
df = pd.read_csv(data_path)
df.head(10)

Unnamed: 0.1,Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1869,7010-BRBUU,Male,0,Yes,Yes,72,Yes,Yes,No,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),24.1,1734.65,No
1,4528,9688-YGXVR,Female,0,No,No,44,Yes,No,Fiber optic,...,Yes,No,Yes,No,Month-to-month,Yes,Credit card (automatic),88.15,3973.2,No
2,6344,9286-DOJGF,Female,1,Yes,No,38,Yes,Yes,Fiber optic,...,No,No,No,No,Month-to-month,Yes,Bank transfer (automatic),74.95,2869.85,Yes
3,6739,6994-KERXL,Male,0,No,No,4,Yes,No,DSL,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,55.9,238.5,No
4,432,2181-UAESM,Male,0,No,No,2,Yes,No,DSL,...,Yes,No,No,No,Month-to-month,No,Electronic check,53.45,119.5,No
5,2215,4312-GVYNH,Female,0,Yes,No,70,No,No phone service,DSL,...,Yes,Yes,No,Yes,Two year,Yes,Bank transfer (automatic),49.85,3370.2,No
6,5260,2495-KZNFB,Female,0,No,No,33,Yes,Yes,Fiber optic,...,No,No,No,Yes,Month-to-month,Yes,Electronic check,90.65,2989.6,No
7,6001,4367-NHWMM,Female,0,No,No,1,No,No phone service,DSL,...,No,No,No,No,Month-to-month,Yes,Mailed check,24.9,24.9,No
8,1480,8898-KASCD,Male,0,No,No,39,No,No phone service,DSL,...,Yes,Yes,No,No,One year,No,Mailed check,35.55,1309.15,No
9,5137,8016-NCFVO,Male,1,No,No,55,Yes,Yes,Fiber optic,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,116.5,6382.55,No


In [10]:
# Normalise column dtypes before feature engineering.
# SeniorCitizen is stored as object so that it is picked up as a categorical
# feature by the dtype-based column selection further down.
df["SeniorCitizen"] = df["SeniorCitizen"].astype("object")
# TotalCharges uses a single blank (" ") as a placeholder in some rows; swap
# that exact value for "0" and then convert the whole column to float64.
df["TotalCharges"] = df["TotalCharges"].replace(" ", "0").astype("float64")
df.dtypes

Unnamed: 0            int64
customerID           object
gender               object
SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [11]:
# num_services: number of services each customer is subscribed to (0-9).
# These eight columns count only on an exact "Yes"; any other value
# (e.g. "No", "No internet service", "No phone service") contributes 0.
yes_no_services = [
    "PhoneService",
    "MultipleLines",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]
subscribed_flags = (df[yes_no_services] == "Yes").astype("int64")
# InternetService holds the connection type rather than Yes/No, so every value
# other than "No" counts as one service.
has_internet = (df["InternetService"] != "No").astype("int64")
df["num_services"] = has_internet + sum(
    subscribed_flags[service] for service in yes_no_services
)
df["num_services"]

0       2
1       5
2       3
3       3
4       4
       ..
5981    5
5982    9
5983    1
5984    6
5985    1
Name: num_services, Length: 5986, dtype: int64

In [12]:
# auto_payment: 1 when the payment method text contains "automatic", else 0.
# The vectorised `.str.contains` replaces a per-row lambda that raised TypeError
# on a missing PaymentMethod; with na=False a missing value is treated as
# "not automatic" (0). regex=False keeps this a plain substring match.
# The flag is cast to object so that it is treated as a categorical feature by
# dtype-based column selection.
df["auto_payment"] = (
    df["PaymentMethod"]
    .str.contains("automatic", regex=False, na=False)
    .astype("int64")
    .astype("object")
)

In [26]:
# Baseline feature matrix: drop the CSV index column, the customer identifier,
# the target, and the two engineered columns (auto_payment, num_services).
non_feature_columns = ['Unnamed: 0', 'Churn', "customerID", 'auto_payment', 'num_services']
X = df.drop(columns=non_feature_columns)
y = df['Churn']
# 70/30 split, stratified on the target, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4190, 19), (1796, 19), (4190,), (1796,))

In [14]:
# Split the training columns into categorical and numeric feature lists by dtype.
# 'number' selects every numeric dtype. The aliases 'int'/'float' resolve to the
# platform default integer/float types and can silently skip int64 columns where
# the default integer is 32-bit.
# NOTE(review): the saved output of this cell lists 'auto_payment' and
# 'num_services', which the X built from df.drop(...) above no longer contains -
# it appears to come from an earlier run; re-run the notebook top to bottom.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features, numeric_features

(['gender',
  'SeniorCitizen',
  'Partner',
  'Dependents',
  'PhoneService',
  'MultipleLines',
  'InternetService',
  'OnlineSecurity',
  'OnlineBackup',
  'DeviceProtection',
  'TechSupport',
  'StreamingTV',
  'StreamingMovies',
  'Contract',
  'PaperlessBilling',
  'PaymentMethod',
  'auto_payment'],
 ['tenure', 'MonthlyCharges', 'TotalCharges', 'num_services'])

In [27]:
# Variant dataset: keep the aggregate `num_services` count in place of the nine
# per-service columns; `auto_payment` and `TotalCharges` are dropped as well.
# NOTE(review): nothing here removes 'Churn', 'customerID' or 'Unnamed: 0', so
# this frame still carries the target and identifier columns (see the info()
# listing) - confirm they are dropped before it is used to fit a model.
individual_service_columns = [
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
X_num_services = df.drop(
    columns=individual_service_columns + ['auto_payment', 'TotalCharges']
)
X_num_services.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        5986 non-null   int64  
 1   customerID        5986 non-null   object 
 2   gender            5986 non-null   object 
 3   SeniorCitizen     5986 non-null   object 
 4   Partner           5986 non-null   object 
 5   Dependents        5986 non-null   object 
 6   tenure            5986 non-null   int64  
 7   Contract          5986 non-null   object 
 8   PaperlessBilling  5986 non-null   object 
 9   PaymentMethod     5986 non-null   object 
 10  MonthlyCharges    5986 non-null   float64
 11  Churn             5986 non-null   object 
 12  num_services      5986 non-null   int64  
dtypes: float64(1), int64(3), object(9)
memory usage: 608.1+ KB


In [23]:
# Variant dataset: drop the raw PaymentMethod column (the derived `auto_payment`
# flag stays in the frame), together with `num_services` and `TotalCharges`.
# NOTE(review): the saved info() output reports 18 columns starting at 'gender',
# i.e. without 'Unnamed: 0' and 'customerID', which this drop alone does not
# remove - presumably it was produced from a different state of df; re-run to
# confirm.
auto_variant_excluded = ['PaymentMethod', 'num_services', 'TotalCharges']
X_auto = df.drop(columns=auto_variant_excluded)
X_auto.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5986 entries, 0 to 5985
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            5986 non-null   object 
 1   SeniorCitizen     5986 non-null   object 
 2   Partner           5986 non-null   object 
 3   Dependents        5986 non-null   object 
 4   tenure            5986 non-null   int64  
 5   PhoneService      5986 non-null   object 
 6   MultipleLines     5986 non-null   object 
 7   InternetService   5986 non-null   object 
 8   OnlineSecurity    5986 non-null   object 
 9   OnlineBackup      5986 non-null   object 
 10  DeviceProtection  5986 non-null   object 
 11  TechSupport       5986 non-null   object 
 12  StreamingTV       5986 non-null   object 
 13  StreamingMovies   5986 non-null   object 
 14  Contract          5986 non-null   object 
 15  PaperlessBilling  5986 non-null   object 
 16  MonthlyCharges    5986 non-null   float64
