In [94]:
# Data wrangling:
import numpy as np
import pandas as pd

# Data Visualization:
import seaborn as sns
from yellowbrick.regressor import (
    residuals_plot,
    prediction_error
)

# Pipelines:
from sklearn.base import clone
from sklearn.pipeline import (
    Pipeline,
    make_pipeline
)
from sklearn.compose import (
    make_column_selector as selector,
    ColumnTransformer,
)

# Model selection and validation:
from sklearn import metrics
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV
)

# Pre-processing:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    OneHotEncoder,
)

# Models:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LassoCV,
    RidgeCV,
)
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
)

# Deployment:
import pickle

# Config:
from sklearn import set_config
set_config(transform_output='pandas')

from DataUnderstanding import DataUnderstanding

# Telco Customer Churn:

## DataFrame:

In [95]:
# Load the churn dataset. The separator and decimal mark are spelled out
# explicitly so the parsing assumptions are visible to the reader.
path = r'../data/customer_churn.csv'
df = pd.read_csv(path, sep=',', decimal='.')

In [96]:
# Preview the first three rows to confirm the file was parsed as expected.
display(df.head(3))

Unnamed: 0,CustomerID,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,CLTV,Churn Value
0,3668-QPYBK,33.964131,-118.272783,Male,No,No,No,2,Yes,No,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,3239,1
1,9237-HQITU,34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,2701,1
2,9305-CDSKC,34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,...,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,5372,1


## Código:

In [97]:
class TelcoCustomerChurn(DataUnderstanding):
    """Project-specific entry point for the data-understanding helpers.

    Adds nothing of its own: every method called on it in this notebook is
    inherited from ``DataUnderstanding``.
    """


# Instantiate unconditionally: the cells below call methods on
# ``telco_customer_churn`` without any guard, so hiding its creation behind
# ``if __name__ == "__main__"`` only risks a NameError when this code runs
# outside a ``__main__`` context. Inside a notebook kernel the guard was always
# true, so behaviour there is unchanged.
telco_customer_churn = TelcoCustomerChurn()

## Output:

### Data Understanding:

In [98]:
# First three rows again, opening the data-understanding section.
display(df.head(3))

Unnamed: 0,CustomerID,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,CLTV,Churn Value
0,3668-QPYBK,33.964131,-118.272783,Male,No,No,No,2,Yes,No,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,3239,1
1,9237-HQITU,34.059281,-118.30742,Female,No,No,Yes,2,Yes,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,2701,1
2,9305-CDSKC,34.048013,-118.293953,Female,No,No,Yes,8,Yes,Yes,...,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,5372,1


In [99]:
# Last three rows, to check the end of the file was read cleanly as well.
display(df.tail(3))

Unnamed: 0,CustomerID,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,CLTV,Churn Value
7029,2234-XADUH,34.559882,-115.637164,Female,No,Yes,Yes,72,Yes,Yes,...,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,5560,0
7030,4801-JZAZL,34.1678,-116.86433,Female,No,Yes,Yes,11,No,No phone service,...,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,2793,0
7031,3186-AJIEK,34.424926,-117.184503,Male,No,No,No,66,Yes,No,...,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,5097,0


In [100]:
# List every column label in the loaded frame.
df.columns

Index(['CustomerID', 'Latitude', 'Longitude', 'Gender', 'Senior Citizen',
       'Partner', 'Dependents', 'Tenure Months', 'Phone Service',
       'Multiple Lines', 'Internet Service', 'Online Security',
       'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV',
       'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method',
       'Monthly Charges', 'Total Charges', 'CLTV', 'Churn Value'],
      dtype='object')

In [101]:
# Report the frame's dimensions: the helper prints the row ("Linhas") and
# column ("Colunas") counts.
telco_customer_churn.verificando_as_dimensões_do_dataframe(
    dataframe=df
)

Linhas: 7032 
Colunas: 24


In [102]:
# Descriptive statistics for the numeric columns (count, mean, std, min,
# selected percentiles, max).
# NOTE(review): `vertical=True` presumably selects the one-row-per-column
# layout seen in the output — confirm in DataUnderstanding.
telco_customer_churn.descrição(
    dataframe=df,
    vertical=True
)

Unnamed: 0,count,mean,std,min,10%,15%,25%,50%,75%,85%,95%,99%,max
Latitude,7032.0,36.28,2.46,32.56,33.66,33.82,34.03,36.39,38.23,38.93,40.56,41.58,41.96
Longitude,7032.0,-119.8,2.16,-124.3,-122.48,-122.26,-121.82,-119.74,-118.04,-117.39,-116.76,-115.52,-114.19
Tenure Months,7032.0,32.42,24.55,1.0,2.0,4.0,9.0,29.0,55.0,65.0,72.0,72.0,72.0
Monthly Charges,7032.0,64.8,30.09,18.25,20.05,20.6,35.59,70.35,89.86,98.6,107.42,114.73,118.75
Total Charges,7032.0,2283.3,2266.77,18.8,84.6,164.56,401.45,1397.48,3794.74,5200.15,6923.59,8039.88,8684.8
CLTV,7032.0,4401.45,1182.41,2003.0,2613.0,2898.0,3469.75,4527.5,5381.0,5692.0,6087.0,6424.69,6500.0
Churn Value,7032.0,0.27,0.44,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [103]:
# Null-value check: the helper tabulates, per column, the count
# ("Quantidade") and percentage ("Porcentagem") of missing values.
telco_customer_churn.verificando_valores_nulos(
    dataframe=df
)

Unnamed: 0,Quantidade,Porcentagem
CustomerID,0,0.00%
Latitude,0,0.00%
Longitude,0,0.00%
Gender,0,0.00%
Senior Citizen,0,0.00%
Partner,0,0.00%
Dependents,0,0.00%
Tenure Months,0,0.00%
Phone Service,0,0.00%
Multiple Lines,0,0.00%


In [104]:
# Duplicate-value check: the helper tabulates, per column, the count
# ("Quantidade") and percentage ("Porcentagem") of repeated values.
telco_customer_churn.verificando_valores_duplicados(
    dataframe=df
)

Unnamed: 0,Quantidade,Porcentagem
CustomerID,0,0.00%
Latitude,5380,76.51%
Longitude,5381,76.52%
Gender,7030,99.97%
Senior Citizen,7030,99.97%
Partner,7030,99.97%
Dependents,7030,99.97%
Tenure Months,6960,98.98%
Phone Service,7030,99.97%
Multiple Lines,7029,99.96%


In [105]:
# Data-type check: the helper tabulates the dtype ("Tipos") of each column.
telco_customer_churn.verificando_tipos(
    dataframe=df
)

Unnamed: 0,Tipos
CustomerID,object
Latitude,float64
Longitude,float64
Gender,object
Senior Citizen,object
Partner,object
Dependents,object
Tenure Months,int64
Phone Service,object
Multiple Lines,object


In [106]:
# Split the column labels by dtype: object columns are treated as categorical,
# int64/float64 columns as numeric.
categorical_columns = df.select_dtypes(include=['object']).columns
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns

# Print each group under its heading; the second heading starts with a blank
# line to separate the two listings.
print("Colunas Categóricas:", categorical_columns, sep="\n")
print("\nColunas Numéricas:", numeric_columns, sep="\n")

Colunas Categóricas:
Index(['CustomerID', 'Gender', 'Senior Citizen', 'Partner', 'Dependents',
       'Phone Service', 'Multiple Lines', 'Internet Service',
       'Online Security', 'Online Backup', 'Device Protection', 'Tech Support',
       'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing',
       'Payment Method'],
      dtype='object')

Colunas Numéricas:
Index(['Latitude', 'Longitude', 'Tenure Months', 'Monthly Charges',
       'Total Charges', 'CLTV', 'Churn Value'],
      dtype='object')


### Data Preparation:

In [107]:
# Model inputs, grouped by which branch of the pre-processing handles them.
NUMERICAL_FEATURES = [
    'Latitude',
    'Longitude',
    'Tenure Months',
    'Monthly Charges',
    'Total Charges',
    'CLTV',
]
CATEGORICAL_FEATURES = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
]
# Numeric columns first, then the categorical ones.
FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]
# Column used as the prediction target (y).
TARGET = 'Churn Value'

#### Seed:

In [108]:
# Fixed seed, passed as random_state to the train/test split so the partition
# is repeatable across runs.
seed = 69

#### Separando x e y:

In [109]:
# y is the target column; x is every other column of the frame.
# NOTE(review): x therefore still carries 'CustomerID', which is not listed in
# FEATURES — confirm that keeping the identifier in x is intended.
y = df[TARGET]
x = df.drop(columns=[TARGET])

#### Train-test Split:

In [110]:
# Hold out 25% of the rows for testing (scikit-learn's default size, written
# out explicitly); the fixed seed keeps the partition repeatable.
# NOTE(review): the split is not stratified on y — consider `stratify=y` if the
# class balance of the two partitions matters for the evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=seed,
)

#### Pre-processing:

##### Imputation, Scaling and Encoding:

In [111]:
# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode. Two-level columns collapse to a single indicator
# (drop='if_binary'), levels unseen during fit are ignored instead of raising,
# and the output is dense (sparse_output=False).
categorical_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(
        drop='if_binary',
        handle_unknown='ignore',
        sparse_output=False,
    ),
)

# Numeric branch: median imputation followed by standardisation.
numeric_processor = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

#### Pre-processing Pipeline:

In [112]:
# Route each feature group to its processor. No `remainder` is given, so
# columns outside both groups follow ColumnTransformer's default handling.
pre_processing = ColumnTransformer(
    transformers=[
        ('one', categorical_processor, CATEGORICAL_FEATURES),
        ('scaler', numeric_processor, NUMERICAL_FEATURES),
    ],
)

#### Test Pre-processing:

In [113]:
# Smoke-test the pre-processing on the full feature frame. The fit runs on a
# clone so `pre_processing` itself stays unfitted: fitting the shared object
# here would leave it carrying statistics computed from the held-out rows too,
# which would leak into any later cell that transforms without refitting.
df_test = x  # full feature frame used for this check — not the x_test split
result = clone(pre_processing).fit_transform(df_test)

# Shape of the transformed frame: rows match the input, columns are the
# encoded categorical indicators plus the scaled numeric features.
result.shape

(7032, 43)