In [18]:
import pandas as pd

from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import recall_score
import numpy as np

## Carga dos dados

In [2]:
df_churn = pd.read_csv('./churn_telecom.csv')

In [3]:
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   IDCliente         7032 non-null   object 
 1   Genero            7032 non-null   object 
 2   Mais65anos        7032 non-null   int64  
 3   TemParceiro       7032 non-null   object 
 4   TemDependentes    7032 non-null   object 
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  tenure            7032 non-null   int64  
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 


In [4]:
df_churn.head()

Unnamed: 0,IDCliente,Genero,Mais65anos,TemParceiro,TemDependentes,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,tenure,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,1,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,Yes,No,DSL,Yes,No,...,No,No,No,34,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,2,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,45,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,2,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [6]:
# Abandonar o serviço é uma anormalia nesse contexto
df_churn.value_counts('Churn')

Churn
No     5163
Yes    1869
Name: count, dtype: int64

In [7]:
# Distribuição percentual de clientes pela variável Churn
df_churn.value_counts('Churn', normalize=True)

Churn
No     0.734215
Yes    0.265785
Name: proportion, dtype: float64

## Preparação da base para Algoritmo LOF

In [None]:
# Selecionando as colunas para o algoritmo
X = df_churn.drop(columns=['Churn','IDCliente'],axis=1)
y = df_churn['Churn']

In [None]:
# Função customizada
# Definir uma função para transformar "Yes" em 1 e "No" em 0

def binary_transformer_funcition(X):
    return X.map(lambda x: 1 if x == 'Yes' else 0)

In [10]:
# Transformações
numeric_features = ['tenure','MonthlyCharges','TotalCharges']
categorical_features = ['Genero','MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
                        'DeviceProtection','StreamingTV','StreamingMovies','Contract','PaymentMethod']
binary_features = ['TemParceiro','TemDependentes','TechSupport','PhoneService','PaperlessBilling']
no_transformation_features = ['Mais65anos']

# Criar transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()
binary_transformer = FunctionTransformer(binary_transformer_funcition)

# Criar o preprocessador
preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features),
        ('bin',binary_transformer,binary_features),
        ('pass','passthrough',no_transformation_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)

In [12]:
X_transformed, X_transformed.shape

(array([[-1.28024804, -1.16169394, -0.99419409, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.06430269, -0.26087792, -0.17373982, ...,  1.        ,
          0.        ,  0.        ],
        [-1.23950408, -0.36392329, -0.95964911, ...,  1.        ,
          1.        ,  0.        ],
        ...,
        [-0.87280842, -1.17000405, -0.85451414, ...,  0.        ,
          1.        ,  0.        ],
        [-1.15801615,  0.31916782, -0.87209546, ...,  1.        ,
          1.        ,  1.        ],
        [ 1.36810945,  1.35793167,  2.01234407, ...,  1.        ,
          1.        ,  0.        ]], shape=(7032, 39)),
 (7032, 39))

## Treinar o algoritmo LOF

In [None]:
# Instanciar um objeto LOF
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.26)
# 0.26 pq foi a proporção de churn que vimos no inicio

In [14]:
# Treinar o algoritmo e já gerar as classificações de anomalia para cada registro (Ponto de dados)
y_pred = lof.fit_predict(X_transformed)

In [15]:
# Mostrar valores preditos (Anomalia ou não anomalia)
# No Sklearn, o predict gera um valor = -1 para uma anomalia, e valor = 1 como ponto normal
y_pred

array([ 1,  1,  1, ...,  1, -1,  1], shape=(7032,))

In [17]:
# Mostrar o LOF calculado para cada ponto de dados
# No sklearn, o LOF calculado fica na propriedade negative_ooutlier_factor_
# negative_outlier_factor_ é o inverso do LOF. Quanto menor, mais anormal.
-lof.negative_outlier_factor_

array([1.0238333 , 1.03547225, 1.02610568, ..., 1.07053634, 1.19840027,
       1.08901757], shape=(7032,))

## Apresentar resultados

In [19]:
# Identificar anomalias
outliers = y_pred == -1
inliers = y_pred == 1

# Contar anomalias
num_outliers = np.sum(outliers)
num_inliers = np.sum(inliers)

# Apresentar resultados

print(f'Anomalias detectadas: {num_outliers}')
print(f'Pontos Normais: {num_inliers}')

Anomalias detectadas: 1829
Pontos Normais: 5203


In [20]:
# Converter y para a mesma base do y_pred
y_true = y.map(lambda x : -1 if x == 'Yes' else 1)

In [21]:
# Calcular Score com base no valor de y (Churn real da base)
# Usar recall, pois o objetivo principal é maximixar o TPR (True Positive Rate)
recall_score(y_true, y_pred)

0.7515010652721286