In [1]:
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:
# Read the raw churn CSV into the working DataFrame used by the cells below.
path = '../data/raw/telco_churn.csv'
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Report how many distinct values each object-dtype column holds.
colunas_texto = df.select_dtypes(include=['object']).columns

cabecalho = f"{'Coluna':<20} | {'Valores Únicos'}"
print("Análise das colunas de Texto:", cabecalho, "-" * 35, sep="\n")

# DataFrame.nunique() returns one count per column, in column order.
for col, qtd_unicos in df[colunas_texto].nunique().items():
    print(f"{col:<20} | {qtd_unicos}")

Análise das colunas de Texto:
Coluna               | Valores Únicos
-----------------------------------
customerID           | 7043
gender               | 2
Partner              | 2
Dependents           | 2
PhoneService         | 2
MultipleLines        | 3
InternetService      | 3
OnlineSecurity       | 3
OnlineBackup         | 3
DeviceProtection     | 3
TechSupport          | 3
StreamingTV          | 3
StreamingMovies      | 3
Contract             | 3
PaperlessBilling     | 2
PaymentMethod        | 4
TotalCharges         | 6531
Churn                | 2


In [4]:
# Drop the ID column, since it does not help predict anything.
# Several spellings are listed; only the ones actually present are removed.
cols_to_drop = ['customerID', 'customerId', 'CustomerID']
for col in cols_to_drop:
    if col not in df.columns:
        continue
    df.drop(columns=[col], inplace=True)
    print(f"Coluna removida: {col}")

Coluna removida: customerID


In [5]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by 0.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .fillna(0)
)

In [6]:
# Label encoding
colunas_binarias = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']

# The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run: without
# them a second execution would map the already-encoded values to NaN.
mapa_sim_nao = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
# gender (Male/Female) is encoded as Female = 1 and Male = 0.
mapa_genero = {'Female': 1, 'Male': 0, 1: 1, 0: 0}

mapas = {nome: mapa_sim_nao for nome in colunas_binarias}
mapas['gender'] = mapa_genero

for col, mapa in mapas.items():
    codificada = df[col].map(mapa)
    # Series.map turns any value missing from the dict into NaN; fail loudly
    # rather than letting an unexpected label be silently stored as missing.
    # Values that were already missing before the mapping are left as they are.
    inesperados = df.loc[codificada.isna() & df[col].notna(), col].unique()
    if len(inesperados) > 0:
        raise ValueError(
            f"Unexpected values in column {col!r}: {list(inesperados)}"
        )
    df[col] = codificada


In [7]:
# Sanity check: show the distinct values currently stored in the Churn column.
valores_churn = df['Churn'].unique()
print(valores_churn)

[0 1]


In [8]:
# Cast Churn to an integer dtype, then print how many rows carry each value.
df['Churn'] = df['Churn'].astype(int)
print("Verificação final:", df['Churn'].value_counts(), sep="\n")

Verificação final:
Churn
0    5174
1    1869
Name: count, dtype: int64


In [9]:
# One-hot encoding
# Uses pandas' get_dummies; drop_first=True omits the first level of every
# encoded column.
df_proc = pd.get_dummies(data=df, drop_first=True)

forma_antiga, forma_nova = df.shape, df_proc.shape
print(f"Tamanho antigo do dataset: {forma_antiga}")
print(f"Novo tamanho (com One-Hot): {forma_nova}")
display(df_proc.head())

Tamanho antigo do dataset: (7043, 20)
Novo tamanho (com One-Hot): (7043, 31)


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,1,29.85,29.85,0,...,False,False,False,False,False,False,False,False,True,False
1,0,0,0,0,34,1,0,56.95,1889.5,0,...,False,False,False,False,False,True,False,False,False,True
2,0,0,0,0,2,1,1,53.85,108.15,1,...,False,False,False,False,False,False,False,False,False,True
3,0,0,0,0,45,0,0,42.3,1840.75,0,...,True,False,False,False,False,True,False,False,False,False
4,1,0,0,0,2,1,1,70.7,151.65,1,...,False,False,False,False,False,False,False,False,True,False


In [10]:
# Write the encoded frame to the processed-data folder as CSV.
save_path = '../data/processed/churn_train_ready.csv'

# to_csv raises if the target directory is missing (e.g. on a fresh checkout),
# so create it first; exist_ok=True makes re-running the cell harmless.
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
df_proc.to_csv(save_path, index=False)

print(f"Dados processados salvos com sucesso em: {save_path}")

Dados processados salvos com sucesso em: ../data/processed/churn_train_ready.csv
