## Separando os dados (treino e teste) e verificando dados faltantes

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session so cell outputs stay clean.
import warnings

warnings.filterwarnings(action="ignore")

# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100

In [2]:
# Location of the transformed dataset produced by an earlier step.
caminho_dados = '../Data/Data_Transformedv1.0.csv'

# Load the dataset into memory.
churn = pd.read_csv(caminho_dados)

# Preview the first five rows.
churn.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,MultipleLines_No phone service,MultipleLines_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes
7590-VHVEG,1,0,1,0,1,0,1,0,1,1,29.85,29.85,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5575-GNVDE,0,0,0,0,34,1,1,2,0,0,56.95,1889.5,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3668-QPYBK,0,0,0,0,2,1,1,0,1,0,53.85,108.15,1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7795-CFOCW,0,0,0,0,45,0,1,2,0,2,42.3,1840.75,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
9237-HQITU,1,0,0,0,2,1,2,0,1,1,70.7,151.65,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Target: the 'Churn' column, kept as a single-column DataFrame.
y = churn[['Churn']]
# Predictors: every remaining column.
X = churn.drop(columns=['Churn'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the four resulting pieces.
X_treino.shape, X_teste.shape, y_treino.shape, y_teste.shape

((5634, 26), (1409, 26), (5634, 1), (1409, 1))

#### Tratando dados faltantes

In [4]:
# Print the number of missing values per column for each split,
# with a dashed separator line between consecutive reports.
divisor = '-----------------------'
conjuntos = [
    ('X_treino', X_treino),
    ('X_teste', X_teste),
    ('y_treino', y_treino),
    ('y_teste', y_teste),
]

for posicao, (rotulo, dados) in enumerate(conjuntos):
    if posicao > 0:
        print(divisor)
    print(f'{rotulo}\n{dados.isnull().sum()}')

X_treino
gender                                   0
SeniorCitizen                            0
Partner                                  0
Dependents                               0
tenure                                   0
PhoneService                             0
InternetService                          0
Contract                                 0
PaperlessBilling                         0
PaymentMethod                            0
MonthlyCharges                           0
TotalCharges                            10
MultipleLines_No phone service           0
MultipleLines_Yes                        0
OnlineSecurity_No internet service       0
OnlineSecurity_Yes                       0
OnlineBackup_No internet service         0
OnlineBackup_Yes                         0
DeviceProtection_No internet service     0
DeviceProtection_Yes                     0
TechSupport_No internet service          0
TechSupport_Yes                          0
StreamingTV_No internet service          0
St

In [5]:
# Keep independent snapshots of the four splits before they are modified below.
X_treino_original, X_teste_original, y_treino_original, y_teste_original = (
    conjunto.copy() for conjunto in (X_treino, X_teste, y_treino, y_teste)
)

In [6]:
# Preview the first rows of the test predictors.
X_teste.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,MultipleLines_No phone service,MultipleLines_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes
1024-GUALD,1,0,1,0,1,0,1,0,1,1,24.8,24.8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0484-JPBRU,0,0,0,0,41,1,0,0,1,2,25.25,996.45,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3620-EHIMZ,1,0,1,1,52,1,0,1,0,0,19.35,1031.7,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
6910-HADCM,1,0,0,0,1,1,2,0,0,1,76.35,76.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8587-XYZSF,0,0,0,0,67,1,1,1,0,2,50.55,3260.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [7]:
# Fill the missing 'TotalCharges' values with the median (a measure of central tendency).
# The median is computed from the training split only, and that same value is
# written into both the training and the test split.
mediana_TotalCharges = X_treino['TotalCharges'].median()

for conjunto in (X_treino, X_teste):
    faltantes = conjunto['TotalCharges'].isnull()
    conjunto.loc[faltantes, 'TotalCharges'] = mediana_TotalCharges

In [8]:
# Write each split to its own CSV file.
# index_label=False keeps the index values but writes no header field for them.
destinos = {
    '../Data/x_treino.csv': X_treino,
    '../Data/x_teste.csv': X_teste,
    '../Data/y_treino.csv': y_treino,
    '../Data/y_teste.csv': y_teste,
}

for caminho, dados in destinos.items():
    dados.to_csv(caminho, index_label=False)