# Gerando dados de Teste e Treino
---
**Autor(a):** Sabrina Bruni de Souza Faria

## Importando as Bibliotecas

In [1]:
import pandas as pd
import csv

## Leitura do Dataframe

In [2]:
# DataFrame with the churn metrics computed up to a given date.
# Original author's tag for this frame: "Test".
# The first CSV column is consumed as the row index.
dfC = pd.read_csv(
    'Arquivos/churnResultado.csv',
    index_col=0,
)
dfC.head()

Unnamed: 0,id,churnLinear,churnExponencial_2,churnExponencial_e,churnRecente
0,2378,0.100363,0.0003099465,1.2016e-05,0.089041
1,576,0.045229,9.536743e-07,3.541643e-09,0.068027
2,704,0.060029,0.03906632,0.01314459,0.061224
3,3818,0.075473,0.03906727,0.01314459,0.081633
4,1972,0.132929,0.125001,0.08554822,0.163265


In [3]:
# Table of zeros and ones covering every period of the analysed dataset.
# The first CSV column is consumed as the row index; the remaining column
# labels are date strings (converted to datetime in a later cell).
dftab = pd.read_csv(
    'Arquivos/tabelaTotal.csv',
    index_col=0,
)
dftab.head()

Unnamed: 0,1993-01-03,1993-01-17,1993-01-31,1993-02-14,1993-02-28,1993-03-14,1993-03-28,1993-04-11,1993-04-25,1993-05-09,...,1998-08-09,1998-08-23,1998-09-06,1998-09-20,1998-10-04,1998-10-18,1998-11-01,1998-11-15,1998-11-29,1998-12-13
2378,0,1,1,1,1,0,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
576,1,0,1,0,1,0,1,1,1,1,...,1,1,1,1,1,1,1,0,1,1
704,1,0,1,0,1,0,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3818,1,0,1,0,1,0,1,0,1,1,...,1,1,1,1,1,1,1,0,1,0
1972,1,0,1,0,1,0,1,0,1,1,...,1,1,1,1,1,1,1,0,1,1


## Data alvo

In [4]:
# Target (cut-off) date, kept both as the raw string and as a pandas Timestamp
# so it can be compared against the datetime column labels.
data = "1998-08-31"
data_alvo = pd.to_datetime(data)
data_alvo

Timestamp('1998-08-31 00:00:00')

## Converter as colunas do DataFrame para datetime

In [5]:
# Parse the column labels of dftab into a DatetimeIndex and install it as the
# frame's column index, so columns can be filtered by date comparison.
# NOTE: this replaces dftab.columns in place.
datas_colunas = dftab.columns = pd.to_datetime(dftab.columns)
datas_colunas

DatetimeIndex(['1993-01-03', '1993-01-17', '1993-01-31', '1993-02-14',
               '1993-02-28', '1993-03-14', '1993-03-28', '1993-04-11',
               '1993-04-25', '1993-05-09',
               ...
               '1998-08-09', '1998-08-23', '1998-09-06', '1998-09-20',
               '1998-10-04', '1998-10-18', '1998-11-01', '1998-11-15',
               '1998-11-29', '1998-12-13'],
              dtype='datetime64[ns]', length=156, freq=None)

## Encontrar a coluna mais próxima da data alvo (primeira data igual ou posterior à data alvo)

In [6]:
# Pick the earliest period column dated on or after the target date
# (i.e. the first column at/after the cut-off, not the nearest in absolute distance).
coluna_mais_proxima = datas_colunas[datas_colunas >= data_alvo].min()

# .min() of an empty selection is NaT. Any later `>=` comparison against NaT is
# False, which would select no label columns and silently mark every row as
# "churn" — fail loudly instead.
if pd.isna(coluna_mais_proxima):
    raise ValueError(
        f"data_alvo ({data_alvo}) is later than every period in datas_colunas; "
        "no columns are available to build the label."
    )

coluna_mais_proxima

Timestamp('1998-09-06 00:00:00')

## Selecionar todas as colunas a partir da data mais próxima

In [7]:
# Keep every period column from the cut-off column onwards (inclusive).
colunas_selecionadas = datas_colunas[
    datas_colunas >= coluna_mais_proxima
]
colunas_selecionadas

DatetimeIndex(['1998-09-06', '1998-09-20', '1998-10-04', '1998-10-18',
               '1998-11-01', '1998-11-15', '1998-11-29', '1998-12-13'],
              dtype='datetime64[ns]', freq=None)

## Filtrar o DataFrame para incluir apenas essas colunas

In [8]:
# Restrict the 0/1 table to the selected (post cut-off) period columns;
# all rows are kept.
df_selecionado = dftab.loc[:, colunas_selecionadas]
df_selecionado.head()

Unnamed: 0,1998-09-06 00:00:00,1998-09-20 00:00:00,1998-10-04 00:00:00,1998-10-18 00:00:00,1998-11-01 00:00:00,1998-11-15 00:00:00,1998-11-29 00:00:00,1998-12-13 00:00:00
2378,1,1,1,1,1,0,1,1
576,1,1,1,1,1,0,1,1
704,1,1,1,1,1,1,1,1
3818,1,1,1,1,1,0,1,0
1972,1,1,1,1,1,0,1,1


## Salvando a tabela de treino em um arquivo

In [9]:
# Complement of the selected columns: every period column of dftab that is not
# in colunas_selecionadas. Index.difference returns unique labels, sorted where
# possible.
colunas_opostas = dftab.columns.difference(colunas_selecionadas)

# Sub-table holding only those remaining period columns.
df_dif = dftab.loc[:, colunas_opostas]

In [10]:
# Persist the remaining-period table, keeping both the row index and the header.
# QUOTE_NONNUMERIC makes the writer quote every non-numeric field.
df_dif.to_csv(
    "Arquivos/tabelaAjustada.csv",
    index=True,
    header=True,
    quoting=csv.QUOTE_NONNUMERIC,
)

## Criando uma série com "não churn" se houver pelo menos um valor 1 na linha, caso contrário "churn"

In [11]:
# Label each row: "não churn" when at least one of the selected period columns
# holds a 1, otherwise "churn".
# Vectorized replacement for the row-wise `apply(lambda ...)`: compare every
# cell to 1, reduce across columns, then map the boolean to its label.
# The result keeps df_selecionado's index and is named 'resultado'.
resultado = (
    df_selecionado
    .eq(1)
    .any(axis=1)
    .map({True: "não churn", False: "churn"})
    .rename('resultado')
)
# Original author's tag for this series: "Train".
resultado.head()

2378    não churn
576     não churn
704     não churn
3818    não churn
1972    não churn
Name: resultado, dtype: object

## Juntando o cálculo de churn com o resultado das datas

In [12]:
# Original author's tag: "Test and Train".
# Attach the label to the churn metrics by matching dfC's "id" column against
# the index of `resultado`. The join is an inner join (pandas' default), so rows
# of dfC whose id has no entry in `resultado` are dropped.
df_merged = dfC.merge(
    resultado,
    how="inner",
    left_on="id",
    right_index=True,
)
df_merged.head()

Unnamed: 0,id,churnLinear,churnExponencial_2,churnExponencial_e,churnRecente,resultado
0,2378,0.100363,0.0003099465,1.2016e-05,0.089041,não churn
1,576,0.045229,9.536743e-07,3.541643e-09,0.068027,não churn
2,704,0.060029,0.03906632,0.01314459,0.061224,não churn
3,3818,0.075473,0.03906727,0.01314459,0.081633,não churn
4,1972,0.132929,0.125001,0.08554822,0.163265,não churn


## Salvando o dataframe de churn e resultado das datas em um arquivo CSV

In [13]:
# Persist the merged metrics + label table. The row index is not written;
# the header is. QUOTE_NONNUMERIC makes the writer quote every non-numeric field.
df_merged.to_csv(
    "Arquivos/TesteTreinoChurn.csv",
    index=False,
    header=True,
    quoting=csv.QUOTE_NONNUMERIC,
)