# Cálculo de Churn Ponderado Exponencial com base 2:

### Importar bibliotecas:


In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime

### Ler dataframe:

In [2]:
# Load the transaction log into `cdf`. Column names are supplied explicitly
# through `names`, so every line of the file is read as a data row.
arquivo = "CDNOW_master.txt"
nomes_colunas = ["id_cliente", "char_date", "categoria", "valor"]
# Raw string: "\s" is a regex token, not a Python string escape. Without the
# `r` prefix newer Python versions warn about an invalid escape sequence.
cdf = pd.read_csv(arquivo, sep=r"\s+", names=nomes_colunas)
print(cdf)

       id_cliente  char_date  categoria  valor
0               1   19970101          1  11.77
1               2   19970112          1  12.00
2               2   19970112          5  77.00
3               3   19970102          2  20.76
4               3   19970330          2  20.76
...           ...        ...        ...    ...
69654       23568   19970405          4  83.74
69655       23568   19970422          1  14.99
69656       23569   19970325          2  25.74
69657       23570   19970325          3  51.12
69658       23570   19970326          2  42.96

[69659 rows x 4 columns]


### Definir tipos:

In [3]:
# Keep only the customer id and the purchase date. The raw integer date
# (YYYYMMDD) is parsed once into a datetime column `date`, and `char_date`
# is rewritten as its ISO-formatted text representation.
datas = pd.to_datetime(cdf["char_date"].astype(str), format="%Y%m%d")
cdf = (
    cdf
    .drop(columns=['categoria', 'valor'])
    .assign(char_date=datas.dt.strftime("%Y-%m-%d"), date=datas)
)
print(cdf)

       id_cliente   char_date       date
0               1  1997-01-01 1997-01-01
1               2  1997-01-12 1997-01-12
2               2  1997-01-12 1997-01-12
3               3  1997-01-02 1997-01-02
4               3  1997-03-30 1997-03-30
...           ...         ...        ...
69654       23568  1997-04-05 1997-04-05
69655       23568  1997-04-22 1997-04-22
69656       23569  1997-03-25 1997-03-25
69657       23570  1997-03-25 1997-03-25
69658       23570  1997-03-26 1997-03-26

[69659 rows x 3 columns]


### Declarar constantes:

In [4]:
# Number of points requested from the date range built below; it yields
# `total_periodos - 1` consecutive intervals.
total_periodos = 9
# Count of distinct customer ids present in the transaction log.
total_customer = cdf["id_cliente"].unique().size

### Definir intervalo de tempo de análise:

In [5]:
# Build `total_periodos` evenly spaced edges between the first and the last
# purchase date. Series.min()/.max() are vectorised; the Python builtins
# min()/max() would iterate the column one Timestamp at a time.
primeira_data = cdf["date"].min()
ultima_data = cdf["date"].max()
dates_vector = pd.date_range(start=primeira_data, end=ultima_data, periods=total_periodos)

# Push the final edge one day forward so that purchases made on the very last
# date still satisfy a strict `date < upper_edge` comparison.
last_date = dates_vector[-1] + pd.DateOffset(days=1)
dates_vector = dates_vector[:-1].append(pd.DatetimeIndex([last_date]))
print(dates_vector)

DatetimeIndex(['1997-01-01 00:00:00', '1997-03-10 03:00:00',
               '1997-05-17 06:00:00', '1997-07-24 09:00:00',
               '1997-09-30 12:00:00', '1997-12-07 15:00:00',
               '1998-02-13 18:00:00', '1998-04-22 21:00:00',
               '1998-07-01 00:00:00'],
              dtype='datetime64[ns]', freq=None)


### Definir a matriz de clientes por período:

In [6]:
# One row per customer, one column per interval between consecutive edges
# of `dates_vector`; every cell starts at zero.
formato = (total_customer, dates_vector.size - 1)
mat_id_churn = np.zeros(formato)
print(mat_id_churn[0:5])

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


In [7]:
# Fill the customer x period matrix: cell (customer, i) receives the weight
# 2**(i+1) when the customer has at least one purchase with
# dates_vector[i] <= date < dates_vector[i+1]; other cells are left untouched.
# `valor_media` is the sum of all period weights (the maximum possible row sum).
n_periodos = len(dates_vector) - 1
pesos = 2.0 ** np.arange(1, n_periodos + 1)
valor_media = sum(2 ** (k + 1) for k in range(n_periodos))

# side="right" returns the insertion point after equal edges, so subtracting 1
# gives the index i of the half-open interval [edge_i, edge_{i+1}) that holds
# each date. This replaces a Python double loop over periods and rows.
periodo = dates_vector.searchsorted(cdf["date"].to_numpy(), side="right") - 1
# Ignore any purchase that falls outside the covered range.
dentro = (periodo >= 0) & (periodo < n_periodos)

# Row index per purchase: factorize numbers the ids in order of first
# appearance, i.e. the same order as `cdf['id_cliente'].unique()`. For ids that
# are exactly 1..N in order this equals `id_cliente - 1`, but it also stays
# in bounds when ids have gaps or do not start at 1.
linha, _ = pd.factorize(cdf["id_cliente"])

mat_id_churn[linha[dentro], periodo[dentro]] = pesos[periodo[dentro]]
print(valor_media)
print(mat_id_churn[:5])

510
[[  2.   0.   0.   0.   0.   0.   0.   0.]
 [  2.   0.   0.   0.   0.   0.   0.   0.]
 [  2.   4.   0.   0.  32.   0.   0. 256.]
 [  2.   0.   0.  16.   0.  64.   0.   0.]
 [  2.   4.   8.  16.   0.  64.   0.   0.]]


### Transformar a matriz em um dataframe:

In [8]:
# Wrap the weight matrix in a DataFrame whose column labels are the start
# edge of each period, rendered as text.
rotulos_periodos = dates_vector[:-1].astype(str)
df_id_churn = pd.DataFrame(mat_id_churn, columns=rotulos_periodos)
# Label-based slice on the default integer index: rows 0 through 5 inclusive.
print(df_id_churn.loc[0:5])

   1997-01-01 00:00:00  1997-03-10 03:00:00  1997-05-17 06:00:00  \
0                  2.0                  0.0                  0.0   
1                  2.0                  0.0                  0.0   
2                  2.0                  4.0                  0.0   
3                  2.0                  0.0                  0.0   
4                  2.0                  4.0                  8.0   
5                  2.0                  0.0                  0.0   

   1997-07-24 09:00:00  1997-09-30 12:00:00  1997-12-07 15:00:00  \
0                  0.0                  0.0                  0.0   
1                  0.0                  0.0                  0.0   
2                  0.0                 32.0                  0.0   
3                 16.0                  0.0                 64.0   
4                 16.0                  0.0                 64.0   
5                  0.0                  0.0                  0.0   

   1998-02-13 18:00:00  1998-04-22 21:00:00  


### Definir o dataframe final do churn:

In [9]:
# Churn score per row: 1 minus the row's summed period weights divided by the
# total of all weights (`valor_media`).
soma_pesos = df_id_churn.sum(axis=1)
# Ids are attached positionally, in order of first appearance in `cdf`.
# NOTE(review): this assumes row k of `df_id_churn` belongs to the k-th
# distinct id — confirm against how the matrix rows were filled.
churn = pd.DataFrame({
    'id': cdf['id_cliente'].unique().astype(str),
    'churn': 1 - (soma_pesos / valor_media),
})
print(churn)

          id     churn
0          1  0.996078
1          2  0.996078
2          3  0.423529
3          4  0.839216
4          5  0.815686
...      ...       ...
23565  23566  0.992157
23566  23567  0.992157
23567  23568  0.992157
23568  23569  0.992157
23569  23570  0.992157

[23570 rows x 2 columns]


### Salvar o dataframe em um arquivo:

In [10]:
# Write the result with a header row and without the index column;
# QUOTE_NONNUMERIC puts quotes around every non-numeric field.
churn.to_csv(
    'calculoChurnPonderadoExponencial_2.txt',
    header=True,
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
)