In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
# Load the chargeback transactions (the CSV's first column becomes the row index),
# lower-case every header, then rename the columns this notebook works with.
df = (
    pd.read_csv("../data/df_chargeback.csv", index_col=[0])
    .rename(columns=str.lower)
    .rename(columns={"card number": "cartao", "date": "data", "amount": "valor"})
)
df

Unnamed: 0,cartao,data,valor,cbk
0,536518******2108,2015-05-01 00:01:54,36.54,No
1,536518******2108,2015-05-01 00:03:46,36.54,No
2,453211******1239,2015-05-01 00:08:50,69.00,No
3,548827******1705,2015-05-01 00:27:00,193.43,No
4,531681******9778,2015-05-01 01:32:46,132.00,No
...,...,...,...,...
11123,514868******7409,2015-05-30 23:07:01,53.00,No
11124,439354******5281,2015-05-30 23:08:47,15.00,No
11125,549167******1648,2015-05-30 23:15:24,20.00,No
11126,518759******8384,2015-05-30 23:17:41,70.00,No


In [3]:
# Summarize column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11127 entries, 0 to 11127
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   cartao  11127 non-null  object 
 1   data    11127 non-null  object 
 2   valor   11127 non-null  float64
 3   cbk     11127 non-null  object 
dtypes: float64(1), object(3)
memory usage: 434.6+ KB


### Missing values

In [4]:
# Check for missing values: number of NaN entries per column.
df.isna().sum()

cartao    0
data      0
valor     0
cbk       0
dtype: int64

### Removing duplicates

In [5]:
# Count rows that repeat an earlier row in every column (first occurrences are not counted).
df.duplicated().sum()

123

In [6]:
# Show the rows flagged as duplicates of an earlier row.
df[df.duplicated()]

Unnamed: 0,cartao,data,valor,cbk
6104,515894******6461,2015-05-15 23:00:20,264.00,Yes
11004,514945******7580,2015-05-30 14:32:17,15.00,No
11005,498408******2729,2015-05-30 14:32:37,96.42,No
11006,441524******8556,2015-05-30 14:33:03,35.00,No
11007,546451******1223,2015-05-30 14:35:14,99.00,No
...,...,...,...,...
11122,518759******2179,2015-05-30 23:06:38,10.00,No
11123,514868******7409,2015-05-30 23:07:01,53.00,No
11124,439354******5281,2015-05-30 23:08:47,15.00,No
11125,549167******1648,2015-05-30 23:15:24,20.00,No


In [7]:
# Look at every row for one card that appears among the flagged duplicates.
df[df.cartao == '515894******6461']

Unnamed: 0,cartao,data,valor,cbk
6103,515894******6461,2015-05-15 23:00:20,264.0,Yes
6104,515894******6461,2015-05-15 23:00:20,264.0,Yes


In [8]:
# Drop fully duplicated rows, keeping the first occurrence; df is modified in place.
df.drop_duplicates(inplace=True)

### Feature engineering

In [9]:
def get_day_period(time):
    """Map an hour of the day to a named period of the day.

    Args:
        time: Hour value, compared against the inclusive ranges below.

    Returns:
        "manha" for 6-12, "tarde" for 13-18, "noite" for 19-23,
        "madrugada" for 0-5, and "" when the value is in none of those ranges.
    """
    # (first hour, last hour, label); both bounds are inclusive.
    bands = (
        (6, 12, "manha"),
        (13, 18, "tarde"),
        (19, 23, "noite"),
        (0, 5, "madrugada"),
    )
    for first, last, label in bands:
        if first <= time <= last:
            return label
    return ""

def get_month_period(day):
    """Map a day of the month to the part of the month it falls in.

    Args:
        day: Day-of-month value, compared against the inclusive ranges below.

    Returns:
        "inicio" for 1-10, "meio" for 11-20, "fim" for 21-31,
        and "" when the value is in none of those ranges.
    """
    if 1 <= day <= 10:
        return "inicio"
    if 11 <= day <= 20:
        return "meio"
    if 21 <= day <= 31:
        return "fim"
    return ""

def get_week_period(day):
    """Label a day-of-week number as weekday or weekend.

    Args:
        day: Day-of-week number. This notebook passes pandas'
            ``dt.day_of_week`` values, where Monday is 0.

    Returns:
        "semana" when ``day`` is one of 0, 1, 2, 3, 4; "fds" for any other value.
    """
    return "semana" if day in range(0, 5) else "fds"

def get_value_mean_by_col(data, col, value):
    """Return the mean of ``valor`` for one group of ``col``, rounded to 2 decimals.

    Args:
        data: DataFrame that has a ``valor`` column and the grouping column ``col``.
        col: Name of the column to group by.
        value: Group key whose mean is wanted.

    Returns:
        The group's mean ``valor`` rounded to two decimal places.

    Note:
        The group means are recomputed over all of ``data`` on every call.
    """
    per_group_mean = data.groupby(col)['valor'].mean()
    selected = per_group_mean[value]
    return round(selected, 2)

def get_count(data, col, value):
    """Return how many rows of ``data`` have ``value`` in column ``col``.

    Args:
        data: DataFrame containing the column ``col``.
        col: Name of the column to count values in.
        value: The value whose number of occurrences is wanted.

    Returns:
        The occurrence count of ``value`` taken from ``value_counts()``.

    Note:
        The value counts are recomputed over all of ``data`` on every call.
    """
    frequencies = data[col].value_counts()
    return frequencies[value]

In [11]:
# Only the day of month is extracted, because month and year are the same for every row.
# The timestamp column is parsed once and reused for every derived column.
timestamps = pd.to_datetime(df["data"])

# Calendar features.
df["dia"] = timestamps.dt.day.astype(int)
df["periodo_mes"] = df["dia"].apply(get_month_period)
df["dia_semana"] = timestamps.dt.day_of_week.astype(int)
df["periodo_semana"] = df["dia_semana"].apply(get_week_period)

# Time-of-day features; hora_completa_segundos is the number of seconds since midnight.
df["hora_completa"] = timestamps.dt.timetz
df["hora"] = timestamps.dt.hour.astype(int)
df["hora_completa_segundos"] = (
    (timestamps.dt.hour * 60 + timestamps.dt.minute) * 60 + timestamps.dt.second
).astype(int)
df["periodo_dia"] = df["hora"].apply(get_day_period)

# Per-day and per-card aggregates, broadcast back to every row. Each statistic is
# computed once for the whole frame instead of being recomputed for every row.
df["media_dia"] = df.groupby("dia")["valor"].transform("mean").round(2)
df["media_cartao"] = df.groupby("cartao")["valor"].transform("mean").round(2)
df["total_uso_cartao"] = df["cartao"].map(df["cartao"].value_counts())

# Translate the chargeback label.
df["cbk"] = df["cbk"].replace({"Yes": "Sim", "No": "Não"})

# Columns kept in the final table; rows identical across all of them are dropped.
feature_columns = [
    'dia', 'dia_semana', 'periodo_semana', 'periodo_mes',
    'hora', 'hora_completa_segundos', 'periodo_dia',
    'cartao', 'valor', 'media_dia', 'media_cartao', 'total_uso_cartao', 'cbk',
]
df_final = df.loc[:, feature_columns].drop_duplicates()

In [12]:
# Distinct card identifiers, in order of first appearance.
uniq_cards = df_final['cartao'].drop_duplicates().values

# 1-based ordinal of each row among the rows of the same card, following the current
# row order of df_final. A single grouped cumulative count replaces a per-card loop
# that re-scanned the whole frame for every card.
# NOTE(review): the numbering follows row order, not the timestamp; the displayed tail
# of df_final has an 18h row after 23h rows of the same day, so confirm whether the
# frame should be sorted by time before counting.
df_final['vez_uso_cartao'] = df_final.groupby('cartao').cumcount() + 1

# True when the card already appeared in an earlier row.
df_final["cartao_usado_antes"] = df_final["vez_uso_cartao"] > 1

In [13]:
# Display the engineered feature table.
df_final

Unnamed: 0,dia,dia_semana,periodo_semana,periodo_mes,hora,hora_completa_segundos,periodo_dia,cartao,valor,media_dia,media_cartao,total_uso_cartao,cbk,vez_uso_cartao,cartao_usado_antes
0,1,4,semana,inicio,0,114,madrugada,536518******2108,36.54,139.59,36.54,2,Não,1,False
1,1,4,semana,inicio,0,226,madrugada,536518******2108,36.54,139.59,36.54,2,Não,2,True
2,1,4,semana,inicio,0,530,madrugada,453211******1239,69.00,139.59,69.00,1,Não,1,False
3,1,4,semana,inicio,0,1620,madrugada,548827******1705,193.43,139.59,193.43,1,Não,1,False
4,1,4,semana,inicio,1,5566,madrugada,531681******9778,132.00,139.59,132.00,1,Não,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11000,30,5,fds,fim,23,83327,noite,439354******5281,15.00,95.18,15.00,1,Não,1,False
11001,30,5,fds,fim,23,83724,noite,549167******1648,20.00,95.18,20.00,1,Não,1,False
11002,30,5,fds,fim,23,83861,noite,518759******8384,70.00,95.18,70.00,1,Não,1,False
11066,30,5,fds,fim,18,66573,tarde,455188******5104,2.10,95.18,2.10,1,Não,1,False


In [14]:
# Write the engineered features to disk; index=False leaves the row index out of the CSV.
df_final.to_csv("../data/chargeback_data_final.csv", index=False)