In [10]:
import pandas as pd

from dtype_diet import report_on_dataframe, optimize_dtypes
from summarytools import dfSummary

from sklearn.model_selection import train_test_split


In [3]:
url = "https://github.com/PedroReste/prescriptive_and_prediction_analysis/raw/main/pt_modelo_de_previsao_de_churn/ecommerce_dataset_sem_tratamento.csv"
df = pd.read_csv(url, sep=";")
df.head()

Unnamed: 0,CustomerID,Churn,Tenure,PreferredLoginDevice,CityTier,WarehouseToHome,PreferredPaymentMode,Gender,HourSpendOnApp,NumberOfDeviceRegistered,PreferedOrderCat,SatisfactionScore,MaritalStatus,NumberOfAddress,Complain,OrderAmountHikeFromlastYear,CouponUsed,OrderCount,DaySinceLastOrder,CashbackAmount
0,50001,1,4.0,Mobile Phone,3,6.0,Debit Card,Female,3.0,3,Laptop & Accessory,2,Single,9,1,11.0,1.0,1.0,5.0,160
1,50002,1,,Phone,1,8.0,UPI,Male,3.0,4,Mobile,3,Single,7,1,15.0,0.0,1.0,0.0,121
2,50003,1,,Phone,1,30.0,Debit Card,Male,2.0,4,Mobile,3,Single,6,1,14.0,0.0,1.0,3.0,120
3,50004,1,0.0,Phone,3,15.0,Debit Card,Male,2.0,4,Laptop & Accessory,5,Single,8,0,23.0,0.0,1.0,3.0,134
4,50005,1,0.0,Phone,1,12.0,CC,Male,,3,Mobile,5,Single,3,0,11.0,1.0,1.0,3.0,130


In [6]:
optimized_df = report_on_dataframe(df, unit="MB")
df_optimized = optimize_dtypes(df, optimized_df)

print(f'Df original: {df.memory_usage(deep=True).sum()/1024/1014} MB')
print(f'Df otimizado, memória usada: {df_optimized.memory_usage(deep=True).sum()/1024/1014} MB')

df = df_optimized

Df original: 2.4223709858851086 MB
Df otimizado, memória usada: 0.1702916974852071 MB


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5630 entries, 0 to 5629
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   CustomerID                   5630 non-null   int32   
 1   Churn                        5630 non-null   int8    
 2   Tenure                       5366 non-null   float16 
 3   PreferredLoginDevice         5630 non-null   category
 4   CityTier                     5630 non-null   int8    
 5   WarehouseToHome              5379 non-null   float16 
 6   PreferredPaymentMode         5630 non-null   category
 7   Gender                       5630 non-null   category
 8   HourSpendOnApp               5375 non-null   float16 
 9   NumberOfDeviceRegistered     5630 non-null   int8    
 10  PreferedOrderCat             5630 non-null   category
 11  SatisfactionScore            5630 non-null   int8    
 12  MaritalStatus                5630 non-null   category
 13  Num

In [11]:
df.nunique()

CustomerID                     5630
Churn                             2
Tenure                           36
PreferredLoginDevice              3
CityTier                          3
WarehouseToHome                  34
PreferredPaymentMode              7
Gender                            2
HourSpendOnApp                    6
NumberOfDeviceRegistered          6
PreferedOrderCat                  6
SatisfactionScore                 5
MaritalStatus                     3
NumberOfAddress                  15
Complain                          2
OrderAmountHikeFromlastYear      16
CouponUsed                       17
OrderCount                       16
DaySinceLastOrder                22
CashbackAmount                  220
dtype: int64

In [17]:
df.isnull().sum()

CustomerID                       0
Churn                            0
Tenure                         264
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                251
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 255
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    265
CouponUsed                     256
OrderCount                     258
DaySinceLastOrder              307
CashbackAmount                   0
dtype: int64

In [18]:
df["Churn"].value_counts(normalize=True)

Churn
0    0.831616
1    0.168384
Name: proportion, dtype: float64

In [None]:
#Correções possiveis antes de separar os dados
var = {"Mobile Phone" : "Mobile",
       "Phone" : "Mobile"}
df["PreferredLoginDevice"].replace(var, inplace= True)

var = {"Mobile Phone" : "Mobile"}
df["PreferedOrderCat"].replace(var, inplace= True)

var = {"Debit Card" : "Debit",
       "Credit Card" : "Credit",
       "CC" : "Credit",
       "Cash on Delivery" : "Cash",
       "COD" : "Cash"}
df["PreferredPaymentMode"].replace(var, inplace= True)

In [None]:
var = {"Computer": 0,
       "Mobile" : 1}
df["PreferredLoginDevice"].replace(var, inplace= True)

var = {"Female": 0,
       "Male" : 1}
df["Gender"].replace(var, inplace= True)

var = {"Single": 0,
       "Married" : 1}
df["MaritalStatus"].replace(var, inplace= True)

In [19]:
#Separando a variável dependente e as variáveis independentes
X = df.drop(columns=["CustomerID", "Churn"])
y = df["Churn"]

#Separando dados de treino e teste
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=y)

Remover outliers apenas do treino, manter no teste

Dados vazios, utilizar o padrão do treino tanto para teste quanto para treino