# Importación de Librerías

In [41]:
import pandas as pd
import pickle
import os


# Lectura de archivo

In [42]:
# Load the raw customer dataset (comma-separated) from the working directory.
data = pd.read_csv(filepath_or_buffer="customer_dataset.csv", delimiter=",")

In [43]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   orderAmount                       478 non-null    float64
 1   orderState                        478 non-null    object 
 2   paymentMethodRegistrationFailure  342 non-null    object 
 3   paymentMethodType                 342 non-null    object 
 4   paymentMethodProvider             342 non-null    object 
 5   paymentMethodIssuer               342 non-null    object 
 6   transactionAmount                 623 non-null    int64  
 7   transactionFailed                 623 non-null    bool   
 8   fraudulent                        168 non-null    object 
 9   samecity                          623 non-null    object 
 10  samestate                         623 non-null    object 
 11  emailDomain                       623 non-null    object 
 12  emailPro

# Discretización

### orderAmount

In [44]:
# Discretize orderAmount into 5 quantile-based bins. retbins=True also returns
# the bin edges so they can be persisted and reapplied later; duplicates='drop'
# collapses repeated edges instead of raising.
orderAmmont_cat, saved_bins_order = pd.qcut(
    x=data['orderAmount'],
    q=5,
    retbins=True,
    duplicates='drop',
)

In [45]:
# Project root is taken as the parent of the current working directory; the
# orderAmount bin edges are stored under <MAIN_FOLDER>/data/model.
MAIN_FOLDER = os.path.abspath(os.path.join(os.getcwd(), ".."))
pickle_path = os.path.join(MAIN_FOLDER, "data", "model", "saved_bins_order.pickle")

# Create the destination folder when it is missing; without this, open() in
# write mode raises FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

# Persist the bin edges so the same discretization can be reapplied later.
with open(pickle_path, 'wb') as handle:
    pickle.dump(saved_bins_order, handle, protocol=pickle.HIGHEST_PROTOCOL)




In [46]:
# Inspect how many rows fall into each orderAmount bin, ordered by interval.
orderAmmont_cat.value_counts().sort_index()

orderAmount
(9.999, 18.4]    96
(18.4, 30.0]     96
(30.0, 39.0]     98
(39.0, 47.0]     97
(47.0, 353.0]    91
Name: count, dtype: int64

### transactionAmount

In [47]:
# Discretize transactionAmount into 4 quantile-based bins and keep the bin
# edges (retbins=True) so they can be saved and reused.
transactionAmount_cat, saved_bins_transaction = pd.qcut(
    x=data['transactionAmount'],
    q=4,
    retbins=True,
    duplicates='drop',
)

In [48]:
# Inspect how many rows fall into each transactionAmount bin, ordered by interval.
transactionAmount_cat.value_counts().sort_index()

transactionAmount
(9.999, 21.0]    167
(21.0, 34.0]     153
(34.0, 45.0]     152
(45.0, 353.0]    151
Name: count, dtype: int64

In [49]:
# Persist the transactionAmount bin edges next to the orderAmount ones,
# under <MAIN_FOLDER>/data/model.
pickle_path1 = os.path.join(MAIN_FOLDER, "data", "model", "saved_bins_transaction.pickle")

with open(pickle_path1, mode='wb') as handle:
    handle.write(pickle.dumps(saved_bins_transaction, protocol=pickle.HIGHEST_PROTOCOL))

# Evaluación

In [50]:
# Work on an independent (deep) copy so the evaluation below does not modify
# the original frame.
probando = data.copy(deep=True)

In [51]:
# Reload the orderAmount bin edges written earlier in this notebook.
with open(pickle_path, mode='rb') as handle:
    new_saved_bins_order = pickle.loads(handle.read())

In [52]:
# Reload the transactionAmount bin edges written earlier in this notebook.
with open(pickle_path1, mode='rb') as handle:
    new_saved_bins_transaction = pickle.loads(handle.read())

In [53]:
# Apply the reloaded edges to orderAmount. include_lowest=True makes the first
# interval contain its left edge; values outside the edges become NaN.
probando = probando.assign(
    orderAmount_binned=pd.cut(
        x=probando['orderAmount'],
        bins=new_saved_bins_order,
        include_lowest=True,
    )
)

In [54]:
# Apply the reloaded edges to transactionAmount. include_lowest=True makes the
# first interval contain its left edge; values outside the edges become NaN.
probando = probando.assign(
    transactionAmount_binned=pd.cut(
        x=probando['transactionAmount'],
        bins=new_saved_bins_transaction,
        include_lowest=True,
    )
)

In [55]:
# Preview the first rows, including the two new *_binned columns.
probando.head()

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,samecity,samestate,emailDomain,emailProvider,customerIPAddressSimplified,orderAmount_binned,transactionAmount_binned
0,18.0,pending,True,card,JCB 16 digit,Citizens First Banks,18,False,False,yes,yes,com,yahoo,only_letters,"(9.999, 18.4]","(9.999, 21.0]"
1,26.0,fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,26,False,True,no,no,com,yahoo,only_letters,"(18.4, 30.0]","(21.0, 34.0]"
2,45.0,fulfilled,False,card,VISA 16 digit,Vertex Bancorp,45,False,False,no,no,com,yahoo,digits_and_letters,"(39.0, 47.0]","(34.0, 45.0]"
3,23.0,fulfilled,False,card,Diners Club / Carte Blanche,His Majesty Bank Corp.,23,False,False,no,no,com,yahoo,digits_and_letters,"(18.4, 30.0]","(21.0, 34.0]"
4,43.0,fulfilled,False,card,Mastercard,Vertex Bancorp,43,True,True,no,no,com,other,only_letters,"(39.0, 47.0]","(34.0, 45.0]"


In [56]:
# Sanity check: compare the in-memory orderAmount edges with the intervals
# produced by applying the reloaded edges.
applied_order_bins = probando['orderAmount_binned'].cat.categories
print("Original orderAmount bins:", saved_bins_order)
print("Aplicados en probando:", applied_order_bins)


Original orderAmount bins: [ 10.   18.4  30.   39.   47.  353. ]
Aplicados en probando: IntervalIndex([(9.999, 18.4], (18.4, 30.0], (30.0, 39.0], (39.0, 47.0],
               (47.0, 353.0]],
              dtype='interval[float64, right]')


In [57]:
# Sanity check: compare the in-memory transactionAmount edges (as returned by
# qcut, before the pickle round-trip) with the intervals produced by applying
# the reloaded edges. Printing the original edges mirrors the orderAmount
# check and lets a save/load discrepancy show up.
print("Original transactionAmount bins:", saved_bins_transaction)
print("Aplicados en probando:", probando['transactionAmount_binned'].cat.categories)

Original transactionAmount bins: [ 10.  21.  34.  45. 353.]
Aplicados en probando: IntervalIndex([(9.999, 21.0], (21.0, 34.0], (34.0, 45.0], (45.0, 353.0]], dtype='interval[float64, right]')


# Preparación de los datos

- paymentMethodIssuer

In [58]:
# Number of missing values in paymentMethodIssuer.
data['paymentMethodIssuer'].isna().sum()

281

In [59]:
# Data type of the paymentMethodIssuer column.
data['paymentMethodIssuer'].dtype

dtype('O')

In [60]:
# Frequency of each distinct paymentMethodIssuer value (missing values excluded).
data['paymentMethodIssuer'].value_counts()

paymentMethodIssuer
Her Majesty Trust           43
Vertex Bancorp              37
Fountain Financial Inc.     35
His Majesty Bank Corp.      33
Bastion Banks               29
Bulwark Trust Corp.         29
Citizens First Banks        28
Grand Credit Corporation    27
Solace Banks                27
Rose Bancshares             25
B                            7
e                            5
c                            4
r                            3
                             2
n                            2
x                            2
o                            2
a                            1
p                            1
Name: count, dtype: int64

In [61]:
# Issuer values treated as invalid and later grouped under a single label:
# each one is a single character (including a lone space).
weird_payment_method = [
    "B",
    "e",
    "c",
    "r",
    " ",
    "n",
    "x",
    "o",
    "a",
    "p",
]

In [62]:
# Collapse every issuer listed in weird_payment_method into the single
# label 'weird'; all other values (including missing ones) are left as-is.
is_weird_issuer = data['paymentMethodIssuer'].isin(weird_payment_method)
data.loc[is_weird_issuer, 'paymentMethodIssuer'] = 'weird'

In [63]:
# Frequencies after grouping the invalid issuers under 'weird'.
data['paymentMethodIssuer'].value_counts()

paymentMethodIssuer
Her Majesty Trust           43
Vertex Bancorp              37
Fountain Financial Inc.     35
His Majesty Bank Corp.      33
Bastion Banks               29
weird                       29
Bulwark Trust Corp.         29
Citizens First Banks        28
Solace Banks                27
Grand Credit Corporation    27
Rose Bancshares             25
Name: count, dtype: int64

- paymentMethodProvider

In [64]:
# Number of missing values in paymentMethodProvider.
data['paymentMethodProvider'].isna().sum()

281

In [65]:
# Frequency of each distinct paymentMethodProvider value (missing values excluded).
data['paymentMethodProvider'].value_counts()

paymentMethodProvider
JCB 16 digit                   65
VISA 16 digit                  57
Voyager                        36
Diners Club / Carte Blanche    34
Maestro                        32
VISA 13 digit                  32
Discover                       25
American Express               22
JCB 15 digit                   20
Mastercard                     19
Name: count, dtype: int64

- paymentMethodType

In [66]:
# Number of missing values in paymentMethodType.
data['paymentMethodType'].isna().sum()

281

In [67]:
# Frequency of each distinct paymentMethodType value (missing values excluded).
data['paymentMethodType'].value_counts()

paymentMethodType
card         242
apple pay     36
paypal        36
bitcoin       28
Name: count, dtype: int64

- fraudulent

In [68]:
# Label rows with no fraud information as "warning" instead of leaving them missing.
data['fraudulent'] = data['fraudulent'].mask(data['fraudulent'].isna(), "warning")

In [69]:
# Frequency of each distinct value in the fraudulent column.
data["fraudulent"].value_counts()

fraudulent
False      107
True        61
Name: count, dtype: int64

In [70]:
# Cast 'fraudulent' to strings so every label can be looked up by its text
# form in the mapping defined below.
data = data.astype({'fraudulent': str})

In [71]:
# Integer code for each 'fraudulent' label: 'False' -> 0, 'True' -> 1, 'warning' -> 2.
class_map = {
    label: code
    for code, label in enumerate(('False', 'True', 'warning'))
}

In [72]:
# Encode the 'fraudulent' string labels as integers using class_map.
fraudulent_encoded = data['fraudulent'].map(class_map)

# Series.map yields NaN for any label that is not a key of class_map (for
# example when this cell is re-run on already-encoded values). Fail loudly
# instead of silently writing NaN into the target column.
unmapped_labels = data.loc[fraudulent_encoded.isna(), 'fraudulent'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected 'fraudulent' labels not present in class_map: "
        f"{unmapped_labels.tolist()}"
    )

data['fraudulent'] = fraudulent_encoded

In [73]:
# Check the distinct encoded values now present in the fraudulent column.
data["fraudulent"].unique()

array([0, 1, 2], dtype=int64)

# Tratamiento de datos faltantes

### orderAmount

In [74]:
# Frequency of each distinct orderAmount value (numeric at this point; missing values excluded).
data['orderAmount'].value_counts()

orderAmount
46.0     21
11.0     15
36.0     14
14.0     14
12.0     14
         ..
353.0     1
69.0      1
53.0      1
63.0      1
62.0      1
Name: count, Length: 67, dtype: int64

In [75]:
# Treat orderAmount as categorical: every distinct amount becomes its own category.
# NOTE(review): the quantile bins computed earlier are not applied here, so the
# one-hot encoding below yields one column per distinct amount; confirm intended.
data['orderAmount'] = data['orderAmount'].astype(pd.CategoricalDtype())


In [76]:
# Register the placeholder category used for missing order amounts.
# add_categories raises ValueError when the category already exists, so guard
# the call to keep this cell safe to re-run.
if "desconocido" not in data['orderAmount'].cat.categories:
    data['orderAmount'] = data['orderAmount'].cat.add_categories("desconocido")


In [77]:
# Replace missing order amounts with the "desconocido" placeholder category.
data['orderAmount'] = data['orderAmount'].fillna(value="desconocido")


In [78]:
# Frequencies after filling missing order amounts with "desconocido".
data['orderAmount'].value_counts()

orderAmount
desconocido    145
46.0            21
11.0            15
36.0            14
12.0            14
              ... 
69.0             1
63.0             1
53.0             1
353.0            1
62.0             1
Name: count, Length: 68, dtype: int64

# 5) One hot encoding

In [79]:
# One-hot encode every object/category column of the prepared frame.
# With the default dummy_na=False, rows whose categorical value is missing
# get all-False indicators for that column.
data_ohe = pd.get_dummies(data=data)

In [80]:
# Feature matrix: the encoded frame without the target column.
data_ohe_without_fraudulent = data_ohe.drop(columns=["fraudulent"])

In [81]:
# Display the encoded feature matrix (pandas truncates the rendering of large frames).
data_ohe_without_fraudulent

Unnamed: 0,transactionAmount,transactionFailed,orderAmount_10.0,orderAmount_11.0,orderAmount_12.0,orderAmount_13.0,orderAmount_14.0,orderAmount_15.0,orderAmount_16.0,orderAmount_17.0,...,emailDomain_net,emailDomain_org,emailDomain_weird,emailProvider_gmail,emailProvider_hotmail,emailProvider_other,emailProvider_weird,emailProvider_yahoo,customerIPAddressSimplified_digits_and_letters,customerIPAddressSimplified_only_letters
0,18,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,26,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
2,45,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
3,23,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,True,False
4,43,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,25,True,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True
619,25,True,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True
620,25,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True
621,19,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,True,False,False,True


In [82]:
# Persist the column index of the encoded feature matrix (target excluded)
# under <MAIN_FOLDER>/data/model.
columns_path = os.path.join(MAIN_FOLDER, "data", "model", "categories_ohe_without_fraudulent.pickle")

with open(columns_path, mode='wb') as handle:
    handle.write(pickle.dumps(data_ohe_without_fraudulent.columns, protocol=pickle.HIGHEST_PROTOCOL))

In [83]:
# Write the full encoded dataset, target column 'fraudulent' included, as CSV
# without the row index.
filename_path = os.path.join(MAIN_FOLDER, "data", "model", "ohe_customer_dataset.csv")
data_ohe.to_csv(path_or_buf=filename_path, index=False)

In [44]:
# Show the rows whose encoded 'fraudulent' value is 1.
data.loc[data['fraudulent'].eq(1)]

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,samecity,samestate,emailDomain,emailProvider,customerIPAddressSimplified
1,26.0,fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,26,False,1,no,no,com,yahoo,only_letters
4,43.0,fulfilled,False,card,Mastercard,Vertex Bancorp,43,True,1,no,no,com,other,only_letters
5,33.0,failed,False,bitcoin,American Express,Bastion Banks,33,False,1,no,no,com,other,only_letters
6,24.0,fulfilled,False,card,Maestro,Her Majesty Trust,24,True,1,no,no,com,other,only_letters
7,25.0,fulfilled,False,card,JCB 16 digit,Fountain Financial Inc.,24,False,1,no,no,com,gmail,only_letters
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,48.0,failed,True,apple pay,VISA 16 digit,Bulwark Trust Corp.,47,False,1,no,no,com,other,digits_and_letters
163,29.0,fulfilled,False,card,VISA 13 digit,Fountain Financial Inc.,39,False,1,no,no,com,other,only_letters
164,22.0,fulfilled,False,apple pay,VISA 13 digit,Vertex Bancorp,18,True,1,no,no,biz,other,only_letters
165,44.0,fulfilled,False,card,Discover,weird,18,False,1,no,no,com,gmail,digits_and_letters


In [45]:
# List the column names of the prepared frame.
data.columns

Index(['orderAmount', 'orderState', 'paymentMethodRegistrationFailure',
       'paymentMethodType', 'paymentMethodProvider', 'paymentMethodIssuer',
       'transactionAmount', 'transactionFailed', 'fraudulent', 'samecity',
       'samestate', 'emailDomain', 'emailProvider',
       'customerIPAddressSimplified'],
      dtype='object')

In [51]:
# Frequency of each distinct paymentMethodIssuer value in the prepared frame.
data['paymentMethodIssuer'].value_counts()

paymentMethodIssuer
Her Majesty Trust           43
Vertex Bancorp              37
Fountain Financial Inc.     35
His Majesty Bank Corp.      33
Bastion Banks               29
weird                       29
Bulwark Trust Corp.         29
Citizens First Banks        28
Solace Banks                27
Grand Credit Corporation    27
Rose Bancshares             25
Name: count, dtype: int64

In [59]:
# Collect the distinct samecity values as a plain list and print them.
t = list(data['samecity'].unique())
print(t)

['yes', 'no', 'unknown']


In [61]:
# Show the dtype of every column in the prepared frame.
data.dtypes


orderAmount                         category
orderState                            object
paymentMethodRegistrationFailure      object
paymentMethodType                     object
paymentMethodProvider                 object
paymentMethodIssuer                   object
transactionAmount                      int64
transactionFailed                       bool
fraudulent                             int64
samecity                              object
samestate                             object
emailDomain                           object
emailProvider                         object
customerIPAddressSimplified           object
dtype: object