In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


%matplotlib notebook


#### Importacion de DB ventas transbank 

In [2]:
tbk = pd.read_csv('transbank.csv',encoding = "ISO-8859-1")

#### verificacion de datos importados

In [3]:
tbk.head()

Unnamed: 0,id,ws_token,oc,total,estado,response,fecha,fetchtbk,cod_aut,t_tarjeta,n_ctas,c_ctas,n_tarjetas
0,1,,82,213763,pendiente,0.0,18/05/2020,,,,,,
1,2,,83,100321,pendiente,0.0,18/05/2020,,,,,,
2,3,,84,297080,pendiente,0.0,18/05/2020,,,,,,
3,4,,85,297080,pendiente,0.0,18/05/2020,,,,,,
4,5,,86,297080,pendiente,0.0,18/05/2020,,,,,,


#### 1 - verificar datos NAN y reemplazarlos

In [4]:
tbk.isna().sum()

id               0
ws_token       655
oc               0
total            0
estado           0
response       626
fecha            0
fetchtbk       730
cod_aut        731
t_tarjeta      730
n_ctas         730
c_ctas        1059
n_tarjetas     730
dtype: int64

#### 2 - chambiar NaN a 0

In [5]:
tbk =tbk.fillna(0)

In [24]:
tbk.head()

Unnamed: 0,id,ws_token,oc,total,estado,response,fetchtbk,cod_aut,t_tarjeta,n_ctas,c_ctas,n_tarjetas
49,50,ea8ea12a1712cb93a846f906833d3e387ed807192b84e1...,131,254420,1,0.0,2020-06-15T14:03:43.118-04:00,194884.0,VD,0.0,0.0,1.0
56,57,e392c21addcd253d213a325f220103406d0ad928c97ac6...,139,254420,1,0.0,2020-06-15T14:18:45.988-04:00,999429.0,VD,0.0,0.0,1.0
57,58,e5a171c23445daf69d361159262be7f066e88e3990793f...,140,254420,1,0.0,2020-06-15T14:20:13.448-04:00,590542.0,VD,0.0,0.0,1.0
58,59,ed65a3e236f7f87e45cbcac629e2ea50f6d5593e96e408...,141,254420,1,0.0,2020-06-15T14:25:38.911-04:00,755976.0,VD,0.0,0.0,1.0
59,60,ef44b8d490f00661a611c6d112351a8ceebd25ecf67f56...,142,254420,1,0.0,2020-06-15T15:11:43.928-04:00,159123.0,VD,0.0,0.0,1.0


In [25]:
tbk.describe(include="all")

Unnamed: 0,id,ws_token,oc,total,estado,response,fetchtbk,cod_aut,t_tarjeta,n_ctas,c_ctas,n_tarjetas
count,497.0,497,497.0,497.0,497.0,497.0,497.0,497.0,497,497.0,497.0,497.0
unique,,484,,,,,389.0,,6,,,
top,,1131d755e4cc84852287611903227c4a737dcdbc14138f...,,,,,0.0,,VD,,,
freq,,3,,,,,97.0,,267,,,
mean,583.044266,,684.655936,486068.1,0.802817,-0.313883,,254586.331992,,1.162978,24182.064386,2754.265594
std,327.895584,,341.040368,585038.1,0.398273,0.741938,,288854.40805,,2.726734,70756.722572,2956.686949
min,50.0,,0.0,1.0,0.0,-4.0,,0.0,,0.0,0.0,0.0
25%,275.0,,365.0,185128.0,1.0,0.0,,3571.0,,0.0,0.0,1.0
50%,565.0,,667.0,320381.0,1.0,0.0,,153769.0,,0.0,0.0,1111.0
75%,881.0,,994.0,582525.0,1.0,0.0,,434399.0,,0.0,0.0,4421.0


In [26]:
tbk.isna().sum()

id            0
ws_token      0
oc            0
total         0
estado        0
response      0
fetchtbk      0
cod_aut       0
t_tarjeta     0
n_ctas        0
c_ctas        0
n_tarjetas    0
dtype: int64

#### 2 - se reemplazan los datos del de la variable "estado", ya que contenia 3 descripciones y solo necesitaba si esta Aceptada = 1 o Rechazada 0

In [7]:
tbk['estado'].replace(["rechazado"],["rechazada"],inplace=True)

In [8]:
tbk['estado'].replace(["pendiente"],[0],inplace=True)

In [9]:
tbk['estado'].replace(["rechazada"],[0],inplace=True)

In [10]:
tbk['estado'].replace(["aceptada"],[1],inplace=True)

#### 3 -  verifico los datos de las variabels

In [11]:
tbk.dtypes

id              int64
ws_token       object
oc              int64
total           int64
estado          int64
response      float64
fecha          object
fetchtbk       object
cod_aut       float64
t_tarjeta      object
n_ctas        float64
c_ctas        float64
n_tarjetas    float64
dtype: object

#### 4 elimino la variable fecha ya que es el mismo datos de fetchtbk

In [12]:
tbk = tbk.drop(['fecha'], axis=1)

#### 5 se eliminaron tokens de codigo 0 ya que fueron errores y demos en su instalacion

In [13]:
tbk = tbk.drop(tbk[tbk["ws_token"] == 0].index)

In [14]:
tbk.head()

Unnamed: 0,id,ws_token,oc,total,estado,response,fetchtbk,cod_aut,t_tarjeta,n_ctas,c_ctas,n_tarjetas
49,50,ea8ea12a1712cb93a846f906833d3e387ed807192b84e1...,131,254420,1,0.0,2020-06-15T14:03:43.118-04:00,194884.0,VD,0.0,0.0,1.0
56,57,e392c21addcd253d213a325f220103406d0ad928c97ac6...,139,254420,1,0.0,2020-06-15T14:18:45.988-04:00,999429.0,VD,0.0,0.0,1.0
57,58,e5a171c23445daf69d361159262be7f066e88e3990793f...,140,254420,1,0.0,2020-06-15T14:20:13.448-04:00,590542.0,VD,0.0,0.0,1.0
58,59,ed65a3e236f7f87e45cbcac629e2ea50f6d5593e96e408...,141,254420,1,0.0,2020-06-15T14:25:38.911-04:00,755976.0,VD,0.0,0.0,1.0
59,60,ef44b8d490f00661a611c6d112351a8ceebd25ecf67f56...,142,254420,1,0.0,2020-06-15T15:11:43.928-04:00,159123.0,VD,0.0,0.0,1.0


In [36]:
tbk.isna().sum()

id            0
ws_token      0
oc            0
total         0
estado        0
response      0
fetchtbk      0
cod_aut       0
t_tarjeta     0
n_ctas        0
c_ctas        0
n_tarjetas    0
dtype: int64

In [37]:
demo = tbk.drop("ws_token",1)

  demo = tbk.drop("ws_token",1)


In [40]:
demo


Unnamed: 0,id,oc,total,estado,response,fetchtbk,cod_aut,t_tarjeta,n_ctas,c_ctas,n_tarjetas
49,50,131,254420,1,0.0,2020-06-15T14:03:43.118-04:00,194884.0,VD,0.0,0.0,1.0
56,57,139,254420,1,0.0,2020-06-15T14:18:45.988-04:00,999429.0,VD,0.0,0.0,1.0
57,58,140,254420,1,0.0,2020-06-15T14:20:13.448-04:00,590542.0,VD,0.0,0.0,1.0
58,59,141,254420,1,0.0,2020-06-15T14:25:38.911-04:00,755976.0,VD,0.0,0.0,1.0
59,60,142,254420,1,0.0,2020-06-15T15:11:43.928-04:00,159123.0,VD,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
1145,1146,1267,389801,1,0.0,2022-06-28T09:48:26.694-04:00,500963.0,VN,0.0,0.0,978.0
1147,1148,1269,109900,1,0.0,2022-06-28T15:25:50.355-04:00,972619.0,SI,3.0,36634.0,6460.0
1148,1149,1270,1869002,1,0.0,2022-06-28T20:34:45.153-04:00,531404.0,VC,12.0,0.0,1609.0
1149,1150,1271,373800,1,0.0,2022-06-28T20:39:35.849-04:00,747225.0,NC,6.0,62300.0,2095.0


In [43]:
demo.t_tarjeta.value_counts()

VD    267
0      97
NC     60
VN     40
SI     28
VC      5
Name: t_tarjeta, dtype: int64

In [44]:
demo.t_tarjeta = pd.Categorical(demo.t_tarjeta, categories=["VD", "0","NC","VN","SI","VC"])

In [47]:
demo.dtypes

id               int64
oc               int64
total            int64
estado           int64
response       float64
fetchtbk        object
cod_aut        float64
t_tarjeta     category
n_ctas         float64
c_ctas         float64
n_tarjetas     float64
dtype: object

In [48]:
demo.response.value_counts()

 0.0    401
-1.0     58
-3.0     20
-2.0     17
-4.0      1
Name: response, dtype: int64

In [51]:
demo.response = pd.cut(demo.response, bins=5, labels=False)

In [53]:
demo.cod_aut.value_counts()

0.0         97
1213.0       3
123080.0     3
113334.0     2
122610.0     2
            ..
200426.0     1
100827.0     1
431508.0     1
105406.0     1
895841.0     1
Name: cod_aut, Length: 387, dtype: int64

In [54]:
demo = demo.drop("cod_aut",1)

  demo = demo.drop("cod_aut",1)


In [55]:
demo

Unnamed: 0,id,oc,total,estado,response,fetchtbk,t_tarjeta,n_ctas,c_ctas,n_tarjetas
49,50,131,254420,1,4,2020-06-15T14:03:43.118-04:00,VD,0.0,0.0,1.0
56,57,139,254420,1,4,2020-06-15T14:18:45.988-04:00,VD,0.0,0.0,1.0
57,58,140,254420,1,4,2020-06-15T14:20:13.448-04:00,VD,0.0,0.0,1.0
58,59,141,254420,1,4,2020-06-15T14:25:38.911-04:00,VD,0.0,0.0,1.0
59,60,142,254420,1,4,2020-06-15T15:11:43.928-04:00,VD,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1145,1146,1267,389801,1,4,2022-06-28T09:48:26.694-04:00,VN,0.0,0.0,978.0
1147,1148,1269,109900,1,4,2022-06-28T15:25:50.355-04:00,SI,3.0,36634.0,6460.0
1148,1149,1270,1869002,1,4,2022-06-28T20:34:45.153-04:00,VC,12.0,0.0,1609.0
1149,1150,1271,373800,1,4,2022-06-28T20:39:35.849-04:00,NC,6.0,62300.0,2095.0


In [57]:
demo.dtypes

id               int64
oc               int64
total            int64
estado           int64
response         int64
fetchtbk        object
t_tarjeta     category
n_ctas         float64
c_ctas         float64
n_tarjetas     float64
dtype: object

In [61]:
demo.n_ctas = demo.n_ctas.apply(np.int64)
demo.c_ctas = demo.c_ctas.apply(np.int64)
demo.n_tarjetas = demo.n_tarjetas.apply(np.int64)

In [66]:
demo.fetchtbk = pd.to_datetime(demo.fetchtbk, errors = "coerce")

In [69]:
demo.dtypes

id               int64
oc               int64
total            int64
estado           int64
response         int64
fetchtbk        object
t_tarjeta     category
n_ctas           int64
c_ctas           int64
n_tarjetas       int64
dtype: object

In [72]:
demo.head(1000)

Unnamed: 0,id,oc,total,estado,response,fetchtbk,t_tarjeta,n_ctas,c_ctas,n_tarjetas
49,50,131,254420,1,4,2020-06-15T14:03:43.118-04:00,VD,0,0,1
56,57,139,254420,1,4,2020-06-15T14:18:45.988-04:00,VD,0,0,1
57,58,140,254420,1,4,2020-06-15T14:20:13.448-04:00,VD,0,0,1
58,59,141,254420,1,4,2020-06-15T14:25:38.911-04:00,VD,0,0,1
59,60,142,254420,1,4,2020-06-15T15:11:43.928-04:00,VD,0,0,1
...,...,...,...,...,...,...,...,...,...,...
1145,1146,1267,389801,1,4,2022-06-28T09:48:26.694-04:00,VN,0,0,978
1147,1148,1269,109900,1,4,2022-06-28T15:25:50.355-04:00,SI,3,36634,6460
1148,1149,1270,1869002,1,4,2022-06-28T20:34:45.153-04:00,VC,12,0,1609
1149,1150,1271,373800,1,4,2022-06-28T20:39:35.849-04:00,NC,6,62300,2095


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

1106    1228
573      683
422      517
228      322
321      415
        ... 
253      347
623      735
829      944
1021    1139
244      338
Name: oc, Length: 248, dtype: int64

1106    379800
573     628030
422          1
228     535500
321     280207
         ...  
253     807002
623     305878
829     311799
1021    183177
244     150546
Name: total, Length: 248, dtype: int64