# Pré-processamento de dados

***

Base creditdata

## Tratamento de valores inconsistentes em datasets

#### Possíveis soluções:

1. Corrigir manualmente os valores um a um.

2. Apagar coluna onde houver valores inconsistentes.

3. Apagar somente os registros com valores inconsistentes.

4. Substituir valores inconsistentes pela média dos valores consistentes.

In [1]:
import pandas as pd

In [2]:
# Load the credit dataset; the path is relative to the notebook's working directory.
base = pd.read_csv('creditdata.csv')

In [3]:
# Summary statistics per numeric column. In the output, 'age' has a count below
# the number of rows (missing values) and a negative minimum (inconsistent values).
base.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [4]:
# Display the DataFrame (pandas truncates the rendering of long frames).
base

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [5]:
# info() lists the non-null count and dtype of each column; 'age' shows fewer
# non-null entries than the total number of rows, i.e. it contains missing values.
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   clientid  2000 non-null   int64  
 1   income    2000 non-null   float64
 2   age       1997 non-null   float64
 3   loan      2000 non-null   float64
 4   default   2000 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 78.2 KB


In [6]:
# .loc selects by index label or, as here, by a boolean mask:
# keep only the rows whose 'age' is strictly positive.
base.loc[base['age'].gt(0)]

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


Técnica 2:

In [7]:
# Technique 2: drop the whole 'age' column (inplace=True modifies `base` directly).
# Left disabled here -- presumably because the following cells still use 'age'.
#base.drop('age', axis=1, inplace=True)

Técnica 3:

In [8]:
# Technique 3: drop only the rows whose 'age' is negative (would modify `base` in place).
# Left disabled here; the notebook goes on to apply technique 4 instead.
#base.drop(base[base.age < 0].index, inplace=True)

In [9]:
# Display `base` again; with both drop statements commented out it is unchanged.
base

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


Técnica 4:

In [10]:
# Technique 4: replace the inconsistent records with the mean.
# Only the last expression of a cell is rendered, so the first two
# results below are computed but not shown.

# Mean of every column of the dataset
base.mean()
# Mean of all ages, inconsistent (negative) values included
base.loc[:, 'age'].mean()
# Mean of the ages greater than zero only
base.loc[base['age'] > 0, 'age'].mean()

40.92770044906149

In [11]:
# Rows whose 'age' is less than or equal to zero, i.e. the inconsistent records.
base.loc[base['age']<=0]

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,-28.218361,3977.287432,0
21,22,32197.620701,-52.42328,4244.057136,0
26,27,63287.038908,-36.496976,9595.286289,0


In [12]:

# 'clientid' of the rows whose 'age' is less than or equal to zero.
# A single .loc call selects rows and column at once instead of chaining two indexers.
base.loc[base['age'] <= 0, 'clientid']

15    16
21    22
26    27
Name: clientid, dtype: int64

In [13]:
# Replace the inconsistent ages with the mean of the valid (strictly positive) ages.
# The `<= 0` criterion matches the one used to list the inconsistent rows and is the
# exact complement of the `> 0` filter used for the mean, so an age of exactly 0
# cannot be excluded from the mean and yet left unreplaced.
idade_invalida = base['age'] <= 0
media_idades_validas = base.loc[base['age'] > 0, 'age'].mean()
base.loc[idade_invalida, 'age'] = media_idades_validas

In [14]:
# Fetch rows by column value: inside query(), `==` against a list keeps the rows
# whose 'clientid' is any of the listed values (the output shows those three records).
base.query('clientid==[16, 22, 27]')

Unnamed: 0,clientid,income,age,loan,default
15,16,50501.726689,40.9277,3977.287432,0
21,22,32197.620701,40.9277,4244.057136,0
26,27,63287.038908,40.9277,9595.286289,0


## Tratamento de valores faltantes (NaN)

Visualizando valores faltantes na coluna 'age':

In [15]:
# Boolean Series flagging the rows where 'age' is missing (not rendered:
# only the last expression of the cell is displayed)
base['age'].isnull()

# Rows whose 'age' is missing
base.loc[base['age'].isnull()]

Unnamed: 0,clientid,income,age,loan,default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


## Separação do dataset em previsores e classe

In [16]:
# Predictor matrix as a NumPy array: the income, age and loan columns
# (clientid and the 'default' target are left out).
previsores = base[['income', 'age', 'loan']].to_numpy()

# Inspect the rows in which 'age' is NaN
previsores[[28, 30, 31]]

array([[59417.80540626,            nan,  2082.62593812],
       [48528.85279572,            nan,  6155.78467026],
       [23526.30255511,            nan,  2862.01013875]])

In [17]:
# Target vector as a NumPy array: the 'default' column
classe = base['default'].to_numpy()
classe

array([0, 0, 0, ..., 1, 0, 0])

#### Atribuindo média aos valores faltantes

In [18]:
from sklearn.impute import SimpleImputer
import numpy as np

# Imputer that treats NaN as the missing-value marker and fills it using the 'mean' strategy.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

> Retorno dos registros em cima dos antigos registros com valores NaN para coluna 'age'.
>
> Observamos que agora os valores estão preenchidos com a média dos valores existentes:

In [19]:
# Learn the fill value of each column and replace the NaN entries in a single step
previsores = imputer.fit_transform(previsores)
# Same rows as inspected before, now without NaN in the age column
previsores[[28, 30, 31]]

array([[5.94178054e+04, 4.09277004e+01, 2.08262594e+03],
       [4.85288528e+04, 4.09277004e+01, 6.15578467e+03],
       [2.35263026e+04, 4.09277004e+01, 2.86201014e+03]])

## Escalonamento de atributos

**Padronização:** $x' = \dfrac{x - \text{média}(x)}{\text{desvio padrão}(x)}$

**Normalização:** $x' = \dfrac{x - \min(x)}{\max(x) - \min(x)}$

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictors: fit the scaler on them and transform them in one call,
# then display the rescaled array.
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)
previsores

array([[ 1.45393393,  1.36538005,  1.20281942],
       [-0.76217555,  0.54265932,  0.69642695],
       [ 0.83682073,  1.67417101,  1.17471147],
       ...,
       [-0.07122592, -0.97448606,  0.35420081],
       [-0.11000289,  1.73936652, -0.92675625],
       [ 1.682986  ,  1.14917551,  0.96381038]])