In [None]:
import numpy as np
import pandas as pd
# Download the Recife housing CSV and parse it with the ISO-8859-1 codec.
RECIFE_CSV_URL = "https://raw.githubusercontent.com/ProfLuciano/cd/gh-pages/data/recife.csv"
houses = pd.read_csv(RECIFE_CSV_URL, encoding="ISO-8859-1")
houses.head()

## Setando campos como categóricos

In [None]:
houses.dtypes

In [None]:
# Adjust dtypes: store the descriptive columns as pandas categoricals.
categorical_columns = ['suburb', 'district', 'city', 'state', 'type']
for column in categorical_columns:
    houses[column] = houses[column].astype('category')

In [None]:
houses.dtypes

In [None]:
houses['suburb'].cat.categories

In [None]:
houses['type'].cat.categories 

In [None]:
# Replace the categorical 'type' column with its integer category codes.
# Note: this cell needs 'type' to still be categorical, so it cannot be re-run
# without re-running the cast cell first.
type_codes = houses['type'].cat.codes
houses['type'] = type_codes

In [None]:
houses.head()

In [None]:
houses.dtypes

## Removendo instâncias

In [None]:
houses.shape

In [None]:
print(houses.isnull().sum())

In [None]:
# Drop every row that has at least one missing value.
houses1 = houses.dropna(axis=0, how='any')
houses1.head()

In [None]:
len(houses1)

In [None]:
print(houses1.isnull().sum())

## Removendo colunas quase vazias

In [None]:
print(houses.isnull().sum())

In [None]:
# Keep only the columns whose non-null count reaches 70% of the rows,
# i.e. drop the columns with more than 30% missing values.
min_non_null = 0.7 * len(houses)
houses2 = houses.dropna(axis=1, thresh=min_non_null)

In [None]:
houses2.head()

In [None]:
print(houses2.isnull().sum())

## Inserindo dados ausentes em 1 coluna

In [None]:
print(houses2['bathrooms'].mean())
print(houses2['bathrooms'].median())

In [None]:
# Fill the missing bathroom counts with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on the
# houses2['bathrooms'] selection: that chained in-place call works on a
# temporary Series under pandas Copy-on-Write and leaves houses2 unchanged.
bathrooms_mean = houses2['bathrooms'].mean()
houses2 = houses2.assign(bathrooms=houses2['bathrooms'].fillna(bathrooms_mean))

In [None]:
print(houses2.isnull().sum())

In [None]:
print(houses2['bathrooms'].mean())
print(houses2['bathrooms'].median())

In [None]:
# Spearman correlation over the numeric columns only. The frame still holds
# category columns, and pandas >= 2.0 raises on them instead of dropping them.
houses2.select_dtypes(include='number').corr(method='spearman')

In [None]:
# Reference Spearman correlation on the frame with missing values left as-is.
# Restricted to numeric columns: pandas >= 2.0 raises on category columns
# instead of dropping them.
houses.select_dtypes(include='number').corr(method='spearman')

In [None]:
# Rebuild houses2 from the original frame, keeping only the columns whose
# non-null count reaches 70% of the rows.
houses2 = houses.dropna(thresh=0.7*len(houses), axis=1)
# Fill the missing bathroom counts with the column median.
# Assigned back (no chained fillna(inplace=True)) so the fill actually lands in
# houses2 under pandas Copy-on-Write.
bathrooms_median = houses2['bathrooms'].median()
houses2 = houses2.assign(bathrooms=houses2['bathrooms'].fillna(bathrooms_median))


In [None]:
print(houses2['bathrooms'].mean())
print(houses2['bathrooms'].median())

In [None]:
# Spearman correlation on the original frame, for comparison with the
# median-imputed one. Restricted to numeric columns: pandas >= 2.0 raises on
# category columns instead of dropping them.
houses.select_dtypes(include='number').corr(method='spearman')

In [None]:
# Spearman correlation over the numeric columns of the imputed frame.
# pandas >= 2.0 raises on category columns instead of dropping them.
houses2.select_dtypes(include='number').corr(method='spearman')

In [None]:
houses['bathrooms'].describe()

In [None]:
import random

# Rebuild houses2 from the original frame, keeping only the columns whose
# non-null count reaches 70% of the rows.
houses2 = houses.dropna(thresh=0.7*len(houses), axis=1)

# Fill the missing bathroom counts with ONE random integer drawn between the
# observed minimum and maximum (every missing entry receives the same value).
bathrooms = houses2['bathrooms']
# min()/max() of a column holding NaN come back as floats; random.randint
# rejects float bounds on Python >= 3.12, so cast them to int first.
lowest = int(bathrooms.min())
highest = int(bathrooms.max())
# Local, seeded generator: a fresh "Restart & Run All" reproduces the same fill
# value without touching the global random state.
rng = random.Random(42)
fill_value = rng.randint(lowest, highest)
# Assigned back (no chained fillna(inplace=True)) so the fill actually lands in
# houses2 under pandas Copy-on-Write.
houses2 = houses2.assign(bathrooms=bathrooms.fillna(fill_value))

In [None]:
print(houses2['bathrooms'].mean())
print(houses2['bathrooms'].median())


## Inserindo dados ausentes em mais de uma coluna

In [None]:
# Reload the raw CSV so this section starts from an unmodified frame.
raw_csv_url = "https://raw.githubusercontent.com/ProfLuciano/cd/gh-pages/data/recife.csv"
houses = pd.read_csv(raw_csv_url, encoding="ISO-8859-1")
houses.head()

In [None]:
# Store the descriptive columns as pandas categoricals (needed for .cat.codes).
for column in ('suburb', 'district', 'city', 'state', 'type'):
    houses[column] = houses[column].astype('category')

In [None]:
# Replace each categorical column with its integer category codes so that the
# whole frame is numeric. pandas encodes a missing category as -1, not NaN.
for column in ('suburb', 'district', 'city', 'state', 'type'):
    houses[column] = houses[column].cat.codes

In [None]:
houses.head()

In [None]:
from fancyimpute import KNN

# Impute the missing entries of the all-numeric frame with fancyimpute's KNN
# imputer, using k=3 neighbours.
knn_imputer = KNN(k=3)
houses_knn = knn_imputer.fit_transform(houses)

In [None]:
houses_knn.shape

In [None]:
# Wrap the imputed array back into a DataFrame.
# Column labels (and the row index) are taken from the frame that was fed to
# the imputer instead of a hand-typed list, so labels cannot drift out of order
# relative to the data.
houses3 = pd.DataFrame(data=houses_knn, columns=houses.columns, index=houses.index)

In [None]:
houses3.head()

In [None]:
print(houses3.isnull().sum())

In [None]:
houses3['bathrooms'].describe()

## Normalização

In [None]:
houses3.head()

In [None]:
# Euclidean distance between rows 1 and 2 on the unscaled features.
# Original author's note: price dominates the calculation.
raw_rows = houses3.values
dist = np.linalg.norm(raw_rows[1] - raw_rows[2])
print(dist)

In [None]:
# Min-max normalisation, column by column: (x - min) / (max - min).
column_min = houses3.min()
column_max = houses3.max()
houses_norm = (houses3 - column_min) / (column_max - column_min)
print(houses_norm.head())

In [None]:
# Drop the columns that are mostly NaN after scaling (presumably constant
# columns, where max == min gives 0/0). Keep a column only if its non-null
# count reaches 70% of the rows of the frame being filtered.
houses_norm = houses_norm.dropna(thresh=0.7*len(houses_norm), axis=1)
print(houses_norm.head())

In [None]:
# Same Euclidean distance between rows 1 and 2, now on the min-max scaled features.
scaled_rows = houses_norm.values
dist = np.linalg.norm(scaled_rows[1] - scaled_rows[2])
print(dist)

In [None]:
# Min-max scale price into the [0, 1] range.
price = houses['price']
price_min, price_max = price.min(), price.max()
houses['price_norm'] = (price - price_min) / (price_max - price_min)

In [None]:
houses['price_norm'].describe()

In [None]:
# Scale price into the [-1, 1] range: min-max to [0, 1], stretch to [0, 2],
# then shift down by 1.
price = houses['price']
unit_scaled = (price - price.min()) / (price.max() - price.min())
houses['price_norm'] = unit_scaled * 2 - 1

In [None]:
houses['price_norm'].describe()

In [None]:
# Min-max scaling of price using scikit-learn.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# The scaler takes a 2-D input, hence the double brackets around 'price'.
price_frame = houses[['price']]
scaler = MinMaxScaler().fit(price_frame)
houses['price_norm'] = scaler.transform(price_frame)
houses['price_norm'].describe()

In [None]:
# Standardise price with its mean and standard deviation (z-score).
# pandas' Series.std() defaults to the sample standard deviation (ddof=1).
price = houses['price']
houses['price_norm'] = (price - price.mean()) / price.std()

In [None]:
houses['price_norm'].describe()

In [None]:
# Standardise price (mean / standard deviation) using scikit-learn's StandardScaler.
# The scaler takes a 2-D input, hence the double brackets around 'price'.
price_frame = houses[['price']]
scaler = StandardScaler().fit(price_frame)
houses['price_norm'] = scaler.transform(price_frame)
houses['price_norm'].describe()

## Discretizando área







In [None]:
# Equal-width discretisation: split the range of 'area' into 10 bins of the same width.
area_bin_count = 10
houses['area_dist'] = pd.cut(houses['area'], bins=area_bin_count)

In [None]:
houses['area_dist'].value_counts()

In [None]:
houses['area'].describe()

In [None]:
# Quantile-based discretisation: 4 bins cut at the quartiles of 'area', so the
# bins hold roughly the same number of rows (not the same width).
houses['area_dist'] = pd.qcut(houses['area'], q=4)

In [None]:
houses['area'].describe()

In [None]:
houses['area_dist'].value_counts()