In [163]:
import pandas as pd 
import numpy as np
import seaborn as sns
import plotly.express as px 
import ipywidgets as widgets 

from ipywidgets            import fixed
from matplotlib            import gridspec
from matplotlib            import pyplot as plt
from geopy.geocoders       import Nominatim
from IPython.core.display  import HTML
from IPython.display       import Image

In [133]:
# Criação de Botões

def bt_intslider(Val,Min,Max,desc=''):
    bt = widgets.IntSlider( 
        value = Val,
        min = Min,
        max = Max,
        description=desc,
        disable=False,
        orietation='horizontal',
        style= {'description_width': 'initial'}       
    )

# Criação de Botões

def bt_dropdown(data,x='',desc=''):
    
    bt1 = widgets.Dropdown(
    options=data[x].sort_values().unique().tolist(),
    description= desc,
    disabled=False,
    style={'description_width': 'initial'})
    
    return bt1   

# Estatística Descritiva

def num_metricas(num_attributes):
    
  #Central tendencian - mean, median

  ct1 = pd.DataFrame(num_attributes.apply(np.mean)).T # Criar tabalas e Colunas Visivelmente melhores.
  ct2 = pd.DataFrame(num_attributes.apply(np.median)).T # Criar tabalas e Colunas Visivelmente melhores.

  #Dispersion - std (Desvio Padrão), min ( minimo ), max ( Maximo ), range ( Alcance), skew (), kurtosis ( Curva da Função).
    
  d1 = pd.DataFrame(num_attributes.apply(np.std)).T
  d2 = pd.DataFrame(num_attributes.apply(min)).T
  d3 = pd.DataFrame(num_attributes.apply(max)).T
  d4 = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min() )).T
  d5 = pd.DataFrame(num_attributes.apply(lambda x: x.skew() )).T
  d6 = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis() )).T

  #Concatenate
  metrics = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
  metrics.columns = ['attributes','min', 'max', 'range', 'mean', 'median', 'std', 'skew','kurtosis']
  return metrics


def jupyter_settings():
    
    %matplotlib inline 
    #%pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize']=[20,10]
    plt.rcParams['font.size']=10
    
    display( HTML('<style>.container {width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr',False )
    pd.set_option('display.float_format',lambda x: '%.2f' % x)
    

    sns.set()
    
jupyter_settings()

In [134]:
df_raw = pd.read_csv('../datasets/kc_house_data.csv')
df = df_raw.copy()
df.sample(5)



Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
3827,8682250090,20140504T000000,775000.0,2,2.5,2680,7392,1.0,0,0,3,9,2680,0,2004,0,98053,47.72,-122.03,2315,7045
20301,1085623730,20141129T000000,498445.0,4,2.5,3216,5902,2.0,0,0,3,9,3216,0,2014,0,98030,47.34,-122.18,2815,4916
15940,8133300050,20140626T000000,200500.0,3,1.75,1260,9346,1.0,0,0,4,7,1260,0,1963,0,98030,47.37,-122.19,1800,9705
6445,7349400420,20141105T000000,286285.0,4,2.25,1980,9714,1.0,0,0,3,7,1170,810,1977,0,98002,47.32,-122.21,1610,9272
12351,9551200270,20140825T000000,1000000.0,5,3.0,3350,9450,2.0,0,0,5,8,2180,1170,1912,1980,98103,47.67,-122.34,2660,4500


# 2.0 Descrição dos Dados
    # 2.1 Renomear as Colunas

In [171]:
df.columns # Nesse caso não será necessario

Index(['level', 'price'], dtype='object')

# 2.2 Dimensão dos Dados

In [136]:
print('Numero de linhas : {}'.format(df.shape[0]))  
print('Numero de colunas : {}'.format(df.shape[1]))  


Numero de linhas : 21613
Numero de colunas : 21


# 2.3 Tipos dos Dados

In [172]:
df.dtypes

level      int64
price    float64
dtype: object

# 2.3.1 Mudança do tipo da variável
    Primeiramente vamos colocar a variável "date" que está como object para o tipo datetime.



In [138]:
df['year'] = pd.to_datetime(df['date']).dt.strftime('%Y')
df['date'] = pd.to_datetime(df['date']).dt.strftime("%m/%d/%Y")
df['week_year'] = pd.to_datetime(df['date']).dt.strftime('%Y-%U')

In [173]:
print(df[['year', 'date', 'week_year']])

KeyError: "None of [Index(['year', 'date', 'week_year'], dtype='object')] are in the [columns]"

In [174]:
# Verificar se ha valores nulos 

df.isnull().sum()


level    0
price    0
dtype: int64

# 2.5 Estatística Descritiva

In [175]:
# Separando as variáveis em categórica e numérica

num_atributos = df.select_dtypes(include=['int64','float64'])
cat_atributos= df.select_dtypes(include = ['object'])


In [176]:
num_metricas(num_atributos)

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,level,0.0,3.0,3.0,1.5,1.5,1.12,0.0,-1.2
1,price,251544.62,985008.98,733464.36,539075.81,459874.81,276458.13,1.25,1.57


In [143]:
cat_atributos.columns

Index(['date', 'year', 'week_year'], dtype='object')

In [177]:
# Condition house filters 

df['condition_type'] = 'NA'
df['condition_type'] = df['condition'].apply(lambda x : 'bad' if x <= 2 else
                                             'regular' if (x >=3) & (x <=4) else
                                             'good'               
)

KeyError: 'condition'

In [145]:
df.sample(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,week_year,condition_type
844,108000127,12/09/2014,456500.0,4,3.5,2000,2309,3.0,0,0,3,8,2000,0,2008,0,98177,47.7,-122.36,1440,1548,2014,2014-49,regular
21250,1776460190,06/26/2014,429900.0,3,2.5,2370,5353,2.0,0,0,3,8,2370,0,2009,0,98019,47.73,-121.97,2130,6850,2014,2014-25,regular
2515,133000070,09/16/2014,179900.0,2,1.0,680,6400,1.0,0,0,3,6,680,0,1943,0,98168,47.51,-122.32,1240,7800,2014,2014-37,regular
13582,2698200210,09/08/2014,274000.0,3,1.75,1440,7198,1.0,0,0,3,7,990,450,1981,0,98055,47.43,-122.19,1550,7156,2014,2014-36,regular
8541,1126059095,05/26/2014,880000.0,3,2.0,2130,35169,1.0,0,0,4,8,2130,0,1989,0,98072,47.75,-122.12,2860,43560,2014,2014-21,regular
10711,597000566,04/28/2015,335000.0,3,2.0,1340,1951,1.0,0,0,3,6,670,670,1915,0,98144,47.58,-122.31,1520,2248,2015,2015-17,regular
1440,7431500341,04/24/2015,1355000.0,3,2.5,3600,21399,1.0,0,3,3,9,2310,1290,1950,2007,98008,47.62,-122.1,2830,17559,2015,2015-16,regular
15456,8856000545,05/07/2014,100000.0,2,1.0,910,22000,1.0,0,0,3,6,910,0,1956,0,98001,47.28,-122.25,1326,9891,2014,2014-18,regular
2621,5101406441,04/16/2015,490000.0,3,1.0,1600,6380,1.5,0,0,3,7,1400,200,1939,0,98125,47.7,-122.32,1760,6380,2015,2015-15,regular
21218,5700000446,10/29/2014,465000.0,3,1.75,1590,1322,2.0,0,0,3,8,1060,530,2014,0,98144,47.58,-122.29,1530,5400,2014,2014-43,regular


# 2.6.2 Filtre os Niveis dos Imoveis pelos preços : 

    Nível 0 -> Preço entre 0 e 321.950
    Nível 1 -> Preço entre 321.950 e 450.000
    Nível 2 -> Preço entre 450.000 e 645.000
    Nível 3 -> Acima de 645.000

In [178]:
# Filter Price 

df['level']= 'NA'
df['level']= df['price'].apply(lambda x: 0  if x < 321950 else 
                                              1  if (x >= 321950) & (x < 450000) else 
                                              2  if (x >= 450000) & (x < 645000) else 3)  
df['level'] = df['level'].astype(int)


# Filtre os niveis dos imoveis pela media do tamanho dos Quartos 

    Size 0 -> Tamanho entre 0 e 1427 sqft
    Size 1 -> Tamanho entre 1427 e 1910 sqft
    Size 2 -> Tamanho entre 1910 e 2550 sqft
    Size 3 -> Tamanho acima de 2550 sqft

In [179]:
df.dtypes

level               int32
price             float64
condition_type     object
dtype: object

In [180]:
df['size'] = 'NA'
df['size'] = df['sqft_living'].apply (lambda x : 0 if x < 1427 else 
                                          1 if (x >= 1427) & (x  < 1910 ) else
                                          2 if (x >= 1910) & (x  < 2550 ) else 
                                          3 )
df['size'] = df['size'].astype(int)

KeyError: 'sqft_living'

In [149]:
df.sample(10)


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,week_year,condition_type,level,size
15985,5637500250,02/10/2015,447000.0,2,1.0,760,6035,1.0,0,0,3,6,760,0,1920,0,98136,47.54,-122.38,2110,6046,2015,2015-06,regular,1,0
19684,1973700030,04/29/2015,2205000.0,3,2.5,3430,10177,2.0,0,0,3,10,3430,0,2014,0,98034,47.72,-122.25,3110,12339,2015,2015-17,regular,3,3
3779,2623069067,03/05/2015,605000.0,3,2.5,2460,138085,2.0,0,0,4,9,2460,0,1977,0,98027,47.46,-122.01,2090,219542,2015,2015-09,regular,2,2
5670,7625703900,09/26/2014,689000.0,4,2.5,2020,9600,2.0,0,0,4,7,2020,0,1954,0,98136,47.54,-122.39,2250,8550,2014,2014-38,regular,3,2
9668,4039700090,09/23/2014,643403.0,3,2.5,2350,9648,1.0,0,0,4,9,2350,0,1966,0,98008,47.62,-122.11,2320,10512,2014,2014-38,regular,2,2
5492,7575610170,04/23/2015,200000.0,4,2.75,2210,13235,2.0,0,0,3,8,1730,480,1988,0,98003,47.35,-122.3,1750,7542,2015,2015-16,regular,0,2
4083,1156000100,12/24/2014,246700.0,3,2.0,1610,13309,1.0,0,0,4,7,1610,0,1967,0,98042,47.34,-122.13,1610,15725,2014,2014-51,regular,0,1
21277,3362401763,05/08/2014,441750.0,2,1.5,1020,1060,3.0,0,0,3,8,1020,0,2008,0,98103,47.68,-122.35,1340,1415,2014,2014-18,regular,1,0
12539,3885805300,04/29/2015,595000.0,3,1.0,1300,11520,1.0,0,0,3,6,1300,0,1958,0,98033,47.68,-122.19,1440,8064,2015,2015-17,regular,2,0
1538,4279200060,12/30/2014,420000.0,4,2.5,2110,9825,2.0,0,0,3,8,2110,0,2000,0,98059,47.5,-122.15,1650,9900,2014,2014-52,regular,1,2


# 2.6.4 Waterfront

In [150]:
df['is_waterfront'] = df['waterfront'].apply(lambda x : 'yes' if x == 1 else 'no' )

df.sample(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,year,week_year,condition_type,level,size,is_waterfront
920,7524000030,06/30/2014,250000.0,3,2.0,1440,9220,1.0,0,0,3,7,1440,0,1965,0,98198,47.37,-122.32,1390,7830,2014,2014-26,regular,0,1,no
5500,855700170,02/25/2015,482000.0,4,2.25,2240,8322,2.0,0,0,3,8,2240,0,1979,0,98034,47.73,-122.21,2240,6448,2015,2015-08,regular,2,2,no
7768,428000150,07/18/2014,269950.0,3,1.0,990,9950,1.0,0,0,5,7,990,0,1961,0,98056,47.51,-122.17,1370,9260,2014,2014-28,good,0,0,no
9071,9510900360,05/09/2014,260000.0,3,2.0,1920,8075,1.0,0,0,4,7,1510,410,1969,0,98023,47.31,-122.38,1920,7826,2014,2014-18,regular,0,2,no
2782,859000160,12/03/2014,375000.0,4,2.0,1720,2410,1.0,0,0,3,7,970,750,1930,2006,98106,47.53,-122.36,1160,1404,2014,2014-48,regular,1,1,no
5079,6873000190,03/11/2015,656000.0,2,2.5,2270,1763,3.0,0,0,3,7,1820,450,2009,0,98052,47.68,-122.12,2180,1763,2015,2015-10,regular,3,2,no
21098,1806900502,10/14/2014,649000.0,3,3.25,1720,936,2.0,0,0,3,8,1030,690,2004,0,98112,47.62,-122.31,1720,1527,2014,2014-41,regular,3,1,no
5125,3741600020,09/15/2014,540000.0,3,2.25,2100,20018,1.0,0,4,3,8,1470,630,1948,0,98166,47.45,-122.37,2410,17196,2014,2014-37,regular,2,2,no
16734,357000005,12/22/2014,500000.0,4,2.0,1680,3813,2.0,0,0,4,7,1680,0,1900,0,98144,47.59,-122.29,2540,3996,2014,2014-51,regular,2,1,no
17834,439000230,04/29/2015,805000.0,4,2.25,2440,9889,1.0,0,0,3,7,1540,900,1952,0,98115,47.69,-122.3,1710,6284,2015,2015-17,regular,3,2,no


# 3.0 - Qual a média do preço de compra dos imóveis por "Nível"?

In [181]:
aux = []

for i in range (4):
    m = (df['level'] == i).sum()/(df.shape[0]) 
    aux.append(m)
    media_level= pd.DataFrame(aux).reset_index()
    media_level.columns = ['Level','Mean']

media_level


Unnamed: 0,Level,Mean
0,0,0.25
1,1,0.25
2,2,0.25
3,3,0.25


In [167]:
# Media dos preços em R$ 


In [168]:
# # Media por Tamanho ' Size '

aux01 = [] 
for i in range(4): 
    m = (df['size'] == i).sum()/(df.shape[0])
    aux01.append(m)
    media_size= pd.DataFrame(aux01).reset_index()
    media_size.columns = ['Size','Mean']

media_size



KeyError: 'size'

# 5.0 - Adicione as seguintes informações ao conjunto de dados original:
    PlaceID: identificação da localização.
    OSM Type: Open Street Map Type
    Country: Nome do País
    Country Code: Código do país


In [155]:
df4 = df.copy()

In [182]:

df4 = df.copy()

lista = ['country','state','city','neighbourhood','road','house_number']

# Cria linhas vazias
df4['country'] = 'NA'
df4['state'] = 'NA'
df4['city'] = 'NA'
df4['neighbourhood'] = 'NA'
df4['road'] = 'NA'
df4['house_number'] = 'NA'

# Inicializar API
geolocator = Nominatim(user_agent='geoapiExercises')

for i in range(len(df4)):
    
    print('Loop:{} / {}'.format(i,len(df4)))
    
    query = str(df4.loc[i,'lat']) + ','+ str(df4.loc[i,'long'])
    
    # Requisição na API 
    response = geolocator.reverse(query)
    
    # Verificar se tem as informações no json
    for j in lista:
      
      if 'j' in response.raw['address']:
        df4.loc[i,'j'] = response.raw['address']['country']



Loop:0 / 4


KeyError: 'lat'