In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns
import plotly.express as px 
import ipywidgets as widgets 

from ipywidgets            import fixed
from matplotlib            import gridspec
from matplotlib            import pyplot as plt
from geopy.geocoders       import Nominatim
from IPython.core.display  import HTML
from IPython.display       import Image

In [3]:
# Criação de Botões

def bt_intslider(Val,Min,Max,desc=''):
    bt = widgets.IntSlider( 
        value = Val,
        min = Min,
        max = Max,
        description=desc,
        disable=False,
        orietation='horizontal',
        style= {'description_width': 'initial'}       
    )

# Criação de Botões

def bt_dropdown(data,x='',desc=''):
    
    bt1 = widgets.Dropdown(
    options=data[x].sort_values().unique().tolist(),
    description= desc,
    disabled=False,
    style={'description_width': 'initial'})
    
    return bt1   

# Estatística Descritiva

def num_metricas(num_attributes):
    
  #Central tendencian - mean, median

  ct1 = pd.DataFrame(num_attributes.apply(np.mean)).T
  ct2 = pd.DataFrame(num_attributes.apply(np.median)).T

  #Dispersion - std, min, max, range, skew, kurtosis

  d1 = pd.DataFrame(num_attributes.apply(np.std)).T
  d2 = pd.DataFrame(num_attributes.apply(min)).T
  d3 = pd.DataFrame(num_attributes.apply(max)).T
  d4 = pd.DataFrame(num_attributes.apply(lambda x: x.max() - x.min() )).T
  d5 = pd.DataFrame(num_attributes.apply(lambda x: x.skew() )).T
  d6 = pd.DataFrame(num_attributes.apply(lambda x: x.kurtosis() )).T

  #Concatenate
  metrics = pd.concat([d2, d3, d4, ct1, ct2, d1, d5, d6]).T.reset_index()
  metrics.columns = ['attributes','min', 'max', 'range', 'mean', 'median', 'std', 'skew','kurtosis']
  return metrics


def jupyter_settings():
    
    %matplotlib inline 
    #%pylab inline
    
    plt.style.use('bmh')
    plt.rcParams['figure.figsize']=[20,10]
    plt.rcParams['font.size']=10
    
    display( HTML('<style>.container {width:100% !important; }</style>'))
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option('display.expand_frame_repr',False )
    pd.set_option('display.float_format',lambda x: '%.2f' % x)
    

    sns.set()
    
jupyter_settings()

In [4]:
df_raw = pd.read_csv('../datasets/kc_house_data.csv')
df = df_raw.copy()
df.sample(5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
493,3760000030,20141030T000000,669950.0,5,2.5,2820,14062,2.0,0,0,4,7,2380,440,1960,0,98034,47.71,-122.22,1910,10392
18593,6384500590,20141113T000000,526000.0,3,1.75,1530,6125,1.0,0,0,3,7,1120,410,1958,0,98116,47.57,-122.4,1360,6125
3736,9297301015,20150408T000000,277284.0,3,1.75,1030,4800,1.0,0,0,3,6,930,100,1927,0,98126,47.57,-122.37,1540,4800
10746,7972602435,20150318T000000,287000.0,2,1.0,950,6350,1.0,0,0,3,7,950,0,1951,0,98106,47.53,-122.35,1080,7620
7590,8121100147,20140714T000000,390000.0,3,2.25,1640,2875,2.0,0,0,3,6,1240,400,1983,0,98118,47.57,-122.29,1500,3960


# 2.0 Descrição dos Dados
    # 2.1 Renomear as Colunas

In [5]:
df.columns # Nesse caso não será necessario

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

# 2.2 Dimensão dos Dados

In [6]:
print('Numero de linhas : {}'.format(df.shape[0]))  
print('Numero de colunas : {}'.format(df.shape[1]))  


Numero de linhas : 21613
Numero de colunas : 21


# 2.3 Tipos dos Dados

In [7]:
df.dtypes

id                 int64
date              object
price            float64
bedrooms           int64
bathrooms        float64
sqft_living        int64
sqft_lot           int64
floors           float64
waterfront         int64
view               int64
condition          int64
grade              int64
sqft_above         int64
sqft_basement      int64
yr_built           int64
yr_renovated       int64
zipcode            int64
lat              float64
long             float64
sqft_living15      int64
sqft_lot15         int64
dtype: object

# 2.3.1 Mudança do tipo da variável
    Primeiramente vamos colocar a variável "date" que está como object para o tipo datetime.



In [8]:
df['year'] = pd.to_datetime(df['date']).dt.strftime('%Y')
df['date'] = pd.to_datetime(df['date']).dt.strftime("%m/%d/%Y")
df['week_year'] = pd.to_datetime(df['date']).dt.strftime('%Y-%U')

In [9]:
print(df[['year', 'date', 'week_year']])

       year        date week_year
0      2014  10/13/2014   2014-41
1      2014  12/09/2014   2014-49
2      2015  02/25/2015   2015-08
3      2014  12/09/2014   2014-49
4      2015  02/18/2015   2015-07
5      2014  05/12/2014   2014-19
6      2014  06/27/2014   2014-25
7      2015  01/15/2015   2015-02
8      2015  04/15/2015   2015-15
9      2015  03/12/2015   2015-10
10     2015  04/03/2015   2015-13
11     2014  05/27/2014   2014-21
12     2014  05/28/2014   2014-21
13     2014  10/07/2014   2014-40
14     2015  03/12/2015   2015-10
15     2015  01/24/2015   2015-03
16     2014  07/31/2014   2014-30
17     2014  05/29/2014   2014-21
18     2014  12/05/2014   2014-48
19     2015  04/24/2015   2015-16
20     2014  05/14/2014   2014-19
21     2014  08/26/2014   2014-34
22     2014  07/03/2014   2014-26
23     2014  05/16/2014   2014-19
24     2014  11/20/2014   2014-46
25     2014  11/03/2014   2014-44
26     2014  06/26/2014   2014-25
27     2014  12/01/2014   2014-48
28     2014  0

In [10]:
# Verificar se ha valores nulos 

df.isnull().sum()


id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
year             0
week_year        0
dtype: int64

# 2.5 Estatística Descritiva

In [11]:
# Separando as variáveis em categórica e numérica

num_atributos = df.select_dtypes(include=['int64','float64'])
cat_atributos= df.select_dtypes(include = ['object'])


In [13]:
num_metricas(num_atributos)

Unnamed: 0,attributes,min,max,range,mean,median,std,skew,kurtosis
0,id,1000102.0,9900000190.0,9899000088.0,4580301520.86,3904930410.0,2876499023.43,0.24,-1.26
1,price,75000.0,7700000.0,7625000.0,540088.14,450000.0,367118.7,4.02,34.59
2,bedrooms,0.0,33.0,33.0,3.37,3.0,0.93,1.97,49.06
3,bathrooms,0.0,8.0,8.0,2.11,2.25,0.77,0.51,1.28
4,sqft_living,290.0,13540.0,13250.0,2079.9,1910.0,918.42,1.47,5.24
5,sqft_lot,520.0,1651359.0,1650839.0,15106.97,7618.0,41419.55,13.06,285.08
6,floors,1.0,3.5,2.5,1.49,1.5,0.54,0.62,-0.48
7,waterfront,0.0,1.0,1.0,0.01,0.0,0.09,11.39,127.63
8,view,0.0,4.0,4.0,0.23,0.0,0.77,3.4,10.89
9,condition,1.0,5.0,4.0,3.41,3.0,0.65,1.03,0.53
