In [1]:
import pandas as pd
import json
import numpy as np

In [2]:
df1 = pd.DataFrame({'id': [1,2,3], 'name': ['A', 'B', 'C']})
df2 = pd.DataFrame({'id': [2,3,4], 'idade':['20', '30', '40']})

In [3]:
print(df1)
print(df2)

   id name
0   1    A
1   2    B
2   3    C
   id idade
0   2    20
1   3    30
2   4    40


In [4]:
df_inner = pd.merge(df1,df2, on='id', how='inner')

df_inner

Unnamed: 0,id,name,idade
0,2,B,20
1,3,C,30


In [5]:
df_left = pd.merge(df1, df2, on='id', how='left')
df_left

Unnamed: 0,id,name,idade
0,1,A,
1,2,B,20.0
2,3,C,30.0


In [6]:
df_right = pd.merge(df1, df2, on='id', how='right')
df_right

Unnamed: 0,id,name,idade
0,2,B,20
1,3,C,30
2,4,,40


In [7]:
df_outer = pd.merge(df1, df2, on='id', how='outer')
df_outer

Unnamed: 0,id,name,idade
0,1,A,
1,2,B,20.0
2,3,C,30.0
3,4,,40.0


In [8]:
df_vendas = pd.DataFrame({
    'loja': ['A', 'A', 'B', 'C', 'B', 'C'],
    'categoria': ['Eletro', 'Comida', 'Eletro', 'comida', 'Eletro', 'Eletro'],
    'valor': [100, 300, 400, 100, 200, 2000],
    'estoque': [12, 32, 15, 1231, 923, 512]
})

df_media = df_vendas.groupby('loja')['valor'].sum()
df_media

df_grouped = df_vendas.groupby(['loja', 'categoria'])['valor'].sum()
df_grouped

loja  categoria
A     Comida        300
      Eletro        100
B     Eletro        600
C     Eletro       2000
      comida        100
Name: valor, dtype: int64

In [9]:
df_grouped = df_vendas.groupby('loja').agg({
    'valor': ['sum', 'mean', 'max'],
    'estoque': ['count', 'min' ]
})

df_grouped

Unnamed: 0_level_0,valor,valor,valor,estoque,estoque
Unnamed: 0_level_1,sum,mean,max,count,min
loja,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A,400,200.0,300,2,12
B,600,300.0,400,2,15
C,2100,1050.0,2000,2,512


In [10]:
with open('data.json', 'r', encoding='utf-8') as f:
    data=json.load(f)

In [11]:
df_raiz = pd.DataFrame([{
    'bid': d.get('business_id'),
    'name': d.get('name'),
    'cidade': d.get('city'),
    'stars': d.get('stars'),
    'categoria': d.get('categoria')
} for d in data])

In [12]:
df_raiz.fillna('Desconhecido', inplace=True)
df_raiz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   name       150346 non-null  object 
 2   cidade     150346 non-null  object 
 3   stars      150346 non-null  float64
 4   categoria  150346 non-null  object 
dtypes: float64(1), object(4)
memory usage: 5.7+ MB


In [13]:
df_atributo = pd.DataFrame ([{
    'bid': d.get('business_id'),
    'latitude': d.get('latitude'),
    'longitude': d.get('longitude')
} for d in data])

df_atributo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   latitude   150346 non-null  float64
 2   longitude  150346 non-null  float64
dtypes: float64(2), object(1)
memory usage: 3.4+ MB


In [14]:
weekdays = ['Sunday','Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

df_horas = pd.DataFrame([{
     'bid': d.get('business_id'),
     **{day: (d.get('hours') or {}).get(day, 'Fechado')   for day in weekdays}
}for d in data])

print(df_horas.head())

                      bid     Sunday    Monday    Tuesday  Wednesday  \
0  Pns2l4eNsfO8kk83dixA6A    Fechado   Fechado    Fechado    Fechado   
1  mpf3x-BjTdTEA3yCZrAYPw    Fechado   0:0-0:0  8:0-18:30  8:0-18:30   
2  tUFrWirKiKi_TAnsVWINQQ   8:0-22:0  8:0-22:0   8:0-22:0   8:0-22:0   
3  MTSW4McQd7CbVtyjqoe9mw   7:0-21:0  7:0-20:0   7:0-20:0   7:0-20:0   
4  mWMc6_wTdE0EUBKIGXDVfA  12:0-18:0   Fechado    Fechado  14:0-22:0   

    Thursday     Friday   Saturday  
0    Fechado    Fechado    Fechado  
1  8:0-18:30  8:0-18:30   8:0-14:0  
2   8:0-22:0   8:0-23:0   8:0-23:0  
3   7:0-20:0   7:0-21:0   7:0-21:0  
4  16:0-22:0  12:0-22:0  12:0-22:0  


In [28]:
df_completa = df_raiz.merge(df_atributo, on='bid', how='left').merge(df_horas, on='bid', how='left')
df_completa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   bid        150346 non-null  object 
 1   name       150346 non-null  object 
 2   cidade     150346 non-null  object 
 3   stars      150346 non-null  float64
 4   categoria  150346 non-null  object 
 5   latitude   150346 non-null  float64
 6   longitude  150346 non-null  float64
 7   Sunday     150346 non-null  object 
 8   Monday     150346 non-null  object 
 9   Tuesday    150346 non-null  object 
 10  Wednesday  150346 non-null  object 
 11  Thursday   150346 non-null  object 
 12  Friday     150346 non-null  object 
 13  Saturday   150346 non-null  object 
dtypes: float64(3), object(11)
memory usage: 16.1+ MB


In [None]:
df_split = df_completa.assign(categoria=df_completa["categoria"].str.split(", "))
df_split.head()

Unnamed: 0,bid,name,cidade,stars,categoria,latitude,longitude,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ",Santa Barbara,5.0,[Desconhecido],34.426679,-119.711197,Fechado,Fechado,Fechado,Fechado,Fechado,Fechado,Fechado
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,Affton,3.0,[Desconhecido],38.551126,-90.335695,Fechado,0:0-0:0,8:0-18:30,8:0-18:30,8:0-18:30,8:0-18:30,8:0-14:0
2,tUFrWirKiKi_TAnsVWINQQ,Target,Tucson,3.5,[Desconhecido],32.223236,-110.880452,8:0-22:0,8:0-22:0,8:0-22:0,8:0-22:0,8:0-22:0,8:0-23:0,8:0-23:0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,Philadelphia,4.0,[Desconhecido],39.955505,-75.155564,7:0-21:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-20:0,7:0-21:0,7:0-21:0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,Green Lane,4.5,[Desconhecido],40.338183,-75.471659,12:0-18:0,Fechado,Fechado,14:0-22:0,16:0-22:0,12:0-22:0,12:0-22:0


In [35]:
df_counts = df_split['categoria'].value_counts()
df_counts.head(20)

categoria
[Desconhecido]    150346
Name: count, dtype: int64