In [1]:
import pandas as pd

### Limpiar, filtrar y ordenar el DataFrame business de Yelp - Trabajaremos con datos de la cadena Starbucks

In [143]:
# Read the business table from its Parquet file into a pandas DataFrame.
# NOTE(review): the final cell of this notebook writes its result back to this
# same path, so after one full run the file no longer holds the original input.
business_path = '..//data//business.parquet'
df_business = pd.read_parquet(business_path)

In [144]:
# Keep only the rows whose 'name' contains 'Starbucks'.
# na=False makes rows with a missing name count as non-matches.
is_starbucks = df_business['name'].str.contains('Starbucks', na=False)
df_business = df_business.loc[is_starbucks]

In [145]:
# Expand the per-business 'attributes' dicts into one column per attribute.
# A plain list is passed so the result always starts with a fresh 0..n-1 index,
# regardless of how the installed pandas version treats Series input.
attributes_df = pd.json_normalize(df_business['attributes'].tolist())

# Re-attach the row labels of the filtered frame before joining.
# pd.concat(axis=1) aligns on index labels, and the filtered frame still carries
# its original, non-contiguous labels. Without this step the attributes are
# attached to the wrong businesses and NaN-only rows are introduced for every
# label that exists on one side only.
attributes_df.index = df_business.index

# Replace the dict column with the expanded attribute columns, row for row.
df_business = pd.concat(
    [df_business.drop(columns=['attributes']), attributes_df],
    axis=1,
)

In [146]:
# Lift the column display limit so wide frames are rendered in full,
# then show the first row as a quick sanity check.
pd.options.display.max_columns = None
df_business.head(n=1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,BusinessParking,ByAppointmentOnly,Caters,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,DriveThru,GoodForDancing,GoodForKids,GoodForMeal,HairSpecializesIn,HappyHour,HasTV,Music,NoiseLevel,Open24Hours,OutdoorSeating,RestaurantsAttire,RestaurantsCounterService,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,Smoking,WheelchairAccessible,WiFi
590,6_Zf2QsRHCH1RhbzA9inyw,Starbucks,8602 Hillsborough Ave,Tampa,LA,33615,27.99637,-82.580954,3.0,56.0,1.0,"Coffee & Tea, Food","{'Friday': '5:30-21:0', 'Monday': '5:30-21:0',...",,,,,,,,True,,True,"{'garage': False, 'street': False, 'validated'...",,False,,,,,,,,,,,,,,,True,,,,,2,,,True,,,u'free'


In [147]:
# Add helper columns holding each coordinate rounded to two decimals;
# they are used to build a location-based identifier.
for coord in ('latitude', 'longitude'):
    df_business[f'rounded_{coord}'] = df_business[coord].round(2)

In [148]:
# Build the 'starbucks_id' column as "<state>-<rounded latitude>-<rounded longitude>".
# Each piece is cast to str first so the parts can be joined with '-'.
# A negative coordinate therefore yields a double dash (e.g. "LA-28.0--82.58").
state_part = df_business['state'].astype(str)
lat_part = df_business['rounded_latitude'].astype(str)
lon_part = df_business['rounded_longitude'].astype(str)
df_business['starbucks_id'] = state_part + '-' + lat_part + '-' + lon_part

In [149]:
# The rounded coordinates were only needed to build the identifier; remove them.
helper_columns = ['rounded_latitude', 'rounded_longitude']
df_business = df_business.drop(columns=helper_columns)

In [150]:
# Parse 'postal_code' as a number; values that cannot be parsed become NaN
# (errors='coerce') instead of raising.
# NOTE(review): numeric parsing drops leading zeros (a code such as 08077 would
# come out as 8077) and turns non-numeric postal codes into missing values.
# Confirm that downstream consumers expect this.
df_business['postal_code'] = df_business['postal_code'].pipe(pd.to_numeric, errors='coerce')

In [151]:
# Switch 'postal_code' to pandas' nullable integer dtype so that missing
# values are kept as <NA> rather than forcing a float column.
nullable_int = pd.Int64Dtype()
df_business['postal_code'] = df_business['postal_code'].astype(nullable_int)

In [152]:
# Print the first few 'postal_code' values to check the result of the conversion.
postal_preview = df_business['postal_code'].head()
print(postal_preview)

590     33615
613      <NA>
978      8077
1296    89501
1543     8012
Name: postal_code, dtype: Int64


In [153]:
# Cast the 'city' column to str.
df_business = df_business.astype({'city': str})

In [154]:
# Keep only the rows that have a non-missing 'name'.
has_name = df_business['name'].notna()
df_business = df_business.loc[has_name]

In [156]:
# Replace the leftover row labels with a clean 0..n-1 index,
# discarding the old labels.
fresh_index = pd.RangeIndex(len(df_business))
df_business = df_business.set_axis(fresh_index, axis=0)

In [157]:
# Summarize the cleaned frame: index range, column names, non-null counts and dtypes.
df_business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   business_id                 730 non-null    object 
 1   name                        730 non-null    object 
 2   address                     730 non-null    object 
 3   city                        730 non-null    object 
 4   state                       730 non-null    object 
 5   postal_code                 683 non-null    Int64  
 6   latitude                    730 non-null    float64
 7   longitude                   730 non-null    float64
 8   stars                       730 non-null    float64
 9   review_count                730 non-null    float64
 10  is_open                     730 non-null    float64
 11  categories                  730 non-null    object 
 12  hours                       693 non-null    object 
 13  AcceptsInsurance            0 non-n

In [158]:
# Lift the column display limit so every column is rendered,
# then show the first row of the cleaned frame.
pd.options.display.max_columns = None
df_business.head(n=1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,BusinessParking,ByAppointmentOnly,Caters,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,DriveThru,GoodForDancing,GoodForKids,GoodForMeal,HairSpecializesIn,HappyHour,HasTV,Music,NoiseLevel,Open24Hours,OutdoorSeating,RestaurantsAttire,RestaurantsCounterService,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,Smoking,WheelchairAccessible,WiFi,starbucks_id
0,6_Zf2QsRHCH1RhbzA9inyw,Starbucks,8602 Hillsborough Ave,Tampa,LA,33615,27.99637,-82.580954,3.0,56.0,1.0,"Coffee & Tea, Food","{'Friday': '5:30-21:0', 'Monday': '5:30-21:0',...",,,,,,,,True,,True,"{'garage': False, 'street': False, 'validated'...",,False,,,,,,,,,,,,,,,True,,,,,2,,,True,,,u'free',LA-28.0--82.58


In [159]:
# Write the cleaned DataFrame to a Parquet file.
# NOTE(review): this is the same path the notebook loads from, so the input
# file is overwritten with the filtered result. A second run would then read a
# frame that no longer has the 'attributes' column and fail when expanding it.
# Consider writing to a separate output file once downstream readers are confirmed.
output_path = '..//data//business.parquet'
df_business.to_parquet(output_path)