In [1]:
import pandas as pd

### Limpíar, Filtrar y ordenar DF business de yealp - Trabajaremos con data de la competencia de Starbucks

In [3]:
# Cargar el archivo .pkl
df = pd.read_pickle('..//data//business.pkl')

# Eliminar columnas duplicadas
df = df.loc[:, ~df.columns.duplicated()]

# Guardar el DataFrame en formato .parquet
df.to_parquet('..//data//business_competence.parquet')

In [29]:
# Carga el archivo .parquet en un DataFrame de Pandas
df_business = pd.read_parquet('..//data//business_competence.parquet')

In [11]:
# Filtrar el DataFrame para incluir solo las filas donde 'name' contiene 'Dunkin' 
df_business_dunkin = df_business[df_business['name'].str.contains('Dunkin', na=False)]

In [34]:
# Convertir la columna 'attributes' que contiene diccionarios en una serie de columnas
attributes_df = pd.json_normalize(df_business_dunkin['attributes'])

# Concatenar el DataFrame original con el DataFrame de atributos
df_business_dunkin = pd.concat([df_business_dunkin.drop(columns=['attributes']), attributes_df], axis=1)

In [37]:
# Convertir la columna 'postal_code' a tipo numérico (float)
df_business_dunkin['postal_code'] = pd.to_numeric(df_business_dunkin['postal_code'], errors='coerce')

In [38]:
# Convertir a tipo int, ignorando los NaN
df_business_dunkin['postal_code'] = df_business_dunkin['postal_code'].astype('Int64')

In [39]:
# Verificar los primeros registros de la columna 'postal_code' después de la conversión
print(df_business_dunkin['postal_code'].head())

556    33777
572    46227
728    33511
873     8075
939     8002
Name: postal_code, dtype: Int64


In [40]:
# Convertir la columna 'city' a tipo str
df_business_dunkin['city'] = df_business_dunkin['city'].astype(str)

In [41]:
# Eliminar las filas donde la columna 'name' tiene valores NaN
df_business_dunkin = df_business_dunkin.dropna(subset=['name'])

In [42]:
# Restablecer el índice
df_business_dunkin = df_business_dunkin.reset_index(drop=True)

In [43]:
df_business_dunkin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 52 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   business_id                 550 non-null    object 
 1   name                        550 non-null    object 
 2   address                     550 non-null    object 
 3   city                        550 non-null    object 
 4   state                       550 non-null    object 
 5   postal_code                 550 non-null    Int64  
 6   latitude                    550 non-null    float64
 7   longitude                   550 non-null    float64
 8   stars                       550 non-null    float64
 9   review_count                550 non-null    float64
 10  is_open                     550 non-null    float64
 11  categories                  550 non-null    object 
 12  hours                       525 non-null    object 
 13  AcceptsInsurance            0 non-n

In [44]:
# Configurar pandas para mostrar todas las columnas
pd.set_option('display.max_columns', None)
df_business_dunkin.head(1)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours,AcceptsInsurance,AgesAllowed,Alcohol,Ambience,BYOB,BYOBCorkage,BestNights,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,BusinessParking,ByAppointmentOnly,Caters,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,DriveThru,GoodForDancing,GoodForKids,GoodForMeal,HairSpecializesIn,HappyHour,HasTV,Music,NoiseLevel,Open24Hours,OutdoorSeating,RestaurantsAttire,RestaurantsCounterService,RestaurantsDelivery,RestaurantsGoodForGroups,RestaurantsPriceRange2,RestaurantsReservations,RestaurantsTableService,RestaurantsTakeOut,Smoking,WheelchairAccessible,WiFi
0,vxrGNnuEef7YCfB7mN08rA,Dunkin',8150 Bryan Dairy Rd,Pinellas Park,AZ,33777,27.871828,-82.750285,2.0,8.0,1.0,"Coffee & Tea, Donuts, Food","{'Friday': '5:0-22:0', 'Monday': '5:0-22:0', '...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [45]:
# Guardar el DataFrame como archivo Parquet
df_business_dunkin.to_parquet('..//data//business_dunkin.parquet')