- **IMPORTACIÓN, EXTRACCIÓN Y CONVERSION DE LOS DATOS DE METADATA**

In [1]:
import pandas as pd

In [59]:
import os
import json

def convert_json_to_parquet(input_folder, output_folder):
    # Crear la carpeta de salida si no existe
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Listar todos los archivos en la carpeta de entrada
    files = os.listdir(input_folder)
    
    for file in files:
        if file.endswith('.json'):
            file_path = os.path.join(input_folder, file)
            try:
                # Leer el archivo JSON línea por línea
                with open(file_path, 'r') as f:
                    data = [json.loads(line) for line in f]
                
                # Convertir a DataFrame
                df = pd.DataFrame(data)
                
                # Convertir a Parquet
                output_file_path = os.path.join(output_folder, file.replace('.json', '.parquet'))
                df.to_parquet(output_file_path)
                print('Converted', file, 'to', output_file_path)
            except ValueError as e:
                print('Error processing', file, ':', e)

# Llamar a la función
convert_json_to_parquet('metadata-sitios', 'metadata-sitios-parquet')

Converted 1.json to metadata-sitios-parquet\1.parquet
Converted 10.json to metadata-sitios-parquet\10.parquet
Converted 11.json to metadata-sitios-parquet\11.parquet
Converted 2.json to metadata-sitios-parquet\2.parquet
Converted 3.json to metadata-sitios-parquet\3.parquet
Converted 4.json to metadata-sitios-parquet\4.parquet
Converted 5.json to metadata-sitios-parquet\5.parquet
Converted 6.json to metadata-sitios-parquet\6.parquet
Converted 7.json to metadata-sitios-parquet\7.parquet
Converted 8.json to metadata-sitios-parquet\8.parquet
Converted 9.json to metadata-sitios-parquet\9.parquet


In [60]:
def merge_parquet_files(input_folder):
    # Listar todos los archivos en la carpeta de entrada
    files = os.listdir(input_folder)
    
    # Filtrar solo los archivos Parquet
    parquet_files = [file for file in files if file.endswith('.parquet')]
    
    # Leer y concatenar todos los archivos Parquet en un solo DataFrame
    dataframes = []
    for file in parquet_files:
        file_path = os.path.join(input_folder, file)
        df = pd.read_parquet(file_path)
        dataframes.append(df)
    
    # Concatenar todos los DataFrames
    merged_df = pd.concat(dataframes, ignore_index=True)
    
    return merged_df

# Llamar a la función
merged_df = merge_parquet_files('metadata-sitios-parquet')
merged_df.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.3883,-83.3571,[Pharmacy],4.9,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 6PM,"[0x88f16e41929435cf:0x5b2532a2885e9ef6, 0x88f1...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.21529,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.29213,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.23293,[Fabric store],3.3,6,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Accessibility': None, 'Activities': None, 'A...",Open ⋅ Closes 5PM,"[0x80c2c8811477253f:0x23a8a492df1918f7, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Accessibility': None, 'Activities': None, 'A...",Open ⋅ Closes 5PM,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...


- Filtrar por "Starbucks"

In [61]:
# Crear una serie booleana que indique las filas que contienen 'starbucks' (case-insensitive)
starbucks_matches = merged_df['name'].str.contains('Starbucks', case=False)

# Contar el número de valores verdaderos (ocurrencias)
starbucks_count = starbucks_matches.sum()

# Imprima el recuento
print(starbucks_count)

3499


In [63]:
#Filtrar el dataframe solo para las filas que contengan "starbucks" en la columna "name"
merged_df['name'] = merged_df['name'].apply(lambda x: x if x is not None else [])


starbucks_merged_df = merged_df[merged_df['name'].apply(lambda x: 'Starbucks' in x)]

# Inspeccionar las primeras filas del DataFrame filtrado
starbucks_merged_df.head(10)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
2759,Starbucks,"Starbucks, 777 Coushatta Dr, Kinder, LA 70648",0x863b157fa8b51d01:0x1e4fe1352f3c5410,Seattle-based coffeehouse chain known for its ...,30.544849,-92.813979,"[Coffee shop, Cafe, Coffee store, Espresso bar]",3.3,3,$$,"[[Thursday, 7AM–7PM], [Friday, 7AM–7PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7AM,"[0x863b14b4c7136f01:0xb153879cf4c9fd95, 0x863b...",https://www.google.com/maps/place//data=!4m2!3...
4689,Starbucks,"Starbucks, 1021 S Highline Pl, Sioux Falls, SD...",0x878eb38c36597305:0xcbf23f742073a95a,Seattle-based coffeehouse chain known for its ...,43.539009,-96.654927,"[Coffee shop, Cafe, Coffee store, Espresso bar]",3.2,18,$$,"[[Wednesday, 7AM–8PM], [Thursday, 7AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 7AM Thu,"[0x878eb38bf781ca91:0xc0710c30260e2429, 0x878e...",https://www.google.com/maps/place//data=!4m2!3...
6767,Starbucks,"Starbucks, 3285 Crosspark Rd, Coralville, IA 5...",0x87e445ce7b5d0903:0x64ef7bd0dd566918,Seattle-based coffeehouse chain known for its ...,41.721587,-91.60453,"[Coffee shop, Cafe, Coffee store, Espresso bar]",4.0,15,$$,"[[Wednesday, 6AM–8PM], [Thursday, 6AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closes soon ⋅ 8PM ⋅ Opens 6AM Thu,"[0x87e441aeb4a25f27:0xb13d1072372fbdd1, 0x87e4...",https://www.google.com/maps/place//data=!4m2!3...
8853,Starbucks,"Starbucks, 9600 Falls of Neuse Rd, Raleigh, NC...",0x89ac57caf94e3281:0x8d8e0a9bcb3797f4,Seattle-based coffeehouse chain known for its ...,35.90497,-78.60109,"[Coffee shop, Cafe, Coffee store, Espresso bar]",4.5,8,$$,"[[Wednesday, 6AM–8PM], [Thursday, 6AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 8PM,,https://www.google.com/maps/place//data=!4m2!3...
9937,Starbucks,"Starbucks, Exit 326 Eastbound, Milepost 324, 6...",0x89c693867ed7de97:0x3f76a336d8d512e0,Seattle-based coffeehouse chain known for its ...,40.083163,-75.4401,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.9,15,$$,"[[Wednesday, 7AM–7PM], [Thursday, 7AM–7PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 7PM,"[0x89c6944bb5799605:0x16d89163c48f625a, 0x89c6...",https://www.google.com/maps/place//data=!4m2!3...
11372,Starbucks,"Starbucks, 2340 Legge Blvd, Winchester, VA 22601",0x89b5eee6325d8599:0xa8b3b69fd1331fe6,Seattle-based coffeehouse chain known for its ...,39.1543,-78.167358,"[Coffee shop, Cafe, Coffee store, Espresso bar]",4.3,18,$$,"[[Wednesday, 8AM–9PM], [Thursday, 8AM–9PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 9PM,"[0x89b5efb977f22785:0x1f1094c8a1599fed, 0x89b5...",https://www.google.com/maps/place//data=!4m2!3...
11964,Starbucks,"Starbucks, Newport FS Town Center, Menifee, CA...",0x80dc9dcbf3081853:0x431bcc44582ddf42,,33.6846,-117.179018,[Coffee shop],4.5,26,,"[[Wednesday, 5AM–9PM], [Thursday, 5AM–9PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 9PM,,https://www.google.com/maps/place//data=!4m2!3...
12625,Starbucks,"Starbucks, 400 Bridge St, Clarkston, WA 99403",0x54a1cac671b3294b:0x4c2626802f30ae27,Seattle-based coffeehouse chain known for its ...,46.421219,-117.04307,[Coffee shop],4.1,13,$$,"[[Wednesday, 5AM–6:30PM], [Thursday, 5AM–6:30P...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 6:30PM,"[0x54a1cac2869f2207:0x448085c25a8e4f7f, 0x54a1...",https://www.google.com/maps/place//data=!4m2!3...
17937,Starbucks,"Starbucks, 8000 York Rd, Towson, MD 21252",0x89c80fbcae20f7ed:0xb468795a07685c27,Seattle-based coffeehouse chain known for its ...,39.394018,-76.606298,[Coffee shop],4.1,4,$$,"[[Wednesday, 8AM–7PM], [Thursday, 8AM–7PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 8AM,"[0x89c80fa598b5e1e3:0xe1a549a39e5cc962, 0x89c8...",https://www.google.com/maps/place//data=!4m2!3...
17975,Starbucks,"Starbucks, 3333 Spartan Rd, Olney, MD 20832",0x89b7d1197df7866f:0xdbd6df3a25b26f97,Seattle-based coffeehouse chain known for its ...,39.150387,-77.065087,"[Coffee shop, Cafe, Coffee store, Espresso bar]",4.2,18,$$,"[[Wednesday, 6AM–8PM], [Thursday, 6AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 6AM,"[0x89b7d119f6fe318f:0x7c4c059aad2fefac, 0x89b7...",https://www.google.com/maps/place//data=!4m2!3...


In [70]:
import os
output_folder = 'metadata-sitios-parquet'
output_file = os.path.join(output_folder, 'Metadata_Starbucks.parquet')

# Crear la carpeta de salida si no existe
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Convertir las listas en las columnas a cadenas
for column in starbucks_merged_df.columns:
    if starbucks_merged_df[column].apply(lambda x: isinstance(x, list)).any():
        starbucks_merged_df[column] = starbucks_merged_df[column].apply(lambda x: str(x) if isinstance(x, list) else x)

# Exportar el DataFrame combinado a un archivo Parquet
starbucks_merged_df.to_parquet(output_file)
print('Dataframe mergeado exportado a', output_file)

Dataframe mergeado exportado a metadata-sitios-parquet\Metadata_Starbucks.parquet


- Filtrar por "Dunkin"

In [71]:
# Crear una serie booleana que indique las filas que contienen 'starbucks' (case-insensitive)
dunkin_matches = merged_df['name'].str.contains('Dunkin', case=False)

# Contar el número de valores verdaderos (ocurrencias)
dunkin_matches_count = dunkin_matches.sum()

# Imprima el recuento
print(dunkin_matches_count)

2182


In [73]:
#Filtrar el dataframe solo para las filas que contengan "starbucks" en la columna "name"
merged_df['name'] = merged_df['name'].apply(lambda x: x if x is not None else [])


dunkin_merged_df = merged_df[merged_df['name'].apply(lambda x: 'Dunkin' in x)]

# Inspeccionar las primeras filas del DataFrame filtrado
dunkin_merged_df.head(10)

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
535,Dunkin Bridge,"Dunkin Bridge, Yale, OK 74085",0x87b16b690c76dc71:0xdf78fabac3bdaa5f,,36.044505,-96.820584,[Bridge],5.0,5,,,,,"[0x87b0dfddc4e496ad:0x295ee748aa3bdf41, 0x87b1...",https://www.google.com/maps/place//data=!4m2!3...
742,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",0x89c261f60bdf13db:0x38da730e4687a97b,Long-running chain serving signature breakfast...,40.763985,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
2139,Dunkin,"Dunkin, 1132 Mineral Spring Ave, North Provide...",0x89e44489cbeccc03:0xd3b75bf4e9a39824,Long-running chain serving signature breakfast...,41.867869,-71.428798,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.9,8,$,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89e445c7c4df7a27:0x43ad29caf35d3302, 0x89e4...",https://www.google.com/maps/place//data=!4m2!3...
7003,Dunkin',"Dunkin', 525 Pleasant Valley Ave, Mt Laurel To...",0x89c1352001dc66d1:0xb8ca54f815dbb1bf,Long-running chain serving signature breakfast...,39.948164,-74.949908,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",4.1,8,$,"[[Wednesday, 5AM–8PM], [Thursday, 5AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closes soon ⋅ 8PM ⋅ Opens 5AM Thu,,https://www.google.com/maps/place//data=!4m2!3...
14129,Dunkin',"Dunkin', In Stop & Shop, 380 Main Ave, Norwalk...",0x89e81daec8b2f445:0x6fb1428534e11ad0,Long-running chain serving signature breakfast...,41.140586,-73.423947,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.9,15,$,"[[Wednesday, 6AM–8PM], [Thursday, 6AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 8PM,"[0x89e81dae24931a4f:0x90566a736c61470d, 0x89e8...",https://www.google.com/maps/place//data=!4m2!3...
20538,Dunkin',"Dunkin', 205 Wheeler Rd, Hauppauge, NY 11788",0x89e8311761d696e5:0xdcd22b9e69edba24,Long-running chain serving signature breakfast...,40.803367,-73.212653,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.9,16,$,"[[Wednesday, 5AM–8PM], [Thursday, 5AM–8PM], [F...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 5AM,,https://www.google.com/maps/place//data=!4m2!3...
25600,Dunkin',"Dunkin', In Walmart, 615 Meadow St, Littleton,...",0x4cb468bf747bd0a1:0xa7d59087af3aa8a,Long-running chain serving signature breakfast...,44.304694,-71.798561,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",2.9,27,$,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x4cb46f2dffbff059:0x2726c2de459ce710, 0x4cb4...",https://www.google.com/maps/place//data=!4m2!3...
27108,Dunkin Bridge,"Dunkin Bridge, Yale, OK 74085",0x87b16b690c76dc71:0xdf78fabac3bdaa5f,,36.044505,-96.820584,[Bridge],5.0,5,,,,,"[0x87b0dfddc4e496ad:0x295ee748aa3bdf41, 0x87b1...",https://www.google.com/maps/place//data=!4m2!3...
27315,Dunkin',"Dunkin', 4008 Bell Blvd, Queens, NY 11361",0x89c261f60bdf13db:0x38da730e4687a97b,Long-running chain serving signature breakfast...,40.763985,-73.77143,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.5,8,$,"[[Thursday, 6AM–7PM], [Friday, 6AM–7PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 7PM,"[0x89c3ab9229879ec3:0x3f4b2b46d7d2c503, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
28712,Dunkin,"Dunkin, 1132 Mineral Spring Ave, North Provide...",0x89e44489cbeccc03:0xd3b75bf4e9a39824,Long-running chain serving signature breakfast...,41.867869,-71.428798,"[Coffee shop, Bagel shop, Bakery, Breakfast re...",3.9,8,$,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89e445c7c4df7a27:0x43ad29caf35d3302, 0x89e4...",https://www.google.com/maps/place//data=!4m2!3...


In [74]:
# Obtener el número de filas del DataFrame
num_filas = dunkin_merged_df.shape[0]

# Imprimir el número de filas
print('El DataFrame dunkin_merged_df tiene', num_filas, 'filas.')

El DataFrame dunkin_merged_df tiene 2180 filas.


In [75]:
import os
output_folder = 'metadata-sitios-parquet'
output_file = os.path.join(output_folder, 'Metadata_Dunkin.parquet')

# Crear la carpeta de salida si no existe
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Convertir las listas en las columnas a cadenas
for column in dunkin_merged_df.columns:
    if dunkin_merged_df[column].apply(lambda x: isinstance(x, list)).any():
        dunkin_merged_df[column] = dunkin_merged_df[column].apply(lambda x: str(x) if isinstance(x, list) else x)

# Exportar el DataFrame combinado a un archivo Parquet
dunkin_merged_df.to_parquet(output_file)
print('Dataframe mergeado exportado a', output_file)

Dataframe mergeado exportado a metadata-sitios-parquet\Metadata_Dunkin.parquet


- **IMPORTACIÓN, EXTRACCIÓN Y CONVERSIÓN DE LOS DATOS DE REVIEWS-ESTADOS.**

-Aplicamos una funcion para convertir los archivos a parquet de cada carpeta de reviews-estados(los 51 estados), creamos una carpeta nueva y almacenamos alli los mismos.

In [3]:
import os
import pandas as pd
import json

def direct_json_to_parquet():
    # Crear la carpeta de destino si no existe
    output_folder = 'reviews-estados-parquet'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Listar todas las carpetas dentro de 'reviews-estados'
    base_folder = 'reviews-estados'
    folders = [f for f in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, f))]
    
    for folder in folders:
        folder_path = os.path.join(base_folder, folder)
        # Listar archivos JSON en la carpeta
        json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]
        
        # Leer y combinar archivos JSON
        df_list = []
        for json_file in json_files:
            file_path = os.path.join(folder_path, json_file)
            with open(file_path, 'r') as file:
                data = [json.loads(line) for line in file if line.strip()]
            df = pd.DataFrame(data)
            df_list.append(df)
        
        # Concatenar todos los DataFrames
        combined_df = pd.concat(df_list, ignore_index=True)
        
        # Convertir a Parquet y guardar
        parquet_file = os.path.join(output_folder, folder + '.parquet')
        combined_df.to_parquet(parquet_file)
        print('Converted and saved:', parquet_file)

# Ejecutar la función
direct_json_to_parquet()

- **MERGE DE LA DATA DE METADATA CON REVIEWS-ESTADOS PARA STARBUCKS Y DUNKIN.**

-Aplicamos una funcion, para mergear todos lo archivos parquet de los reviews de cada estado con la data filtrada de starbucks de metadata. Lo almacenamos en una carpeta nueva 'Starbucks-Google-Reviews'.

In [79]:
import os
import pandas as pd

def merge_with_metadata_and_save():
    # Cargar el dataset de metadata
    metadata_starbucks_df = pd.read_parquet('metadata-sitios-parquet/Metadata_Starbucks.parquet')
    
    # Crear la carpeta de destino si no existe
    output_folder = 'Starbucks-Google-Reviews'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Listar todos los archivos Parquet en la carpeta 'reviews-estados-prueba'
    input_folder = 'reviews-estados-parquet'
    parquet_files = [f for f in os.listdir(input_folder) if f.endswith('.parquet')]
    
    for parquet_file in parquet_files:
        file_path = os.path.join(input_folder, parquet_file)
        
        # Cargar el archivo Parquet
        reviews_df = pd.read_parquet(file_path)
        
        # Joinear los datasets en 'gmap_id'
        merged_df = pd.merge(metadata_starbucks_df, reviews_df, on='gmap_id')
        
        # Generar el nombre del archivo de salida basado en el nombre del archivo de entrada
        state_name = parquet_file.replace('.parquet', '').replace('review-', '').replace('_', ' ')
        output_file = os.path.join(output_folder, state_name + ' Starbucks Reviews.parquet')
        
        # Guardar el DataFrame resultante en un nuevo archivo Parquet
        merged_df.to_parquet(output_file)
        print('Merged and saved:', output_file)

# Ejecutar la función
merge_with_metadata_and_save()

Merged and saved: Starbucks-Google-Reviews\Alabama Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Alaska Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Arizona Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Arkansas Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\California Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Colorado Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Connecticut Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Delaware Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\District of Columbia Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Florida Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Georgia Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Hawaii Starbucks Reviews.parquet
Merged and saved: Starbucks-Google-Reviews\Idaho Starbu

KeyboardInterrupt: 

-Mergear todos los parquet de Starbucks reviews y guardarlo en un unico archivo parquet en la carpeta "Starbucks-Google-Reviews".

In [None]:
import os
import pandas as pd

def merge_all_reviews():
    # Listar todos los archivos Parquet en la carpeta 'Starbucks-Google-Reviews'
    input_folder = 'Starbucks-Google-Reviews'
    parquet_files = [f for f in os.listdir(input_folder) if f.endswith('.parquet')]
    
    # Leer y combinar todos los archivos Parquet
    df_list = []
    for parquet_file in parquet_files:
        file_path = os.path.join(input_folder, parquet_file)
        df = pd.read_parquet(file_path)
        df_list.append(df)
    
    # Concatenar todos los DataFrames
    combined_df = pd.concat(df_list, ignore_index=True)
    
    # Guardar el DataFrame combinado en un solo archivo Parquet
    output_file = os.path.join(input_folder, 'Google_Starbucks_Reviews.parquet')
    combined_df.to_parquet(output_file)
    print('Google_Starbucks_Reviews guardado en:', output_file)

# Ejecutar la función
merge_all_reviews()

-Hacemos los mismos pasos pero con "dunkin".

In [81]:
import os
import pandas as pd

def merge_with_metadata_and_save():
    # Cargar el dataset de metadata
    metadata_starbucks_df = pd.read_parquet('metadata-sitios-parquet/Metadata_Dunkin.parquet')
    
    # Crear la carpeta de destino si no existe
    output_folder = 'Dunkin-Google-Reviews'
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Listar todos los archivos Parquet en la carpeta 'reviews-estados-prueba'
    input_folder = 'reviews-estados-parquet'
    parquet_files = [f for f in os.listdir(input_folder) if f.endswith('.parquet')]
    
    for parquet_file in parquet_files:
        file_path = os.path.join(input_folder, parquet_file)
        
        # Cargar el archivo Parquet
        reviews_df = pd.read_parquet(file_path)
        
        # Joinear los datasets en 'gmap_id'
        merged_df = pd.merge(metadata_starbucks_df, reviews_df, on='gmap_id')
        
        # Generar el nombre del archivo de salida basado en el nombre del archivo de entrada
        state_name = parquet_file.replace('.parquet', '').replace('review-', '').replace('_', ' ')
        output_file = os.path.join(output_folder, state_name + ' Dunkin Reviews.parquet')
        
        # Guardar el DataFrame resultante en un nuevo archivo Parquet
        merged_df.to_parquet(output_file)
        print('Merged and saved:', output_file)

# Ejecutar la función
merge_with_metadata_and_save()

Merged and saved: Dunkin-Google-Reviews\Alabama Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Alaska Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Arizona Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Arkansas Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\California Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Colorado Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Connecticut Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Delaware Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\District of Columbia Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Florida Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Georgia Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Hawaii Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Idaho Dunkin Reviews.parquet
Merged and saved: Dunkin-Google-Reviews\Illinois Dunkin Re

In [82]:
import os
import pandas as pd

def merge_all_reviews():
    # Listar todos los archivos Parquet en la carpeta 'Starbucks-Google-Reviews'
    input_folder = 'Dunkin-Google-Reviews'
    parquet_files = [f for f in os.listdir(input_folder) if f.endswith('.parquet')]
    
    # Leer y combinar todos los archivos Parquet
    df_list = []
    for parquet_file in parquet_files:
        file_path = os.path.join(input_folder, parquet_file)
        df = pd.read_parquet(file_path)
        df_list.append(df)
    
    # Concatenar todos los DataFrames
    combined_df = pd.concat(df_list, ignore_index=True)
    
    # Guardar el DataFrame combinado en un solo archivo Parquet
    output_file = os.path.join(input_folder, 'Google_Dunkin_Reviews.parquet')
    combined_df.to_parquet(output_file)
    print('Google_Dunkin_Reviews guardado en:', output_file)

# Ejecutar la función
merge_all_reviews()

Google_Dunkin_Reviews guardado en: Dunkin-Google-Reviews\Google_Dunkin_Reviews.parquet


- **MANIPULACIÓN DE DATOS NULOS.**

In [31]:
starbucks_reviews = pd.read_parquet('Starbucks-Google-Reviews/Google_Starbucks_Reviews.parquet')

In [32]:
starbucks_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313886 entries, 0 to 313885
Data columns (total 22 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   name_x            313886 non-null  object 
 1   address           313886 non-null  object 
 2   gmap_id           313886 non-null  object 
 3   description       309360 non-null  object 
 4   latitude          313886 non-null  float64
 5   longitude         313886 non-null  float64
 6   category          313886 non-null  object 
 7   avg_rating        313886 non-null  float64
 8   num_of_reviews    313886 non-null  int64  
 9   price             310002 non-null  object 
 10  hours             294788 non-null  object 
 11  MISC              313858 non-null  object 
 12  state             294880 non-null  object 
 13  relative_results  297560 non-null  object 
 14  url               313886 non-null  object 
 15  user_id           313886 non-null  object 
 16  name_y            31

In [33]:
starbucks_reviews.isnull().sum()

name_x                   0
address                  0
gmap_id                  0
description           4526
latitude                 0
longitude                0
category                 0
avg_rating               0
num_of_reviews           0
price                 3884
hours                19098
MISC                    28
state                19006
relative_results     16326
url                      0
user_id                  0
name_y                   0
time                     0
rating                   0
text                163602
pics                308142
resp                313802
dtype: int64

-Se eliminan las columnas que tienen una cantidad relevante de nulos, en este caso "pics" y "resp".

In [34]:
starbucks_reviews = starbucks_reviews.drop(columns=['pics', 'resp'])
print('Columnas pics y resp han sido eliminadas.')

Columnas pics y resp han sido eliminadas.


- **TRANSFORMACIÓN DE COLUMNAS**

-Le cambiamos el nombre a la columna "state" por "open-close".

In [35]:
# Rename the column 'state' to 'open-close'
starbucks_reviews = starbucks_reviews.rename(columns={'state': 'open-close'})
print('La columna state ha sido renombrado como open-close.')

La columna state ha sido renombrado como open-close.


-Creamos una nueva columna llamada "state" con las siglas de cada estado.

In [36]:


#Lista de abreviaturas estatales
state_abbreviations = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY', 'DC']

# Función para extraer la abreviatura del estado de "address"
def extract_state(address):
    for state in state_abbreviations:
        if state in address:
            return state
    return None

# Aplicar la función para crear la nueva columna
starbucks_reviews['state'] = starbucks_reviews['address'].apply(extract_state)


In [37]:
null_count = starbucks_reviews['state'].isnull().sum()
print('Cantidad de valores nulos en la columna state:', null_count)

Cantidad de valores nulos en la columna state: 76


In [38]:
# Filtrar el dataframe para mostrar las filas donde 'state' es nulo
df_nulos = starbucks_reviews[starbucks_reviews['state'].isnull()]

# Mostrar el head del dataframe filtrado
df_nulos.head()

Unnamed: 0,name_x,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,MISC,open-close,relative_results,url,user_id,name_y,time,rating,text,state
152391,Starbucks,"Starbucks, 5296 Kings Plaza Unit 0K2, 11234",0x89c25cbb98d8d5a5:0xe877a4885703cf3b,Seattle-based coffeehouse chain known for its ...,40.610002,-73.92086,"[Coffee shop, Breakfast restaurant, Cafe, Coff...",3.9,48,$$,...,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89c25be97857f285:0x408adc5000c63fc, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...,109252223292842956988,AliciaRenee,1604233934274,5,Quick stop and shop Starbucks. My Chai tea was...,
152392,Starbucks,"Starbucks, 5296 Kings Plaza Unit 0K2, 11234",0x89c25cbb98d8d5a5:0xe877a4885703cf3b,Seattle-based coffeehouse chain known for its ...,40.610002,-73.92086,"[Coffee shop, Breakfast restaurant, Cafe, Coff...",3.9,48,$$,...,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89c25be97857f285:0x408adc5000c63fc, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...,109865412095765831400,Tricia Bless,1600725466674,4,Staff is friendly. Food is expensive.,
152393,Starbucks,"Starbucks, 5296 Kings Plaza Unit 0K2, 11234",0x89c25cbb98d8d5a5:0xe877a4885703cf3b,Seattle-based coffeehouse chain known for its ...,40.610002,-73.92086,"[Coffee shop, Breakfast restaurant, Cafe, Coff...",3.9,48,$$,...,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89c25be97857f285:0x408adc5000c63fc, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...,108245242730935605788,Yehuda Rand,1604267585104,5,Fast and easy,
152394,Starbucks,"Starbucks, 5296 Kings Plaza Unit 0K2, 11234",0x89c25cbb98d8d5a5:0xe877a4885703cf3b,Seattle-based coffeehouse chain known for its ...,40.610002,-73.92086,"[Coffee shop, Breakfast restaurant, Cafe, Coff...",3.9,48,$$,...,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89c25be97857f285:0x408adc5000c63fc, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...,105773100329086681291,Chris Podvin,1568695752561,3,Need more Saturday staff. Need some kind of ai...,
152395,Starbucks,"Starbucks, 5296 Kings Plaza Unit 0K2, 11234",0x89c25cbb98d8d5a5:0xe877a4885703cf3b,Seattle-based coffeehouse chain known for its ...,40.610002,-73.92086,"[Coffee shop, Breakfast restaurant, Cafe, Coff...",3.9,48,$$,...,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89c25be97857f285:0x408adc5000c63fc, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...,109941687183369723471,Sinatra Young,1570840574349,5,Fast service polite staff,


In [39]:
starbucks_reviews.head()

Unnamed: 0,name_x,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,MISC,open-close,relative_results,url,user_id,name_y,time,rating,text,state
0,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,100107003653040726165,Jacob McCalpin,1505996531691,5,Chanel is the greatest barista of all time. I'...,AL
1,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,108921061266588850634,Alex Z,1538579468609,2,The food is always warm and delicious but the ...,AL
2,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,115087327175786879005,James Drummond,1557117732370,1,The location is a franchise of sorts operated ...,AL
3,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,103797448577708424762,Matthew Pearson,1555686635302,1,Go to the one in Sterne. This place is a mess....,AL
4,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,104674782787422072897,Craig Winn,1534647989256,5,Open early and well staffed.,AL


- **CONVERSIÓN DE TIPOS DE DATOS.**

-Convertimos la columna "time" a valores reales de fecha y hora.

In [40]:
import datetime
import pandas as pd

# Función para convertir timestamp en milisegundos a formato legible
def convertir_timestamp(timestamp_ms):
    if pd.isna(timestamp_ms):
        return None
    try:
        timestamp_s = float(timestamp_ms) / 1000
        date_time = datetime.datetime.fromtimestamp(timestamp_s)
        return date_time.strftime('%Y-%m-%d %H:%M:%S')
    except Exception as e:
        print(f"Error converting timestamp: {timestamp_ms}, error: {e}")
        return None

# Mostrar los primeros valores de la columna 'time' antes de la conversión
print(starbucks_reviews['time'].head())

# Convertir la columna 'time' a tipo numérico
starbucks_reviews['time'] = pd.to_numeric(starbucks_reviews['time'], errors='coerce')

# Aplicar la función a la columna 'time'
starbucks_reviews['time'] = starbucks_reviews['time'].apply(convertir_timestamp)

# Mostrar el head del dataframe para verificar la conversión
starbucks_reviews.head()

0    1505996531691
1    1538579468609
2    1557117732370
3    1555686635302
4    1534647989256
Name: time, dtype: int64


Unnamed: 0,name_x,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,...,MISC,open-close,relative_results,url,user_id,name_y,time,rating,text,state
0,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,100107003653040726165,Jacob McCalpin,2017-09-21 09:22:11,5,Chanel is the greatest barista of all time. I'...,AL
1,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,108921061266588850634,Alex Z,2018-10-03 12:11:08,2,The food is always warm and delicious but the ...,AL
2,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,115087327175786879005,James Drummond,2019-05-06 01:42:12,1,The location is a franchise of sorts operated ...,AL
3,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,103797448577708424762,Matthew Pearson,2019-04-19 12:10:35,1,Go to the one in Sterne. This place is a mess....,AL
4,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,$$,...,{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,104674782787422072897,Craig Winn,2018-08-19 00:06:29,5,Open early and well staffed.,AL


-Analizamos la columna "price" porque parece tener valores irrelevantes.

In [41]:
# Contar cuántas veces se repite el valor '$$' en la columna 'price' sin nada extra
count_exact_dollar_signs = starbucks_reviews[starbucks_reviews['price'] == '$$'].shape[0]

# Mostrar el resultado
print('El valor $$ se repite', count_exact_dollar_signs, 'veces en la columna price sin nada extra')

El valor $$ se repite 242892 veces en la columna price sin nada extra


In [42]:
# Mostrar los valores únicos en la columna 'price' que no sean '$$'
unique_prices_not_dollar = starbucks_reviews['price'].unique()
unique_prices_not_dollar = [price for price in unique_prices_not_dollar if price != '$$']

# Mostrar los primeros valores únicos que no sean '$$'
print(unique_prices_not_dollar[:10])

['₩₩', None, '₩']


In [43]:
# Eliminar la columna 'price' del dataframe
starbucks_reviews.drop(columns=['price'], inplace=True)

In [44]:
starbucks_reviews.head()

Unnamed: 0,name_x,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,hours,MISC,open-close,relative_results,url,user_id,name_y,time,rating,text,state
0,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,100107003653040726165,Jacob McCalpin,2017-09-21 09:22:11,5,Chanel is the greatest barista of all time. I'...,AL
1,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,108921061266588850634,Alex Z,2018-10-03 12:11:08,2,The food is always warm and delicious but the ...,AL
2,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,115087327175786879005,James Drummond,2019-05-06 01:42:12,1,The location is a franchise of sorts operated ...,AL
3,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,103797448577708424762,Matthew Pearson,2019-04-19 12:10:35,1,Go to the one in Sterne. This place is a mess....,AL
4,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",0x88891beed225fed1:0x3c63ad3e69972d22,Seattle-based coffeehouse chain known for its ...,33.501601,-86.807263,"[Coffee shop, Cafe, Coffee store, Espresso bar]",2.6,18,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...,104674782787422072897,Craig Winn,2018-08-19 00:06:29,5,Open early and well staffed.,AL


-Renombramos las columnas "name_x" por "name" y "name_y" por "user_name"

In [45]:
starbucks_reviews.rename(columns={'name_x': 'name', 'name_y': 'user_name'}, inplace=True)


-Reordenamos las columnas.

In [53]:
ordered_columns = ['name', 'address','state', 'description', 'category', 'user_id', 'user_name', 'rating', 'avg_rating', 'num_of_reviews', 'time', 'text', 'latitude', 'longitude']
remaining_columns = [col for col in starbucks_reviews.columns if col not in ordered_columns]
new_column_order = ordered_columns + remaining_columns

starbucks_reviews = starbucks_reviews[new_column_order]
print('Columns reordered successfully.')
starbucks_reviews.head()

Columns reordered successfully.


Unnamed: 0,name,address,state,description,category,user_id,user_name,rating,avg_rating,num_of_reviews,time,text,latitude,longitude,gmap_id,hours,MISC,open-close,relative_results,url
0,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",100107003653040726165,Jacob McCalpin,5,2.6,18,2017-09-21 09:22:11,Chanel is the greatest barista of all time. I'...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
1,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",108921061266588850634,Alex Z,2,2.6,18,2018-10-03 12:11:08,The food is always warm and delicious but the ...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
2,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",115087327175786879005,James Drummond,1,2.6,18,2019-05-06 01:42:12,The location is a franchise of sorts operated ...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
3,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",103797448577708424762,Matthew Pearson,1,2.6,18,2019-04-19 12:10:35,Go to the one in Sterne. This place is a mess....,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
4,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",104674782787422072897,Craig Winn,5,2.6,18,2018-08-19 00:06:29,Open early and well staffed.,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...


-Verificar que todas las columnas tengan el tipo de dato correcto.

In [54]:
# Convertir la columna 'time' a datetime
# Asegurarse de que 'latitude' y 'longitude' sean floats
# Asegurarse de que 'rating' y 'avg_rating' sean numéricos

# Convertir 'time' a datetime
try:
    starbucks_reviews['time'] = pd.to_datetime(starbucks_reviews['time'])
    print('Columna time convertida a datetime exitosamente.')
except Exception as e:
    print('Error al convertir time a datetime:', e)

# Convertir 'latitude' y 'longitude' a float
try:
    starbucks_reviews['latitude'] = starbucks_reviews['latitude'].astype(float)
    starbucks_reviews['longitude'] = starbucks_reviews['longitude'].astype(float)
    print('Columnas latitude y longitude convertidas a float exitosamente.')
except Exception as e:
    print('Error al convertir latitude y longitude a float:', e)

# Convertir 'rating' y 'avg_rating' a numérico
try:
    starbucks_reviews['rating'] = pd.to_numeric(starbucks_reviews['rating'], errors='coerce')
    starbucks_reviews['avg_rating'] = pd.to_numeric(starbucks_reviews['avg_rating'], errors='coerce')
    print('Columnas rating y avg_rating convertidas a numérico exitosamente.')
except Exception as e:
    print('Error al convertir rating y avg_rating a numérico:', e)

# Mostrar los tipos de datos para confirmar los cambios
print(starbucks_reviews.dtypes)

Columna time convertida a datetime exitosamente.
Columnas latitude y longitude convertidas a float exitosamente.
Columnas rating y avg_rating convertidas a numérico exitosamente.
name                        object
address                     object
state                       object
description                 object
category                    object
user_id                     object
user_name                   object
rating                       int64
avg_rating                 float64
num_of_reviews               int64
time                datetime64[ns]
text                        object
latitude                   float64
longitude                  float64
gmap_id                     object
hours                       object
MISC                        object
open-close                  object
relative_results            object
url                         object
dtype: object


-Asegurarse de que los datos estén en un formato consistente

In [55]:
# Normalización de datos: Asegurarse de que las direcciones estén en un formato estándar
import re

# Función para normalizar direcciones
def normalize_address(address):
    address = address.strip().title()  # Eliminar espacios en blanco y poner en formato título
    address = re.sub(r'\s+', ' ', address)  # Reemplazar múltiples espacios por un solo espacio
    return address

# Aplicar la función de normalización a la columna 'address'
starbucks_reviews['address'] = starbucks_reviews['address'].apply(normalize_address)

print('Direcciones normalizadas exitosamente.')
starbucks_reviews['address'].head()

Direcciones normalizadas exitosamente.


0    Starbucks, 1400 University Blvd, Birmingham, A...
1    Starbucks, 1400 University Blvd, Birmingham, A...
2    Starbucks, 1400 University Blvd, Birmingham, A...
3    Starbucks, 1400 University Blvd, Birmingham, A...
4    Starbucks, 1400 University Blvd, Birmingham, A...
Name: address, dtype: object

In [56]:
starbucks_reviews.head()

Unnamed: 0,name,address,state,description,category,user_id,user_name,rating,avg_rating,num_of_reviews,time,text,latitude,longitude,gmap_id,hours,MISC,open-close,relative_results,url
0,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",100107003653040726165,Jacob McCalpin,5,2.6,18,2017-09-21 09:22:11,Chanel is the greatest barista of all time. I'...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
1,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",108921061266588850634,Alex Z,2,2.6,18,2018-10-03 12:11:08,The food is always warm and delicious but the ...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
2,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",115087327175786879005,James Drummond,1,2.6,18,2019-05-06 01:42:12,The location is a franchise of sorts operated ...,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
3,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",103797448577708424762,Matthew Pearson,1,2.6,18,2019-04-19 12:10:35,Go to the one in Sterne. This place is a mess....,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...
4,Starbucks,"Starbucks, 1400 University Blvd, Birmingham, A...",AL,Seattle-based coffeehouse chain known for its ...,"[Coffee shop, Cafe, Coffee store, Espresso bar]",104674782787422072897,Craig Winn,5,2.6,18,2018-08-19 00:06:29,Open early and well staffed.,33.501601,-86.807263,0x88891beed225fed1:0x3c63ad3e69972d22,"[[Saturday, 10AM–1:30PM], [Sunday, Closed], [M...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 10AM,"[0x88891bc276c68cf9:0x669c302b2c5da34e, 0x8889...",https://www.google.com/maps/place//data=!4m2!3...


In [57]:
starbucks_reviews.to_parquet('Starbucks_reviews_limpio.parquet')