In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json

### Cargar business_dunkin.parquet

In [6]:
# Read the Dunkin business table from its Parquet file into a pandas DataFrame
business_parquet_path = '..//data//business_dunkin.parquet'
df_business_dunkin = pd.read_parquet(business_parquet_path)

### Preparar el archivo review_dunkin.parquet

In [2]:
def convert_and_split_json_to_parquet(json_file_path, output_prefix, num_parts):
    """Split a JSON Lines file into at most ``num_parts`` Parquet files.

    The input is read twice: once to count its non-blank lines and once to
    parse them. Records are distributed as evenly as possible, so exactly
    ``min(num_parts, record_count)`` files are written; when the record count
    is not a multiple of ``num_parts`` the first parts hold one extra record.
    Part ``i`` (starting at 1) is written to ``<output_prefix>_part<i>.parquet``
    and a confirmation line is printed for each file.

    Args:
        json_file_path: Path to a UTF-8 text file with one JSON document per line.
        output_prefix: Prefix (optionally including a directory) of the output files.
        num_parts: Maximum number of Parquet files to produce; must be >= 1.

    Raises:
        ValueError: If ``num_parts`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if num_parts < 1:
        raise ValueError("num_parts must be a positive integer")

    with open(json_file_path, 'r', encoding='utf-8') as file:
        # First pass: count the records. Blank lines (e.g. a trailing newline)
        # are not records and must not be fed to json.loads.
        total_lines = sum(1 for line in file if line.strip())
        file.seek(0)

        # Spread the remainder over the first parts instead of using a plain
        # floor division: that avoids a zero chunk size (modulo by zero) when
        # there are fewer records than parts, and avoids an unexpected extra
        # part when the count is not a multiple of num_parts.
        base_size, remainder = divmod(total_lines, num_parts)

        part_count = 0
        target_size = base_size + (1 if part_count < remainder else 0)
        records = []

        for line in file:
            line = line.strip()
            if not line:
                continue

            records.append(json.loads(line))

            # Flush once the current part has received its share of records.
            if len(records) == target_size:
                part_file_path = f"{output_prefix}_part{part_count + 1}.parquet"

                # Convert the buffered records to a DataFrame, then to an
                # Arrow table, and write that table as a Parquet file.
                df_chunk = pd.DataFrame(records)
                table = pa.Table.from_pandas(df_chunk)
                pq.write_table(table, part_file_path)
                print(f"Part {part_count + 1} written to {part_file_path}")

                # Start the next part with an empty buffer and its own target.
                records = []
                part_count += 1
                target_size = base_size + (1 if part_count < remainder else 0)

In [3]:
# Split the raw review dump into five Parquet parts next to this notebook
convert_and_split_json_to_parquet(
    json_file_path='..//data//review.json',
    output_prefix='review_output',
    num_parts=5,
)

Part 1 written to review_output_part1.parquet
Part 2 written to review_output_part2.parquet
Part 3 written to review_output_part3.parquet
Part 4 written to review_output_part4.parquet
Part 5 written to review_output_part5.parquet


In [4]:
# Paths of the five Parquet parts, numbered 1 through 5
file_paths = [
    f'..//code//review_output_part{part_number}.parquet'
    for part_number in range(1, 6)
]

In [7]:
# For every part: keep only the reviews whose business_id appears in the
# Dunkin business table, collect the result, and persist it beside the part.
dunkin_business_ids = df_business_dunkin['business_id']
filtered_dfs = []
for file_path in file_paths:
    # Output path: same as the input part, with a '_filtered' suffix
    filtered_file_path = file_path.replace('.parquet', '_filtered.parquet')

    # Load the part and build a boolean mask selecting Dunkin reviews
    df_part = pd.read_parquet(file_path)
    is_dunkin_review = df_part['business_id'].isin(dunkin_business_ids)
    df_part_filtered = df_part[is_dunkin_review]

    # Keep the filtered frame in memory for the later concatenation
    filtered_dfs.append(df_part_filtered)

    # Write the filtered frame to its own Parquet file
    df_part_filtered.to_parquet(filtered_file_path)
    print(f"Filtered file written to {filtered_file_path}")

Filtered file written to ..//code//review_output_part1_filtered.parquet
Filtered file written to ..//code//review_output_part2_filtered.parquet
Filtered file written to ..//code//review_output_part3_filtered.parquet
Filtered file written to ..//code//review_output_part4_filtered.parquet
Filtered file written to ..//code//review_output_part5_filtered.parquet


In [8]:
# Stack the filtered parts row-wise into one DataFrame with a fresh 0..n-1 index
df_review_dunkin = pd.concat(objs=filtered_dfs, axis=0, ignore_index=True)

In [9]:
# Persist the combined Dunkin reviews as a single Parquet file
combined_file_path = '..//data//review_dunkin.parquet'
df_review_dunkin.to_parquet(path=combined_file_path)

### Ya filtrado y guardado como .parquet, para los Dunkin de todo el país

In [10]:
# Read the combined Dunkin reviews back from Parquet into a pandas DataFrame
review_parquet_path = '..//data//review_dunkin.parquet'
df_review_dunkin = pd.read_parquet(review_parquet_path)

In [11]:
# Normalize whitespace and capitalize every word of a text value
def normalize_and_capitalize(text):
    """Return ``text`` lower-cased, whitespace-collapsed and word-capitalized.

    Args:
        text: The value to normalize. Missing values (``None``, float NaN,
            ``pd.NA``) yield an empty string; any other non-string value is
            converted with ``str()`` first.

    Returns:
        A string in which every whitespace-separated word starts with an
        upper-case letter, the remaining letters are lower-case, and words are
        joined by single spaces.
    """
    # NaN is the only float that differs from itself; pandas uses it (and
    # pd.NA) for missing data, so treat both like None instead of crashing on
    # the .lower() call below.
    if text is None or text is pd.NA or (isinstance(text, float) and text != text):
        return ''
    # Lower-case first, then capitalize the first letter of each word;
    # split() without arguments also collapses runs of whitespace.
    return ' '.join(word.capitalize() for word in str(text).lower().split())

In [12]:
# Replace missing reviews with an empty string BEFORE casting to str:
# astype(str) would otherwise turn None/NaN into the literal words
# 'None'/'nan', which would then be stored as if they were real review text.
df_review_dunkin['text'] = (
    df_review_dunkin['text']
    .fillna('')
    .astype(str)
    .apply(normalize_and_capitalize)
)

In [13]:
# Preview the first normalized reviews to confirm the text transformation
text_preview = df_review_dunkin['text'].head()
print(text_preview)

0    I Was Greated By A Nice Friendly Staff. There ...
1    Get Off The Phone And Serve Your Customers. Al...
2    This Place Is A Disaster In Slow Motion. I Wou...
3    Had The Worst Experience At Dunkin Doughnuts [...
4    Gave Them Another Try Today ... Smdh .... How ...
Name: text, dtype: object


In [14]:
# Parse the 'date' column as datetimes; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(df_review_dunkin['date'], errors='coerce')
df_review_dunkin['date'] = parsed_dates

In [15]:
# Preview the first parsed timestamps to confirm the datetime conversion
date_preview = df_review_dunkin['date'].head()
print(date_preview)

0   2016-09-18 14:59:21
1   2012-03-01 01:48:48
2   2010-05-15 15:42:08
3   2017-04-30 15:33:53
4   2017-02-19 13:05:34
Name: date, dtype: datetime64[ns]


In [16]:
# Replace the index with a fresh default one; drop=True discards the old index
# instead of inserting it as a new column.
df_review_dunkin = df_review_dunkin.reset_index(drop=True)

In [17]:
# Print a summary of the DataFrame: index range, columns, non-null counts, dtypes
df_review_dunkin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10875 entries, 0 to 10874
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    10875 non-null  object        
 1   user_id      10875 non-null  object        
 2   business_id  10875 non-null  object        
 3   stars        10875 non-null  float64       
 4   useful       10875 non-null  int64         
 5   funny        10875 non-null  int64         
 6   cool         10875 non-null  int64         
 7   text         10875 non-null  object        
 8   date         10875 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 764.8+ KB


In [18]:
# Display the first row as a quick sanity check of the cleaned columns
df_review_dunkin.head(1)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,GXOKVGgp_PER947X-eNLqg,dBWYq9h7CXhWBqs-HlUn2w,GUAF7ybULhg68asLfFZYbA,4.0,0,0,0,I Was Greated By A Nice Friendly Staff. There ...,2016-09-18 14:59:21


In [19]:
# Write the cleaned reviews back to Parquet, overwriting the file loaded earlier
combined_file_path = '..//data//review_dunkin.parquet'
df_review_dunkin.to_parquet(path=combined_file_path)