### 1. Importar librerías:

In [25]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import pyarrow as pa
import pyarrow.parquet as pq

In [21]:
# Number of synthetic records to generate
num_records = 10_000_000

# Seed both random sources used by this notebook (the stdlib `random` module and NumPy's
# global generator) so that re-running it produces the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Empty frame whose column list fixes the final column order; every column is filled in later
df = pd.DataFrame(columns=[
    'service_type', 'year', 'month', 'day', 'day_of_week', 'hour', 'PULocationID',
    'DOLocationID', 'trip_miles', 'time_out', 'travel_time', 'fare_surcharges',
    'base_fare', 'service_number',
])

# Inclusive first and last day of the simulated period
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)

# One random timestamp per record: a whole-day offset inside the period plus an hour of
# day between 0 and 23. The offsets are drawn in bulk with NumPy instead of building one
# Python datetime per record, which is slow and memory-hungry at this row count.
span_days = (end_date - start_date).days
day_offsets = np.random.randint(0, span_days + 1, size=num_records)  # upper bound is exclusive
hour_offsets = np.random.randint(0, 24, size=num_records)
# Kept under the name `date_list`; it is a datetime64 Series rather than a Python list.
date_list = pd.Series(
    pd.Timestamp(start_date) + pd.to_timedelta(day_offsets * 24 + hour_offsets, unit='h')
)

# Derive the calendar columns straight from the timestamps, so no temporary column has to
# be added to the frame and dropped again. All integer parts use int16 consistently.
df['year'] = date_list.dt.year.astype('int16')
df['month'] = date_list.dt.month.astype('int16')
df['day'] = date_list.dt.day.astype('int16')
df['day_of_week'] = date_list.dt.day_name()
df['hour'] = date_list.dt.hour.astype('int16')

# Pool of candidate location identifiers: every integer from 1 through 263.
# (The IDs assigned to records are sampled from this pool, so they repeat across records.)
location_ids = list(range(1, 264))

# Pickup and drop-off locations are sampled independently from the same pool
for location_column in ('PULocationID', 'DOLocationID'):
    df[location_column] = np.random.choice(location_ids, size=num_records)

# Continuous columns: one uniform draw per record over the (low, high) bounds listed here
uniform_bounds = {
    'trip_miles': (2, 55.5),
    'travel_time': (8, 120.5),
    'fare_surcharges': (0.5, 20.5),
    'base_fare': (0.5, 50.5),
}
for value_column, (low, high) in uniform_bounds.items():
    df[value_column] = np.random.uniform(low, high, size=num_records)

# Pickup locations that add 20 to a record's service number.
# Built once at module level as a frozenset so each call does a constant-time membership
# test instead of re-creating and linearly scanning the list on every row.
_BOOSTED_PICKUP_LOCATIONS = frozenset([
    4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75,
    79, 87, 88, 90, 100, 103, 107, 113, 114, 116, 120, 125, 127,
    128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161,
    162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230,
    231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261,
    262, 263,
])


def calculate_service_number(row):
    """Return a randomised service number for one record.

    The base value is a random integer between 20 and 150 (both inclusive), drawn
    from the stdlib ``random`` module. Two independent bonuses of 20 are added:

    * when ``row['PULocationID']`` is one of the boosted pickup locations;
    * when ``row['hour']`` lies between 16 and 20 (both inclusive).

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys ``'PULocationID'`` and ``'hour'``
        (a DataFrame row, a dict, ...). No other field is read.

    Returns
    -------
    int
        A value between 20 and 190.
    """
    service_number = random.randint(20, 150)
    if row['PULocationID'] in _BOOSTED_PICKUP_LOCATIONS:
        service_number += 20
    if 16 <= row['hour'] <= 20:
        service_number += 20
    return service_number

# Compute service_number for every record.
# calculate_service_number only reads row['PULocationID'] and row['hour'], so it is given a
# small mapping holding just those two fields. df.apply(..., axis=1) would instead
# materialise a pandas Series for every row, which dominates the run time at this row count.
# Rows are visited in the same order, so the sequence of random draws is unchanged.
df['service_number'] = [
    calculate_service_number({'PULocationID': pickup_id, 'hour': pickup_hour})
    for pickup_id, pickup_hour in zip(df['PULocationID'].tolist(), df['hour'].tolist())
]

# Assign a service type to every record in fixed proportions (70% UberLyft, 25% Yellow,
# the rest Green), then shuffle the order.
# Green takes whatever remains after the first two shares so the three counts always add
# up to num_records. Truncating each share on its own can come up short for sizes that do
# not divide evenly, which makes the column assignment fail with a length mismatch.
uberlyft_count = int(num_records * 0.7)
yellow_count = int(num_records * 0.25)
green_count = num_records - uberlyft_count - yellow_count
service_types = ['UberLyft'] * uberlyft_count + ['Yellow'] * yellow_count + ['Green'] * green_count
random.shuffle(service_types)
df['service_type'] = service_types

# time_out is only drawn for UberLyft records (uniform between 5 and 35.5); every other
# service type gets 0.
# The boolean mask is computed once and its True count is used as the sample size, rather
# than filtering the whole frame a second time just to take its length.
is_uberlyft = df['service_type'] == 'UberLyft'
df.loc[is_uberlyft, 'time_out'] = np.random.uniform(5, 35.5, int(is_uberlyft.sum()))
df.loc[~is_uberlyft, 'time_out'] = 0


Reducir el uso de memoria convirtiendo las columnas numéricas a tipos más compactos (float32 e int16):

In [28]:
# Downcast the numeric columns to narrower dtypes to shrink the frame in memory.
# Note: time_out and travel_time hold fractional values, so casting them to int16
# discards the fractional part.
compact_dtypes = {
    'trip_miles': 'float32',
    'fare_surcharges': 'float32',
    'base_fare': 'float32',
    'year': 'int16',
    'month': 'int16',
    'day': 'int16',
    'hour': 'int16',
    'PULocationID': 'int16',
    'DOLocationID': 'int16',
    'time_out': 'int16',
    'travel_time': 'int16',
    'service_number': 'int16',
}
for column_name, compact_dtype in compact_dtypes.items():
    df[column_name] = df[column_name].astype(compact_dtype)

In [29]:
# Print the frame summary: index range, per-column dtypes and memory usage.
# A '+' after the memory figure means the values held in object-dtype columns are not
# counted; df.info(memory_usage='deep') reports the full amount.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 14 columns):
 #   Column           Dtype  
---  ------           -----  
 0   service_type     object 
 1   year             int16  
 2   month            int16  
 3   day              int16  
 4   day_of_week      object 
 5   hour             int16  
 6   PULocationID     int16  
 7   DOLocationID     int16  
 8   trip_miles       float32
 9   time_out         int16  
 10  travel_time      int16  
 11  fare_surcharges  float32
 12  base_fare        float32
 13  service_number   int16  
dtypes: float32(3), int16(9), object(2)
memory usage: 438.7+ MB


In [30]:
# Convert the frame to an Arrow table and write it out as a gzip-compressed Parquet file
parquet_path = 'Servicios_Agrupados.parquet'
pq.write_table(
    pa.Table.from_pandas(df),
    where=parquet_path,
    compression='gzip',
)

print("El archivo Parquet comprimido se ha guardado exitosamentente.".replace("exitosamentente", "exitosamente"))

El archivo Parquet comprimido se ha guardado exitosamente.
