In [4]:
#Importación de librerías y extracción de data

import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# URL of the NYC TLC page whose anchors are scanned below for parquet links
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Fetch the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Create the folder that stores the downloaded files (no error if it already exists)
os.makedirs('parquet_files', exist_ok=True)

def is_within_year_range(file_name, start_year, end_year):
    """Tell whether the year embedded in ``file_name`` lies inside a year range.

    The year is taken from the first run of four consecutive digits found in
    ``file_name``. Both bounds are inclusive and may be given in either order
    (newest-first or oldest-first).

    Args:
        file_name: File name to inspect, e.g. ``yellow_tripdata_2023-01.parquet``.
        start_year: One inclusive bound of the range.
        end_year: The other inclusive bound of the range.

    Returns:
        True if a four-digit number is found and it falls within the range,
        False otherwise (including when the name has no four-digit number).
    """
    match = re.search(r'(\d{4})', file_name)
    if match is None:
        return False
    # Normalise the bounds so the check works regardless of argument order.
    low, high = sorted((start_year, end_year))
    return low <= int(match.group(1)) <= high

# Year range to download. start_year holds the most recent year and end_year
# the oldest; they are passed to is_within_year_range in that order.
start_year = 2023
end_year = 2017

# Walk every anchor on the page and download the matching parquet files.
# NOTE(review): the recorded output of this cell skips several months; hrefs
# padded with whitespace are a plausible cause (they fail the '.parquet'
# suffix check) - confirm against the live page.
for link in soup.find_all('a', href=True):
    # Strip surrounding whitespace so a padded href still matches the suffix
    # check and yields a clean file name.
    href = link['href'].strip()
    if not href.endswith('.parquet'):
        continue

    file_name = href.split('/')[-1]
    # Keep only green/yellow taxi trip files whose year is inside the range
    is_taxi_file = 'green_tripdata' in file_name or 'yellow_tripdata' in file_name
    if not (is_taxi_file and is_within_year_range(file_name, start_year, end_year)):
        continue

    # Resolve relative links against the page URL; absolute links pass through unchanged
    file_url = urljoin(url, href)
    file_response = requests.get(file_url, timeout=60)
    # Fail loudly rather than saving an HTTP error body under a .parquet name
    file_response.raise_for_status()
    with open(os.path.join('parquet_files', file_name), 'wb') as file:
        file.write(file_response.content)

    print(f"Descargado: {file_name}")

print("Descarga completada.")


Descargado: yellow_tripdata_2023-01.parquet
Descargado: green_tripdata_2023-01.parquet
Descargado: yellow_tripdata_2023-02.parquet
Descargado: green_tripdata_2023-02.parquet
Descargado: yellow_tripdata_2023-03.parquet
Descargado: yellow_tripdata_2023-04.parquet
Descargado: green_tripdata_2023-04.parquet
Descargado: yellow_tripdata_2023-06.parquet
Descargado: green_tripdata_2023-06.parquet
Descargado: yellow_tripdata_2023-12.parquet
Descargado: green_tripdata_2023-12.parquet
Descargado: yellow_tripdata_2022-01.parquet
Descargado: green_tripdata_2022-01.parquet
Descargado: yellow_tripdata_2022-02.parquet
Descargado: green_tripdata_2022-02.parquet
Descargado: yellow_tripdata_2022-03.parquet
Descargado: green_tripdata_2022-03.parquet
Descargado: yellow_tripdata_2022-04.parquet
Descargado: green_tripdata_2022-04.parquet
Descargado: yellow_tripdata_2022-05.parquet
Descargado: green_tripdata_2022-05.parquet
Descargado: yellow_tripdata_2022-06.parquet
Descargado: green_tripdata_2022-06.parquet

In [1]:
#Leer archivo parquet

#importacion de librerias
import pandas as pd

# Green taxi trips for June 2023: file path, then the DataFrame read from it
file_path_greentaxis = './parquet_files/green_tripdata_2023-06.parquet'
df_green_taxis = pd.read_parquet(file_path_greentaxis)

# Yellow taxi trips for June 2023: file path, then the DataFrame read from it
file_path_yellowtaxis = './parquet_files/yellow_tripdata_2023-06.parquet'
df_yellow_taxis = pd.read_parquet(file_path_yellowtaxis)

In [2]:
# Display the green-taxi DataFrame; the notebook renders a truncated view
# (first and last rows) rather than all rows.
df_green_taxis

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-06-01 00:32:25,2023-06-01 00:39:06,N,1.0,74,42,1.0,0.84,7.90,1.0,0.5,2.08,0.0,,1.0,12.48,1.0,1.0,0.00
1,2,2023-06-01 00:39:07,2023-06-01 00:51:59,N,1.0,41,229,1.0,4.05,19.10,1.0,0.5,3.65,0.0,,1.0,28.00,1.0,1.0,2.75
2,2,2023-06-01 00:35:59,2023-06-01 00:57:06,N,1.0,97,89,1.0,4.26,23.30,1.0,0.5,5.16,0.0,,1.0,30.96,1.0,1.0,0.00
3,2,2023-06-01 00:50:29,2023-06-01 01:00:32,N,1.0,75,263,1.0,1.79,12.10,1.0,0.5,3.47,0.0,,1.0,20.82,1.0,1.0,2.75
4,2,2023-06-01 00:15:15,2023-06-01 00:52:40,N,1.0,33,48,1.0,6.40,37.30,1.0,0.5,0.00,0.0,,1.0,42.55,1.0,1.0,2.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65545,2,2023-06-30 23:09:00,2023-06-30 23:16:00,,,75,263,,1.49,13.24,0.0,0.0,3.40,0.0,,1.0,20.39,,,
65546,2,2023-06-30 23:31:00,2023-06-30 23:41:00,,,223,260,,3.05,13.76,0.0,0.0,2.95,0.0,,1.0,17.71,,,
65547,2,2023-06-30 23:03:00,2023-06-30 23:22:00,,,134,180,,3.75,19.93,0.0,0.0,1.05,0.0,,1.0,21.98,,,
65548,2,2023-06-30 23:12:00,2023-06-30 23:32:00,,,33,157,,5.29,27.20,0.0,0.0,5.64,0.0,,1.0,33.84,,,


In [3]:
# Preview the first five rows of the green-taxi data
df_green_taxis.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2023-06-01 00:32:25,2023-06-01 00:39:06,N,1.0,74,42,1.0,0.84,7.9,1.0,0.5,2.08,0.0,,1.0,12.48,1.0,1.0,0.0
1,2,2023-06-01 00:39:07,2023-06-01 00:51:59,N,1.0,41,229,1.0,4.05,19.1,1.0,0.5,3.65,0.0,,1.0,28.0,1.0,1.0,2.75
2,2,2023-06-01 00:35:59,2023-06-01 00:57:06,N,1.0,97,89,1.0,4.26,23.3,1.0,0.5,5.16,0.0,,1.0,30.96,1.0,1.0,0.0
3,2,2023-06-01 00:50:29,2023-06-01 01:00:32,N,1.0,75,263,1.0,1.79,12.1,1.0,0.5,3.47,0.0,,1.0,20.82,1.0,1.0,2.75
4,2,2023-06-01 00:15:15,2023-06-01 00:52:40,N,1.0,33,48,1.0,6.4,37.3,1.0,0.5,0.0,0.0,,1.0,42.55,1.0,1.0,2.75


In [4]:
# Print the index range plus each column's non-null count and dtype
df_green_taxis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65550 entries, 0 to 65549
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   VendorID               65550 non-null  int32         
 1   lpep_pickup_datetime   65550 non-null  datetime64[us]
 2   lpep_dropoff_datetime  65550 non-null  datetime64[us]
 3   store_and_fwd_flag     60359 non-null  object        
 4   RatecodeID             60359 non-null  float64       
 5   PULocationID           65550 non-null  int32         
 6   DOLocationID           65550 non-null  int32         
 7   passenger_count        60359 non-null  float64       
 8   trip_distance          65550 non-null  float64       
 9   fare_amount            65550 non-null  float64       
 10  extra                  65550 non-null  float64       
 11  mta_tax                65550 non-null  float64       
 12  tip_amount             65550 non-null  float64       
 13  t

In [5]:
# Summary statistics (count, mean, min, quartiles, max, std) per column.
# Left as a bare last expression so the notebook renders it with its rich
# table display instead of the plain-text output that print() produces.
df_green_taxis.describe()

           VendorID        lpep_pickup_datetime       lpep_dropoff_datetime  \
count  65550.000000                       65550                       65550   
mean       1.875057  2023-06-15 18:02:19.476171  2023-06-15 18:22:14.300656   
min        1.000000         2023-05-26 01:58:10         2023-05-26 01:58:13   
25%        2.000000  2023-06-08 09:16:17.750000  2023-06-08 09:32:55.750000   
50%        2.000000         2023-06-15 12:55:04         2023-06-15 13:14:31   
75%        2.000000  2023-06-23 00:11:16.500000  2023-06-23 00:29:22.250000   
max        2.000000         2023-07-01 23:59:30         2023-07-02 00:32:47   
std        0.330657                         NaN                         NaN   

         RatecodeID  PULocationID  DOLocationID  passenger_count  \
count  60359.000000  65550.000000  65550.000000     60359.000000   
mean       1.160076     98.453059    139.621876         1.283007   
min        1.000000      1.000000      1.000000         0.000000   
25%        1.000

In [6]:
# List the column names of the green-taxi data
df_green_taxis.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge'],
      dtype='object')

In [7]:
# (number of rows, number of columns) of the green-taxi data
df_green_taxis.shape

(65550, 20)

In [8]:
# Display the yellow-taxi DataFrame; the notebook renders a truncated view
# (first and last rows) rather than all rows.
df_yellow_taxis

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-06-01 00:08:48,2023-06-01 00:29:41,1.0,3.40,1.0,N,140,238,1,21.90,3.50,0.5,6.70,0.00,1.0,33.60,2.5,0.00
1,1,2023-06-01 00:15:04,2023-06-01 00:25:18,0.0,3.40,1.0,N,50,151,1,15.60,3.50,0.5,3.00,0.00,1.0,23.60,2.5,0.00
2,1,2023-06-01 00:48:24,2023-06-01 01:07:07,1.0,10.20,1.0,N,138,97,1,40.80,7.75,0.5,10.00,0.00,1.0,60.05,0.0,1.75
3,2,2023-06-01 00:54:03,2023-06-01 01:17:29,3.0,9.83,1.0,N,100,244,1,39.40,1.00,0.5,8.88,0.00,1.0,53.28,2.5,0.00
4,2,2023-06-01 00:18:44,2023-06-01 00:27:18,1.0,1.17,1.0,N,137,234,1,9.30,1.00,0.5,0.72,0.00,1.0,15.02,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3307229,2,2023-06-30 23:30:21,2023-07-01 00:00:46,,9.20,,,42,148,0,38.53,0.00,0.5,8.51,0.00,1.0,51.04,,
3307230,1,2023-06-30 23:34:22,2023-07-01 00:32:59,,20.20,,,132,74,0,70.00,1.75,0.5,11.97,6.55,1.0,91.77,,
3307231,2,2023-06-30 23:45:00,2023-07-01 00:10:00,,6.16,,,256,140,0,28.03,0.00,0.5,6.41,0.00,1.0,38.44,,
3307232,2,2023-06-30 23:13:38,2023-06-30 23:51:19,,11.65,,,91,246,0,45.74,0.00,0.5,0.00,6.55,1.0,56.29,,


In [10]:
# Preview the first five rows of the yellow-taxi data
df_yellow_taxis.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-06-01 00:08:48,2023-06-01 00:29:41,1.0,3.4,1.0,N,140,238,1,21.9,3.5,0.5,6.7,0.0,1.0,33.6,2.5,0.0
1,1,2023-06-01 00:15:04,2023-06-01 00:25:18,0.0,3.4,1.0,N,50,151,1,15.6,3.5,0.5,3.0,0.0,1.0,23.6,2.5,0.0
2,1,2023-06-01 00:48:24,2023-06-01 01:07:07,1.0,10.2,1.0,N,138,97,1,40.8,7.75,0.5,10.0,0.0,1.0,60.05,0.0,1.75
3,2,2023-06-01 00:54:03,2023-06-01 01:17:29,3.0,9.83,1.0,N,100,244,1,39.4,1.0,0.5,8.88,0.0,1.0,53.28,2.5,0.0
4,2,2023-06-01 00:18:44,2023-06-01 00:27:18,1.0,1.17,1.0,N,137,234,1,9.3,1.0,0.5,0.72,0.0,1.0,15.02,2.5,0.0


In [11]:
# Print the index range and each column's dtype for the yellow-taxi data
df_yellow_taxis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3307234 entries, 0 to 3307233
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int32         
 8   DOLocationID           int32         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee           

In [14]:
# Summary statistics (count, mean, min, quartiles, max, std) per column.
# Left as a bare last expression so the notebook renders it with its rich
# table display instead of the plain-text output that print() produces.
df_yellow_taxis.describe()

           VendorID        tpep_pickup_datetime       tpep_dropoff_datetime  \
count  3.307234e+06                     3307234                     3307234   
mean   1.737262e+00  2023-06-15 18:09:27.394469  2023-06-15 18:27:09.958314   
min    1.000000e+00         2002-12-31 23:03:19         2002-12-31 23:06:17   
25%    1.000000e+00  2023-06-08 09:12:42.250000  2023-06-08 09:30:06.250000   
50%    2.000000e+00  2023-06-15 14:15:24.500000         2023-06-15 14:36:09   
75%    2.000000e+00         2023-06-23 02:25:20  2023-06-23 02:40:05.750000   
max    6.000000e+00         2023-07-01 00:42:13         2023-07-03 16:31:24   
std    4.477026e-01                         NaN                         NaN   

       passenger_count  trip_distance    RatecodeID  PULocationID  \
count     3.207347e+06   3.307234e+06  3.207347e+06  3.307234e+06   
mean      1.369012e+00   4.368790e+00  1.547159e+00  1.649028e+02   
min       0.000000e+00   0.000000e+00  1.000000e+00  1.000000e+00   
25%       1.

In [17]:
# List the column names of the yellow-taxi data
df_yellow_taxis.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')

In [20]:
# (number of rows, number of columns) of the yellow-taxi data
df_yellow_taxis.shape

(3307234, 19)