In [5]:
# all import statements needed for the project, for example:

import os
import re
from datetime import datetime
from typing import List, Optional

import bs4
import matplotlib.pyplot as plt
import pandas as pd
import requests
import sqlalchemy as db
from bs4 import BeautifulSoup

In [52]:
# Page that is scraped for the trip-record Parquet download links.
TAXI_URL: str = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Inclusive bounds of the period whose monthly files are kept.
START_DATE = datetime(year=2024, month=1, day=1)
END_DATE = datetime(year=2024, month=3, day=31)

def get_taxi_html() -> bytes:
    """Fetch the raw HTML of the taxi trip-record page at ``TAXI_URL``.

    Returns:
        The undecoded response body. Bytes (not ``str``) are returned on
        purpose so the HTML parser can detect the page encoding itself.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout requests waits forever on an unresponsive server.
    response = requests.get(TAXI_URL, timeout=30)
    response.raise_for_status()
    return response.content

def find_taxi_parquet_links() -> List[str]:
    """Scrape the taxi page and return the in-range Yellow Taxi / HVFHV Parquet URLs.

    Yellow Taxi anchors are located by their ``title`` attribute, HVFHV anchors
    by an ``href`` matching ``fhvhv…​.parquet`` (case-insensitive). Yellow links
    come first in the result, followed by HVFHV links, before date filtering.

    Returns:
        The whitespace-stripped hrefs that ``filter_links_by_date`` keeps.
    """
    page = bs4.BeautifulSoup(get_taxi_html(), "html.parser")
    fhvhv_href = re.compile(r"fhvhv.*\.parquet", re.IGNORECASE)

    candidates = []
    for anchor in [
        *page.find_all("a", attrs={"title": "Yellow Taxi Trip Records"}),
        *page.find_all("a", href=fhvhv_href),
    ]:
        # Anchors without an href are treated as an empty string and skipped.
        href = anchor.get("href") or ""
        if ".parquet" in href:
            candidates.append(href.strip())

    return filter_links_by_date(candidates)

def filter_links_by_date(
    links: List[str],
    start: Optional[datetime] = None,
    end: Optional[datetime] = None,
) -> List[str]:
    """Keep only the Parquet links whose ``_YYYY-MM.parquet`` month is in range.

    The comparison is done at month granularity: a file is kept when its
    (year, month) lies between the months of ``start`` and ``end``, inclusive.
    This way a mid-month ``start`` still keeps the file for that month.

    Args:
        links: Candidate URLs or file names.
        start: First date of interest; defaults to the module-level ``START_DATE``.
        end: Last date of interest; defaults to the module-level ``END_DATE``.

    Returns:
        The matching links, in their original order. Links without a
        ``_YYYY-MM.parquet`` suffix, or with a month outside 01-12, are skipped.
    """
    # Resolve defaults at call time so later changes to the globals are honoured.
    if start is None:
        start = START_DATE
    if end is None:
        end = END_DATE

    first_month = (start.year, start.month)
    last_month = (end.year, end.month)
    date_pattern = re.compile(r"_(\d{4})-(\d{2})\.parquet")

    filtered_links = []
    for link in links:
        match = date_pattern.search(link)
        if match is None:
            continue
        year, month = int(match.group(1)), int(match.group(2))
        # "\d{2}" also matches 00 or 13+; skip those instead of crashing.
        if not 1 <= month <= 12:
            continue
        if first_month <= (year, month) <= last_month:
            filtered_links.append(link)

    return filtered_links

def download_files(links: List[str], folder_name: str) -> None:
    """Download every link into ``folder_name``, creating the folder if needed.

    Each file is streamed to disk in chunks instead of being held in memory,
    and is first written to a ``.part`` file that is renamed once the transfer
    completes, so an interrupted download never leaves a truncated ``.parquet``.

    Args:
        links: URLs to fetch; the text after the last "/" is used as file name.
        folder_name: Destination directory.

    Raises:
        requests.HTTPError: If a download answers with a 4xx/5xx status.
        requests.Timeout: If the server stops responding.
    """
    # exist_ok avoids the race between checking for and creating the folder.
    os.makedirs(folder_name, exist_ok=True)

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"}

    for link in links:
        file_name = link.split("/")[-1]
        file_path = os.path.join(folder_name, file_name)
        partial_path = file_path + ".part"
        print(f"Downloading {file_name} from {link}...")

        # Request with headers to mimic a browser; stream so large files are
        # never fully loaded into memory. The timeout applies to the connect
        # phase and to each read, not to the whole transfer.
        with requests.get(link, headers=headers, stream=True, timeout=60) as response:
            response.raise_for_status()  # Check if download was successful
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)

        # Only expose the final name once the whole body has been written.
        os.replace(partial_path, file_path)
        print(f"Downloaded {file_name}")

# Scrape the in-range Parquet links from the taxi page, then download each of
# them into the "taxi_data" folder (relative to the current working directory).
filtered_links = find_taxi_parquet_links()
download_files(filtered_links, "taxi_data")

Downloading yellow_tripdata_2024-01.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...
Downloaded yellow_tripdata_2024-01.parquet
Downloading yellow_tripdata_2024-02.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet...
Downloaded yellow_tripdata_2024-02.parquet
Downloading yellow_tripdata_2024-03.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet...
Downloaded yellow_tripdata_2024-03.parquet
Downloading fhvhv_tripdata_2024-01.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-01.parquet...
Downloaded fhvhv_tripdata_2024-01.parquet
Downloading fhvhv_tripdata_2024-02.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-02.parquet...
Downloaded fhvhv_tripdata_2024-02.parquet
Downloading fhvhv_tripdata_2024-03.parquet from https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_2024-03.parquet...
D

In [33]:
# Raw string: in a regular literal "\U" starts a \UXXXXXXXX unicode escape, so
# the un-prefixed Windows path is a SyntaxError.
# NOTE(review): this folder differs from the "taxi_data" download folder used
# earlier in the notebook — confirm which location is intended.
folder_path = r"C:\Users\sanch\TOOLS for Analytics"

# List all .parquet files in the folder; sorted because os.listdir returns
# names in arbitrary order and the concatenation should be reproducible.
parquet_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.parquet'))

# Fail with a clear message instead of pd.concat's "No objects to concatenate".
if not parquet_files:
    raise FileNotFoundError(f"No .parquet files found in {folder_path}")

# Load all Parquet files into a list of DataFrames
dfs = [pd.read_parquet(os.path.join(folder_path, file), engine='pyarrow') for file in parquet_files]

# Concatenate all DataFrames into a single DataFrame
df_all = pd.concat(dfs, ignore_index=True)

# Display the first few rows of the concatenated DataFrame
print(df_all.head())

NameError: name 'hvfhv_a_tags' is not defined

In [None]:

# NOTE(review): the host here ("www1.nyc.gov") differs from the one in
# TAXI_URL ("www.nyc.gov") — confirm which of the two is intended.
TLC_URL = "https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Directory that holds the taxi-zone shapefile; "" means the current working directory.
TAXI_ZONES_DIR = ""
# os.path.join instead of f"{dir}/...": with an empty directory the f-string
# produced "/taxi_zones.shp", i.e. a path at the filesystem root.
TAXI_ZONES_SHAPEFILE = os.path.join(TAXI_ZONES_DIR, "taxi_zones.shp")
WEATHER_CSV_DIR = ""

CRS = 4326  # coordinate reference system

# Bounding boxes as two (lat, lon) corners; in each pair the first corner has
# the smaller latitude and longitude.
NEW_YORK_BOX_COORDS = ((40.560445, -74.242330), (40.908524, -73.717047))
LGA_BOX_COORDS = ((40.763589, -73.891745), (40.778865, -73.854838))
JFK_BOX_COORDS = ((40.639263, -73.795642), (40.651376, -73.766264))
EWR_BOX_COORDS = ((40.686794, -74.194028), (40.699680, -74.165205))

DATABASE_URL = "sqlite:///project.db"
DATABASE_SCHEMA_FILE = "schema.sql"
QUERY_DIRECTORY = "queries"

In [None]:
## Load the non-yellow (FHVHV) data and filter out the non-Uber trips

In [3]:
import os
import glob

In [31]:
import pandas as pd

# Path of the raw January 2024 FHVHV trip file (absolute path on the author's machine)
file_path = "/Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv/fhvhv_tripdata_2024-01.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(file_path)

# Print the first 20 rows of the DataFrame
print(df.head(20))

   hvfhs_license_num dispatching_base_num originating_base_num  \
0             HV0003               B03404               B03404   
1             HV0003               B03404               B03404   
2             HV0003               B03404               B03404   
3             HV0003               B03404               B03404   
4             HV0003               B03404               B03404   
5             HV0003               B03404               B03404   
6             HV0003               B03404               B03404   
7             HV0003               B03404               B03404   
8             HV0003               B03404               B03404   
9             HV0003               B03404               B03404   
10            HV0005               B03406                 None   
11            HV0005               B03406                 None   
12            HV0003               B03404               B03404   
13            HV0005               B03406                 None   
14        

In [29]:
# Define the file path for input and output
input_file_path = "/Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv/fhvhv_tripdata_2024-01.parquet"
output_file_path = "/Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber_2024-01.parquet"

# Make sure the destination folder exists: to_parquet does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Read the Parquet file into a DataFrame
df = pd.read_parquet(input_file_path)

# Filter the rows where 'hvfhs_license_num' is 'HV0003'
filtered_df = df[df['hvfhs_license_num'] == 'HV0003']

# Save the filtered DataFrame to a new Parquet file
filtered_df.to_parquet(output_file_path)

print(f"Filtered data saved to {output_file_path}")

Filtered data saved to /Users/rita/Downloads/E4501 in Downloads/final project/uber_2024-01.parquet


In [None]:
## Have a look at the new, filtered file

In [33]:
# Path of the filtered file written by the filtering cell above. It must match
# that cell's output_file_path (inside "processed uber raw"); the previous value
# pointed at the project root, where no cell in this notebook writes the file.
file_path = "/Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber_2024-01.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(file_path)

# Print the first 20 rows of the DataFrame
print(df.head(20))

   hvfhs_license_num dispatching_base_num originating_base_num  \
0             HV0003               B03404               B03404   
1             HV0003               B03404               B03404   
2             HV0003               B03404               B03404   
3             HV0003               B03404               B03404   
4             HV0003               B03404               B03404   
5             HV0003               B03404               B03404   
6             HV0003               B03404               B03404   
7             HV0003               B03404               B03404   
8             HV0003               B03404               B03404   
9             HV0003               B03404               B03404   
12            HV0003               B03404               B03404   
19            HV0003               B03404               B03404   
20            HV0003               B03404               B03404   
21            HV0003               B03404               B03404   
25        

In [7]:

# Define the source and destination folders
source_folder = "/Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv"
output_folder = "/Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw"

# Create the destination folder up front: to_parquet does not create missing
# directories.
os.makedirs(output_folder, exist_ok=True)

# Use glob to find all files that match "fhvhv" in the name and have a .parquet
# extension; sorted because glob returns matches in arbitrary order.
files = sorted(glob.glob(os.path.join(source_folder, "*fhvhv*.parquet")))

# Process each file
for file_path in files:
    # Extract the part of the filename after "fhvhv" to create the new file name
    base_name = os.path.basename(file_path)  # Extracts the filename with extension
    # The text after "fhvhv" already starts with "_", so the result is
    # "uber__tripdata_YYYY-MM.parquet" (double underscore). Kept as is because a
    # later cell reads the file back under exactly that name.
    new_file_name = f"uber_{base_name.split('fhvhv')[-1]}"
    output_file_path = os.path.join(output_folder, new_file_name)  # Full output path

    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(file_path)

    # Filter the rows where 'hvfhs_license_num' is 'HV0003'
    filtered_df = df[df['hvfhs_license_num'] == 'HV0003']

    # Save the filtered DataFrame to a new Parquet file
    filtered_df.to_parquet(output_file_path)

    print(f"Filtered data from {file_path} saved to {output_file_path}")


Filtered data from /Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv/fhvhv_tripdata_2024-02.parquet saved to /Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber__tripdata_2024-02.parquet
Filtered data from /Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv/fhvhv_tripdata_2024-03.parquet saved to /Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber__tripdata_2024-03.parquet
Filtered data from /Users/rita/Downloads/E4501 in Downloads/final project/raw yellow_fhvhv/fhvhv_tripdata_2024-01.parquet saved to /Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber__tripdata_2024-01.parquet


In [11]:
# Path of the filtered January 2024 file produced by the batch-filtering loop
file_path = "/Users/rita/Downloads/E4501 in Downloads/final project/processed uber raw/uber__tripdata_2024-01.parquet"

# Read the Parquet file into a DataFrame
df = pd.read_parquet(file_path)

# Print the last 20 rows of the DataFrame
print(df.tail(20))

         hvfhs_license_num dispatching_base_num originating_base_num  \
19663904            HV0003               B03404               B03404   
19663905            HV0003               B03404               B03404   
19663906            HV0003               B03404               B03404   
19663907            HV0003               B03404               B03404   
19663908            HV0003               B03404               B03404   
19663909            HV0003               B03404               B03404   
19663910            HV0003               B03404               B03404   
19663911            HV0003               B03404               B03404   
19663916            HV0003               B03404               B03404   
19663919            HV0003               B03404               B03404   
19663920            HV0003               B03404               B03404   
19663921            HV0003               B03404               B03404   
19663922            HV0003               B03404               B0