In [None]:
# prompt: /content/manhattan_stations_urbanFeatures.csv

import pandas as pd

# Load the station-level urban-features CSV and show its first rows.
# Any failure is reported as a short message instead of a traceback.
urban_csv_path = '/content/manhattan_stations_urbanfeatures.csv'
try:
    df = pd.read_csv(urban_csv_path)
    print(df.head())
except Exception as err:
    # Checked in the same order as the specific handlers would be tried.
    if isinstance(err, FileNotFoundError):
        print("Error: File not found. Please check the file path.")
    elif isinstance(err, pd.errors.EmptyDataError):
        print("Error: The CSV file is empty.")
    elif isinstance(err, pd.errors.ParserError):
        print("Error: There was an issue parsing the CSV file. Please check its format.")
    else:
        print(f"An unexpected error occurred: {err}")


  start_station_id  start_lat  start_lng  num_parks  num_malls  num_theaters
0          4818.03  40.700295 -73.950323          2          0             0
1          4821.03  40.700119 -73.986200          0          0             0
2           4821.1  40.700763 -73.988698          0          0             0
3          4829.01  40.700379 -73.995481          0          0             0
4          4832.07  40.701120 -73.930390          0          0             1


In [None]:
import pandas as pd

# One parquet file of station time series per year.
file_paths = [
    '/content/cpz_station_timeseries_2021_yearly_15min.parquet',
    '/content/cpz_station_timeseries_2022_yearly_15min.parquet',
    '/content/cpz_station_timeseries_2023_yearly_15min.parquet',
    '/content/cpz_station_timeseries_2024_yearly_15min.parquet',
]

# Print the first rows of every file; a failure on one file is reported
# and the loop carries on with the next one.
for file_path in file_paths:
    try:
        df = pd.read_parquet(file_path)
        print(f"Head of {file_path}:")
        print(df.head())
        print("-" * 20)  # visual separator between files
    except Exception as err:
        # Same precedence as a chain of specific `except` clauses.
        if isinstance(err, FileNotFoundError):
            print(f"Error: File not found - {file_path}")
        elif isinstance(err, pd.errors.EmptyDataError):
            print(f"Error: The parquet file is empty - {file_path}")
        elif isinstance(err, pd.errors.ParserError):
            print(f"Error: There was an issue parsing the parquet file - {file_path}")
        else:
            print(f"An unexpected error occurred while reading {file_path}: {err}")


Head of /content/cpz_station_timeseries_2021_yearly_15min.parquet:
  start_station_id         time_bucket  booking_count
0          2733.03 2021-12-01 13:45:00              1
1          2733.03 2021-12-01 15:45:00              1
2          2733.03 2021-12-02 14:00:00              1
3          2733.03 2021-12-02 15:00:00              1
4          2733.03 2021-12-02 15:15:00              1
--------------------
Head of /content/cpz_station_timeseries_2022_yearly_15min.parquet:
  start_station_id         time_bucket  booking_count
0          2733.03 2022-01-02 13:15:00              1
1          2733.03 2022-01-02 13:45:00              1
2          2733.03 2022-01-02 14:45:00              1
3          2733.03 2022-01-02 15:45:00              1
4          2733.03 2022-01-02 16:30:00              2
--------------------
Head of /content/cpz_station_timeseries_2023_yearly_15min.parquet:
  start_station_id         time_bucket  booking_count
0          2733.03 2023-01-01 00:00:00              1
1

In [None]:
# prompt: /content/Final_weather.csv prints the head and tail

import pandas as pd

# Load the daily weather CSV and show both ends of the table.
# Any failure is reported as a short message instead of a traceback.
weather_csv_path = '/content/Final_weather.csv'
try:
    df = pd.read_csv(weather_csv_path)
    print(df.head())  # first rows
    print(df.tail())  # last rows
except Exception as err:
    # Checked in the same order as the specific handlers would be tried.
    if isinstance(err, FileNotFoundError):
        print("Error: File not found. Please check the file path.")
    elif isinstance(err, pd.errors.EmptyDataError):
        print("Error: The CSV file is empty.")
    elif isinstance(err, pd.errors.ParserError):
        print("Error: There was an issue parsing the CSV file. Please check its format.")
    else:
        print(f"An unexpected error occurred: {err}")


         DATE  TMIN  TMAX  TAVG   AWND  PRCP  SNOW WEATHER_CATEGORY
0  2020-01-01    34    41  37.5   8.50  0.00   0.0           cloudy
1  2020-01-02    33    49  41.0   5.37  0.00   0.0           cloudy
2  2020-01-03    44    49  46.5   3.36  0.15   0.0            rainy
3  2020-01-04    41    51  46.0   4.47  0.27   0.0            rainy
4  2020-01-05    35    42  38.5  11.41  0.00   0.0           cloudy
            DATE  TMIN  TMAX  TAVG  AWND  PRCP  SNOW WEATHER_CATEGORY
1116  2025-02-24    35    49  42.0  5.82  0.00   0.0           cloudy
1117  2025-02-25    43    57  50.0  3.58  0.00   0.0           cloudy
1118  2025-02-26    43    56  49.5  5.37  0.00   0.0           cloudy
1119  2025-02-27    44    54  49.0  5.37  0.01   0.0            rainy
1120  2025-02-28    41    51  46.0  6.71  0.00   0.0           cloudy


In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import gc  # garbage collector

# Build a single merged Parquet file: for each year, the station time series
# is left-joined with the per-station urban features and the daily weather,
# then appended to the output file. Years are processed one at a time so only
# one year's data is held in memory.

# Load static datasets once; they are reused unchanged for every year.
df_urban = pd.read_csv('/content/urban_features.csv')
df_weather = pd.read_csv('/content/weather.csv')
# Swap the raw DATE column for a calendar-date `date` key; the trip rows get
# the same key below so weather can be joined per day.
df_weather = (
    df_weather
    .assign(date=pd.to_datetime(df_weather['DATE']).dt.date)
    .drop(columns=['DATE'])
)

# Output file
output_path = '/content/citibike_full_merged.parquet'
# Created lazily: the writer needs the schema of the first merged table.
writer = None

try:
    for year in [2021, 2022, 2023, 2024]:
        print(f"Processing {year}...")

        # Load trips and derive the daily join key from the time bucket.
        df_trips = pd.read_parquet(f"/content/cpz_station_timeseries_{year}_yearly_15min.parquet")
        df_trips['time_bucket'] = pd.to_datetime(df_trips['time_bucket'])
        df_trips['date'] = df_trips['time_bucket'].dt.date

        # Left joins keep every trip row even when a station or a date has
        # no matching urban-feature / weather record.
        df_merged = pd.merge(df_trips, df_urban, on='start_station_id', how='left')
        df_merged = pd.merge(df_merged, df_weather, on='date', how='left')

        # Convert to a PyArrow table and append it to the output file.
        table = pa.Table.from_pandas(df_merged)

        if writer is None:
            writer = pq.ParquetWriter(output_path, table.schema)
        writer.write_table(table)

        # Release this year's frames before loading the next one.
        del df_trips, df_merged, table
        gc.collect()
finally:
    # Always close the writer, even if a year fails part-way: closing writes
    # the Parquet footer and releases the file handle.
    if writer is not None:
        writer.close()

# Only reached when every year was written without an exception.
print("✅ All years processed and saved to Parquet successfully.")


Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
✅ All years processed and saved to Parquet successfully.


In [None]:
# prompt: /content/full_merged.parquet. print the head and tail of this
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Preview the merged file without loading all of it.
# pd.read_parquet(...) followed by .head(...) materialises every row before
# trimming, so this cell streams record batches instead and stops as soon as
# enough rows have been collected.
SAMPLE_ROWS = 100_000

parquet_file = pq.ParquetFile('/content/full_merged.parquet')

sample_batches = []
rows_collected = 0
for batch in parquet_file.iter_batches(batch_size=SAMPLE_ROWS):
    sample_batches.append(batch)
    rows_collected += batch.num_rows
    if rows_collected >= SAMPLE_ROWS:
        break  # enough rows; leave the rest of the file unread

if sample_batches:
    sample_table = pa.Table.from_batches(sample_batches)
else:
    # Empty file: keep the column layout so the preview below still works.
    sample_table = parquet_file.schema_arrow.empty_table()

# A batch may overshoot the target, so trim to exactly SAMPLE_ROWS at most
# and give the sample a clean 0..n-1 index.
df_sample = sample_table.slice(0, SAMPLE_ROWS).to_pandas().reset_index(drop=True)

print(df_sample.shape)
df_sample.head()


KeyboardInterrupt: 

## Crashing due to large dataset issues

In [None]:
import duckdb

# Open an in-memory DuckDB connection.
con = duckdb.connect()

# Ask DuckDB to describe the columns of the merged parquet file; the result
# is returned as a DataFrame and displayed as the cell output.
schema_query = "DESCRIBE SELECT * FROM '/content/full_merged.parquet'"
con.execute(schema_query).fetchdf()


Unnamed: 0,column_name,column_type,null,key,default,extra
0,start_station_id,VARCHAR,YES,,,
1,time_bucket,TIMESTAMP_NS,YES,,,
2,booking_count,BIGINT,YES,,,
3,date,DATE,YES,,,
4,start_lat,DOUBLE,YES,,,
5,start_lng,DOUBLE,YES,,,
6,num_parks,DOUBLE,YES,,,
7,num_malls,DOUBLE,YES,,,
8,num_theaters,DOUBLE,YES,,,
9,TMIN,DOUBLE,YES,,,
