In [None]:

import pandas as pd
import glob
import os
import zipfile

# --- 1. Folder layout on Google Drive ---
# main_data_path is the tree that is searched for .zip and .csv inputs;
# processed_data_path is where this notebook writes its Parquet output.
main_data_path = "/content/drive/MyDrive/citi_bike_project/data/"
processed_data_path = "/content/drive/MyDrive/citi_bike_project/processed_data/"

# --- 2. Mount Google Drive (Colab only) ---
# force_remount=True re-attaches the drive even if an earlier mount exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# --- 3. Make sure the output folder exists (no error if it already does) ---
os.makedirs(processed_data_path, exist_ok=True)

# PART A: UNZIP ALL .ZIP FILES
#
# Walks main_data_path recursively, collects every zip archive, and extracts
# each one into the folder that contains it. Archives that cannot be opened
# as zip files are reported and skipped rather than aborting the run.

print("--- Starting Part A: Unzipping Files ---")

# Use os.walk to find all .zip files in all subdirectories.
# The extension check is case-insensitive (".ZIP" is accepted too) and the
# list is sorted so the extraction order and the log are the same every run.
all_zip_files = sorted(
    os.path.join(root, filename)
    for root, dirs, files in os.walk(main_data_path)
    for filename in files
    if filename.lower().endswith(".zip")
)

print(f"Found {len(all_zip_files)} total .zip files to extract.")

# Loop through each found zip file and extract it next to the archive
for file_path in all_zip_files:
    # The extracted file will go into the same folder where the zip file is.
    extract_path = os.path.dirname(file_path)
    print(f"Extracting: {os.path.basename(file_path)} to {extract_path}")
    try:
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            # Archives created on macOS carry a "__MACOSX/" folder and "._name"
            # AppleDouble entries. They keep the original extension (e.g. ".csv")
            # but hold metadata, not data, so extracting them would pollute the
            # CSV discovery done later. Only the real members are extracted.
            real_members = [
                member
                for member in zip_ref.namelist()
                if "__MACOSX" not in member.split("/")
                and not os.path.basename(member).startswith("._")
            ]
            zip_ref.extractall(extract_path, members=real_members)
        # Optional: Delete the zip file after successful extraction to save space
        # os.remove(file_path)
    except zipfile.BadZipFile:
        print(f" Warning: Skipping {os.path.basename(file_path)} as it's not a valid zip file.")

print("\n Unzipping complete!")

# PART B: COMBINE ALL .CSV FILES
#
# Walks main_data_path recursively, loads every CSV into a DataFrame,
# concatenates them into master_df and saves the result as
# citibike_master.parquet inside processed_data_path.

print("\n-Starting Part B: Combining All CSV Files ")

# Use os.walk again to find all .csv files now that everything is unzipped
all_csv_files = []
for root, dirs, files in os.walk(main_data_path):
    # Prune macOS archive-metadata folders in place so os.walk never descends
    # into them; their "*.csv" entries are not real CSV data.
    dirs[:] = [d for d in dirs if d != "__MACOSX"]
    for filename in files:
        # "._name.csv" files are AppleDouble resource forks left behind by
        # macOS-created archives; pd.read_csv cannot parse them as trip data.
        if filename.startswith("._"):
            continue
        # Case-insensitive so that ".CSV" files are picked up as well.
        if filename.lower().endswith(".csv"):
            all_csv_files.append(os.path.join(root, filename))

# os.walk yields entries in arbitrary order; sorting makes the row order of
# the combined dataset reproducible from run to run.
all_csv_files.sort()

print(f"Found {len(all_csv_files)} total CSV files to combine.")

if not all_csv_files:
    print("Error: No CSV files were found after unzipping.")
else:
    print("Loading and combining files... This will take a while. ⏳")

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    list_of_dataframes = [pd.read_csv(file, low_memory=False) for file in all_csv_files]
    # ignore_index=True gives the combined frame one fresh 0..n-1 index.
    master_df = pd.concat(list_of_dataframes, ignore_index=True)

    print("Data combination complete!")
    print(f"Your master dataset has {len(master_df)} rows (trips).")

    # Save the result as a Parquet file ---
    output_file = os.path.join(processed_data_path, "citibike_master.parquet")
    print(f"\nSaving combined data to Parquet file: {output_file}")
    master_df.to_parquet(output_file)

    print("Master file saved successfully! You are now ready for Phase 3.")


Mounted at /content/drive
--- Starting Part A: Unzipping Files ---
Found 48 total .zip files to extract.
Extracting: 202001-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202002-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202003-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202004-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202005-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202006-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202007-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-tripdata
Extracting: 202008-citibike-tripdata.zip to /content/drive/MyDrive/citi_bike_project/data/2020-citibike-

In [None]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

# --- 1. Define File Paths ---
processed_data_path = "/content/drive/MyDrive/citi_bike_project/processed_data/"
source_file = os.path.join(processed_data_path, "citibike_master.parquet")
# This will create a new, uncorrupted version of your cleaned file
output_file = os.path.join(processed_data_path, "citibike_cleaned.parquet")
# Chunks are written to a temporary file first and only moved to output_file
# once every chunk has been written and the writer has been closed. A failure
# part-way through therefore never leaves a truncated file at output_file.
temp_output_file = output_file + ".tmp"

print(f"Reading from: {source_file}")
print(f"Will re-create the cleaned data file at: {output_file}")

# --- 2. Open the source file and create a writer for the new file ---
parquet_file = pq.ParquetFile(source_file)
parquet_writer = None  # created lazily, once the first chunk's schema is known
all_chunks_written = False

# --- 3. Process the file in chunks ---
# Only one batch of up to 1,000,000 rows is held in memory at a time.
print("\nStarting cleaning and feature engineering process...")
try:
    for i, batch in enumerate(parquet_file.iter_batches(batch_size=1000000)):
        print(f"  -> Processing chunk {i+1}...")
        chunk_df = batch.to_pandas()

        # Perform cleaning and feature engineering on the chunk ---
        chunk_df['started_at'] = pd.to_datetime(chunk_df['started_at'])
        chunk_df['ended_at'] = pd.to_datetime(chunk_df['ended_at'])
        chunk_df['hour'] = chunk_df['started_at'].dt.hour
        chunk_df['day_of_week'] = chunk_df['started_at'].dt.day_name()
        chunk_df['month'] = chunk_df['started_at'].dt.month
        chunk_df['trip_duration_minutes'] = (chunk_df['ended_at'] - chunk_df['started_at']).dt.total_seconds() / 60

        # Write the processed chunk to the new Parquet file ---
        table = pa.Table.from_pandas(chunk_df)
        if parquet_writer is None:
            # The first chunk fixes the schema of the output file.
            parquet_writer = pq.ParquetWriter(temp_output_file, table.schema)
        elif not table.schema.equals(parquet_writer.schema):
            # Column dtypes are inferred per chunk and can drift: for example
            # .dt.hour is an integer column normally but becomes float when the
            # chunk contains missing timestamps. Cast to the file's schema so
            # write_table does not reject the chunk for a type mismatch.
            table = table.cast(parquet_writer.schema)
        parquet_writer.write_table(table)
    all_chunks_written = True
finally:
    # Close the writer to finalize the file, even if a chunk raised; an
    # unclosed writer leaves a Parquet file without its footer.
    if parquet_writer is not None:
        parquet_writer.close()
    # Do not keep a partial result around after a failure.
    if not all_chunks_written and os.path.exists(temp_output_file):
        os.remove(temp_output_file)

if parquet_writer is not None:
    # Every chunk was written: move the finished file into place.
    os.replace(temp_output_file, output_file)
    print("\n New 'citibike_cleaned.parquet' file has been created successfully.")
else:
    # The source yielded no batches, so no output file was written.
    print("\n Warning: the source file contained no rows; 'citibike_cleaned.parquet' was not created.")

Reading from: /content/drive/MyDrive/citi_bike_project/processed_data/citibike_master.parquet
Will re-create the cleaned data file at: /content/drive/MyDrive/citi_bike_project/processed_data/citibike_cleaned.parquet

Starting cleaning and feature engineering process...
  -> Processing chunk 1...
  -> Processing chunk 2...
  -> Processing chunk 3...
  -> Processing chunk 4...
  -> Processing chunk 5...
  -> Processing chunk 6...
  -> Processing chunk 7...
  -> Processing chunk 8...
  -> Processing chunk 9...
  -> Processing chunk 10...
  -> Processing chunk 11...
  -> Processing chunk 12...
  -> Processing chunk 13...
  -> Processing chunk 14...
  -> Processing chunk 15...
  -> Processing chunk 16...
  -> Processing chunk 17...
  -> Processing chunk 18...
  -> Processing chunk 19...
  -> Processing chunk 20...
  -> Processing chunk 21...
  -> Processing chunk 22...
  -> Processing chunk 23...
  -> Processing chunk 24...
  -> Processing chunk 25...
  -> Processing chunk 26...
  -> Proces