# 📌 Step 1: Import Required Libraries 📌

In [12]:

import pandas as pd
import os
import requests

# Show every column when a dataframe is displayed (None removes the column cap)
pd.options.display.max_columns = None

# 📌 Step 2: Define Folder Path & Check Files 📌

In [13]:
# Directory that holds the CitiBike trip CSV files read by the next cell.
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will only find the data where this exact directory exists.
folder_path = r"C:\Users\Asus\Music\CitiBike_2022_Data\2022-citibike-tripdata\202201-citibike-tripdata"

# Report whether the path is present before any file is read.
# This only prints a message; it does not stop execution when the path is missing.
if os.path.exists(folder_path):
    print("✅ Folder path exists. Proceeding with file reading...")
else:
    print("❌ Error: The specified folder path does not exist.")

✅ Folder path exists. Proceeding with file reading...


# 📌 Step 3: Read and Merge All Monthly Data Files 📌

In [14]:
# Collect the CSV file names in sorted order. os.listdir() returns entries in
# arbitrary order, so sorting keeps the row order of the merged dataframe
# reproducible between runs. The extension check is case-insensitive so files
# named "*.CSV" are picked up as well.
csv_files = sorted(
    name for name in os.listdir(folder_path) if name.lower().endswith(".csv")
)

# Read each CSV file into its own dataframe
df_list = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    print(f"📂 Reading file: {file_path}")  # Progress message, one per file

    # low_memory=False avoids the mixed-dtype warning pandas can emit when it
    # infers column types chunk by chunk
    df = pd.read_csv(file_path, low_memory=False)
    df_list.append(df)

# Only concatenate when at least one CSV file was read
# (pd.concat raises on an empty list)
if df_list:
    # Stack all files into one dataframe with a fresh 0..n-1 index
    all_data = pd.concat(df_list, ignore_index=True)

    # Display the first few rows
    print("\n✅ Successfully loaded the data!")
    print(all_data.head())
else:
    # NOTE: all_data is not assigned on this path, so later cells that use it
    # will fail or pick up a stale value left over from an earlier run.
    print("\n⚠ No CSV files found in the specified folder.")

📂 Reading file: C:\Users\Asus\Music\CitiBike_2022_Data\2022-citibike-tripdata\202201-citibike-tripdata\202201-citibike-tripdata_1.csv
📂 Reading file: C:\Users\Asus\Music\CitiBike_2022_Data\2022-citibike-tripdata\202201-citibike-tripdata\202201-citibike-tripdata_2.csv

✅ Successfully loaded the data!
            ride_id  rideable_type               started_at  \
0  BFD29218AB271154  electric_bike  2022-01-21 13:13:43.392   
1  7C953F2FD7BE1302   classic_bike  2022-01-10 11:30:54.162   
2  95893ABD40CED4B8  electric_bike  2022-01-26 10:52:43.096   
3  F853B50772137378   classic_bike  2022-01-03 08:35:48.247   
4  7590ADF834797B4B   classic_bike  2022-01-22 14:14:23.043   

                  ended_at       start_station_name start_station_id  \
0  2022-01-21 13:22:31.463  West End Ave & W 107 St          7650.05   
1  2022-01-10 11:41:43.422             4 Ave & 3 St          4028.04   
2  2022-01-26 11:06:35.227          1 Ave & E 62 St          6753.08   
3  2022-01-03 09:10:50.475      

# 📌 Step 4: Save the Merged CitiBike Data 📌

In [15]:
# Write the combined trips table to disk (row index omitted) so it can be
# inspected or reloaded without re-reading the individual monthly files
merged_trips_csv = "citibike_merged.csv"
all_data.to_csv(merged_trips_csv, index=False)
print("✅ Merged CitiBike data saved as 'citibike_merged.csv'!")


✅ Merged CitiBike data saved as 'citibike_merged.csv'!


# 📌 Step 5: Get Weather Data for 2022 📌

In [16]:
# NOAA API token, read from the NOAA_API_TOKEN environment variable so the
# credential is not stored in the notebook.
# NOTE(review): a token was previously hard-coded in this cell; treat it as
# exposed and rotate it.
# NOTE(review): this endpoint appears not to require a token at all — confirm;
# the header is only sent when a token is configured.
token = os.environ.get("NOAA_API_TOKEN")
headers = {"token": token} if token else {}

# Define the API URL and parameters
url = "https://www.ncei.noaa.gov/access/services/data/v1"
params = {
    "dataset": "daily-summaries",
    # NOTE(review): the original comment labelled this station "LaGuardia
    # Airport", but USW00094728 appears to be the NYC Central Park station
    # (LaGuardia would be USW00014732) — confirm which station is intended.
    "stations": "USW00094728",
    "startDate": "2022-01-01",
    "endDate": "2022-12-31",
    "dataTypes": "TMAX,TMIN,PRCP",  # daily max temp, min temp, precipitation
    "format": "csv"
}

# Send the request to the NOAA API. The timeout keeps the cell from hanging
# indefinitely if the service does not respond.
response = requests.get(url, headers=headers, params=params, timeout=60)

# Fail loudly on an HTTP error instead of saving the error body as if it were
# weather data (which would only surface later as a confusing parse failure).
response.raise_for_status()

# Save the response content to a CSV file
weather_file = "weather_2022.csv"
with open(weather_file, "wb") as f:
    f.write(response.content)

print(f"✅ Weather data saved as '{weather_file}'!")

✅ Weather data saved as 'weather_2022.csv'!


# 📌 Step 6: Merge CitiBike Data with Weather Data 📌

In [17]:
# Load the weather data written by the previous step
weather = pd.read_csv("weather_2022.csv")

# Parse the weather 'DATE' column so it can be matched against trip dates
weather['DATE'] = pd.to_datetime(weather['DATE'])

# Guard against the CitiBike cell not having produced all_data (for example
# when no CSV files were found), which would otherwise raise a NameError here
if 'all_data' in globals():
    # Derive the calendar day of each trip: parse 'started_at' and truncate the
    # time-of-day to midnight. This adds a 'start_date' column to all_data.
    all_data['start_date'] = pd.to_datetime(all_data['started_at']).dt.normalize()

    # Attach the weather of the start day to every trip. The left join keeps
    # trips with no matching weather row. validate="many_to_one" makes pandas
    # raise if the weather table has more than one row for a date, instead of
    # silently duplicating trips.
    merged_data = pd.merge(
        all_data,
        weather,
        left_on='start_date',
        right_on='DATE',
        how='left',
        validate='many_to_one'
    )

    # Save the merged dataset
    merged_data.to_csv("merged_citibike_weather.csv", index=False)
    print("✅ Merged CitiBike & Weather data saved as 'merged_citibike_weather.csv'!")
else:
    print("❌ Error: 'all_data' is not defined. Ensure the CitiBike data is loaded properly.")


✅ Merged CitiBike & Weather data saved as 'merged_citibike_weather.csv'!
