# Import libraries

In [None]:
import geopandas as gpd
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from shapely.geometry import Point
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the combined Citi Bike data CSV into a DataFrame
new_york_data = pd.read_csv("/content/final_selected_combined_citibike_data.csv")

# Filter out stations outside Manhattan

In [None]:
# Keep only rows whose coordinates fall inside an approximate Manhattan bounding box
# (both bounds inclusive on each axis)
lat_in_box = new_york_data['latitude'].between(40.7000, 40.8800)
lon_in_box = new_york_data['longitude'].between(-74.0200, -73.9100)
manhattan_data = new_york_data[lat_in_box & lon_in_box]

# Keeping only frequently used stations

In [None]:
# Denominator used below to turn each station's total pickups into an hourly average.
# NOTE(review): hard-coded value; presumably the number of hours covered by the dataset —
# confirm against the actual date range of the data.
total_hours = 41637

In [None]:
# Average pickups per hour for each station: total pickups divided by the hour count
station_pickup_totals = manhattan_data.groupby('station_id')['start_count'].sum()
avg_hourly_pickups = station_pickup_totals / total_hours

# Stations averaging fewer than one pickup per hour are treated as low demand
low_demand_stations = avg_hourly_pickups[avg_hourly_pickups < 1].index.tolist()

# Remove the low-demand stations. The explicit .copy() makes the result an independent
# DataFrame, so later column assignments do not trigger SettingWithCopyWarning.
manhattan_filtered = manhattan_data[~manhattan_data['station_id'].isin(low_demand_stations)].copy()

# Display the first few rows of the filtered Manhattan data
manhattan_filtered.head()


Unnamed: 0,station_id,day,hour,start_count,end_count,latitude,longitude,station_name
399318,435.0,2020-09-01,0.0,1.0,2.0,40.74174,-73.994156,W 21 St & 6 Ave
399319,435.0,2020-09-01,2.0,1.0,0.0,40.74174,-73.994156,W 21 St & 6 Ave
399320,435.0,2020-09-01,5.0,1.0,0.0,40.74174,-73.994156,W 21 St & 6 Ave
399321,435.0,2020-09-01,6.0,7.0,4.0,40.74174,-73.994156,W 21 St & 6 Ave
399322,435.0,2020-09-01,7.0,7.0,8.0,40.74174,-73.994156,W 21 St & 6 Ave


In [None]:
# Parse 'day' as datetime. assign() returns a new DataFrame instead of writing into a
# filtered slice, which avoids SettingWithCopyWarning.
manhattan_filtered = manhattan_filtered.assign(day=pd.to_datetime(manhattan_filtered['day']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan_filtered['day'] = pd.to_datetime(manhattan_filtered['day'])


# Keeping only data from 2021 onwards

In [None]:
# Ensure the 'day' column is in datetime format (assign() avoids writing into a slice)
manhattan_filtered = manhattan_filtered.assign(day=pd.to_datetime(manhattan_filtered['day']))

# Keep only rows from 2021-01-01 onward. The .copy() makes the result an independent
# DataFrame so that columns can be added to it later without SettingWithCopyWarning.
manhattan_filtered_2021_onward = manhattan_filtered[manhattan_filtered['day'] >= '2021-01-01'].copy()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  manhattan_filtered['day'] = pd.to_datetime(manhattan_filtered['day'])


In [None]:
# Derive the calendar year of each row (assign() returns a new frame rather than
# writing into a filtered slice)
manhattan_filtered_2021_onward = manhattan_filtered_2021_onward.assign(
    year=manhattan_filtered_2021_onward['day'].dt.year
)

# For each station name, count in how many distinct years it appears
station_years_short = manhattan_filtered_2021_onward.groupby(['station_name'])['year'].nunique().reset_index()



In [None]:
# Number of distinct years in the data, counted the same way (nunique) as the
# per-station year counts it is compared against
n_years_in_data = manhattan_filtered_2021_onward['year'].nunique()

# Stations that appear in every year of the data
stations_all_years_short = station_years_short[station_years_short['year'] == n_years_in_data]

# Restrict the main dataset to those stations. The .copy() yields an independent frame
# so that later column assignments do not raise SettingWithCopyWarning.
filtered_data_short = manhattan_filtered_2021_onward[
    manhattan_filtered_2021_onward['station_name'].isin(stations_all_years_short['station_name'])
].copy()

filtered_data_short

In [None]:
# Some station names occur with more than one station_id; use the highest id per name.
max_station_ids = filtered_data_short.groupby('station_name')['station_id'].max()

# Replace every row's station_id with the highest id recorded for its station_name.
# assign() builds a new frame rather than writing into a possibly-sliced one.
filtered_data_short = filtered_data_short.assign(
    station_id=filtered_data_short['station_name'].map(max_station_ids)
)

In [None]:
# Persist the filtered station data to Drive (row index not written)
filtered_data_short.to_csv("/content/drive/MyDrive/Colab Notebooks/new_york_data_short.csv", index=False)


# Adding weather variables

In [None]:
# Reload the filtered station data saved above as the base table for the weather merge
merged_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/new_york_data_short.csv")

In [None]:
# Load the weather export; the first two lines of the file are skipped before the header row
# NOTE(review): presumably those lines are file metadata — confirm against the CSV
weather_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/open-meteo-40.70N74.00W51m.csv",  delimiter= ",", skiprows=2)

In [None]:
# The weather merge joins on a 'date' column, so make sure the calendar day is stored
# under that name. Previously 'day' was dropped *before* being renamed, leaving no
# 'date' column and causing a KeyError on the next line.
if 'date' not in merged_data.columns:
    merged_data = merged_data.rename(columns={'day': 'date'})
else:
    # 'date' already exists: a remaining 'day' column would be a duplicate, so drop it
    merged_data = merged_data.drop(columns=['day'], errors='ignore')
merged_data['date'] = pd.to_datetime(merged_data['date'])

In [None]:
# Convert the 'time' column to datetime format
weather_data['time'] = pd.to_datetime(weather_data['time'])

# Extract the hour of day
weather_data['hour'] = weather_data['time'].dt.hour

# Extract the calendar date as a midnight timestamp. normalize() keeps the datetime64
# dtype; .dt.date would give Python date objects (object dtype), which pandas refuses
# to merge against a datetime64 'date' key.
weather_data['date'] = weather_data['time'].dt.normalize()

In [None]:
# Merge keys first, followed by the weather measurements to keep
selected_columns = ['hour', 'date'] + [
    'temperature_2m (°C)',
    'relativehumidity_2m (%)',
    'precipitation (mm)',
    'windspeed_10m (km/h)',
    'snowfall (cm)',
    'direct_radiation (W/m²)',
    'cloudcover (%)',
]
# Reduce the weather table to exactly those columns, in that order
weather_data = weather_data.loc[:, selected_columns]

In [None]:
# Attach the hourly weather to each station-hour row; rows without a matching
# (hour, date) pair on both sides are dropped by the inner join
merged_data = pd.merge(merged_data, weather_data, how='inner', on=['hour', 'date'])

# Name of the weekday for each row's date
merged_data = merged_data.assign(day_of_week=merged_data['date'].dt.day_name())


In [None]:
# Save the weather-enriched data.
# NOTE(review): this overwrites the same file that the "Adding weather variables" section
# reads as its input, so re-running that section starts from already-merged data.
merged_data.to_csv("/content/drive/MyDrive/Colab Notebooks/new_york_data_short.csv", index=False)


# Adding Holiday variable

In [None]:
# Reload the weather-enriched data written at the end of the previous section
merged_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/new_york_data_short.csv")

In [None]:
# Holiday names, in the order they occur within a year
_HOLIDAY_NAMES = (
    "New Year's Day",
    "Martin Luther King Jr. Day",
    "Washington's Birthday",
    "Memorial Day",
    "Juneteenth National Independence Day",
    "Independence Day",
    "Labor Day",
    "Columbus Day",
    "Election Day",
    "Veterans Day",
    "Thanksgiving Day",
    "Christmas Day",
)

# Per-year dates, aligned position-by-position with _HOLIDAY_NAMES
_HOLIDAY_DATES_BY_YEAR = {
    2021: ("January 1", "January 18", "February 15", "May 31", "June 19", "July 4",
           "September 6", "October 11", "November 2", "November 11", "November 25", "December 25"),
    2022: ("January 1", "January 17", "February 21", "May 30", "June 19", "July 4",
           "September 5", "October 10", "November 8", "November 11", "November 24", "December 25"),
    2023: ("January 1", "January 16", "February 20", "May 29", "June 19", "July 4",
           "September 4", "October 9", "November 7", "November 11", "November 23", "December 25"),
}

# One record per (year, holiday), ordered by year and then by position in the year
holidays_ny = [
    {"Year": year, "Holiday": holiday, "Date": date}
    for year, dates in _HOLIDAY_DATES_BY_YEAR.items()
    for holiday, date in zip(_HOLIDAY_NAMES, dates)
]


In [None]:
holidays_df = pd.DataFrame(holidays_ny)

# Build a proper datetime from the "Month day" text and the year. An explicit format
# avoids relying on free-form date guessing for strings such as "January 1-2021".
holidays_df['date'] = pd.to_datetime(
    holidays_df['Date'] + ' ' + holidays_df['Year'].astype(str),
    format='%B %d %Y',
)
holidays_df['IsHoliday'] = 1  # Binary flag: every row in this table is a holiday
holidays_df = holidays_df[['date', 'IsHoliday']]  # Only keep the merge key and the flag


In [None]:
# The date column comes back from CSV as text; parse it so it matches holidays_df['date']
merged_data['date'] = pd.to_datetime(merged_data['date'])

# Left-join the holiday flag onto every row by date
merged_df = pd.merge(merged_data, holidays_df, on='date', how='left')

# Dates without a holiday have NaN after the join: mark them 0 and store the flag as int.
# Assigning the result back (instead of fillna(inplace=True) on a column selection)
# avoids chained-assignment warnings and works under pandas copy-on-write.
merged_df['IsHoliday'] = merged_df['IsHoliday'].fillna(0).astype(int)
merged_df

In [None]:
# Load the previously saved short dataset.
# NOTE(review): this file is only written further down (end of the bike-lanes section), so on a
# fresh run it must already exist from an earlier session — confirm the intended order.
merged_data_short = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/merged_data_short.csv")

In [None]:
# Keep only the rows of merged_df whose station_id also appears in unique_stations['station_id'].
# NOTE(review): `unique_stations` is not defined anywhere in the visible notebook — confirm
# where it is created before this cell runs.
merged_data = merged_df[merged_df['station_id'].isin(unique_stations['station_id'])]


# Adding bike lanes

In [None]:
# Load the GeoJSON file containing the bike lanes into a GeoDataFrame
bike_lanes_geojson_path = '/content/drive/MyDrive/Colab Notebooks/New York City Bike Routes (1) (1).geojson'
bike_lanes_gdf = gpd.read_file(bike_lanes_geojson_path)



In [None]:
# Convert the bike stations DataFrame to a GeoDataFrame with one point per station,
# declared as WGS84 longitude/latitude (EPSG:4326).
# NOTE(review): `bike_stations_df` is only assigned in the later "additional spatial
# features" section — confirm it is defined before this cell runs.
gdf_stations = gpd.GeoDataFrame(
    bike_stations_df,
    geometry=gpd.points_from_xy(bike_stations_df.longitude, bike_stations_df.latitude),
    crs="EPSG:4326",
)

# Reproject to New York Long Island State Plane (EPSG:2263), a projected CRS suited to
# the New York area. Its coordinates are expressed in US survey feet, not metres.
# The authority-string / epsg= forms replace the deprecated {'init': 'epsg:...'} syntax.
gdf_stations = gdf_stations.to_crs(epsg=2263)

# Reproject the bike lanes GeoDataFrame to the same CRS
bike_lanes_gdf = bike_lanes_gdf.to_crs(epsg=2263)

# Check the reprojected station data
gdf_stations.head()


In [None]:
def calculate_bike_lane_length(point, bike_lanes_gdf, radius_m=200):
    """Return the length, in metres, of bike lanes lying within ``radius_m`` of ``point``.

    Parameters
    ----------
    point : shapely geometry
        Station location, expressed in the same CRS as ``bike_lanes_gdf``.
    bike_lanes_gdf : geopandas.GeoDataFrame
        Bike-lane geometries in a projected CRS.
    radius_m : float, default 200
        Buffer radius around the point, in metres.

    Returns
    -------
    float
        Total lane length inside the buffer, in metres.

    Notes
    -----
    The buffer radius and the measured lengths are converted between metres and the
    CRS's own linear unit (EPSG:2263, for instance, is in US survey feet). Without
    that conversion ``buffer(200)`` would be 200 feet and the result would be in feet.
    Lanes are clipped to the buffer, so only the part of each lane that is actually
    inside the radius is counted rather than the full length of every lane touching it.
    If the CRS is missing or not projected, coordinates are assumed to be in metres.
    """
    crs = bike_lanes_gdf.crs
    metres_per_unit = 1.0
    if crs is not None and crs.is_projected and crs.axis_info:
        # Size of one CRS length unit in metres (1.0 for metric CRSs)
        metres_per_unit = crs.axis_info[0].unit_conversion_factor

    buffer = point.buffer(radius_m / metres_per_unit)
    intersecting_lanes = bike_lanes_gdf[bike_lanes_gdf.intersects(buffer)]
    # Keep only the portion of each lane that falls inside the buffer
    lanes_in_buffer = intersecting_lanes.geometry.intersection(buffer)
    total_length = lanes_in_buffer.length.sum() * metres_per_unit
    return total_length

# Compute the lane length around every station geometry, then rescale by 1/1000 (m -> km)
lane_lengths = gdf_stations['geometry'].apply(
    lambda station_geom: calculate_bike_lane_length(station_geom, bike_lanes_gdf)
)
gdf_stations['bike_lane_length_km'] = lane_lengths / 1000

gdf_stations[['station_id', 'bike_lane_length_km']].head()


In [None]:
# Remove the coordinate and geometry columns in place, leaving the other station columns
# (including bike_lane_length_km) for the merge on station_id below
gdf_stations.drop(columns=['latitude', 'longitude', 'geometry'], inplace=True)

In [None]:
# Attach the per-station columns of gdf_stations to every row; the inner join drops rows
# whose station_id is missing from either table
merged_data_short = merged_data_short.merge(gdf_stations, on=['station_id'], how='inner')

In [None]:
# NOTE(review): this writes `merged_data`, whereas the bike-lane columns were merged into
# `merged_data_short` in the previous cell — confirm which frame is meant to be saved here.
merged_data.to_csv("/content/drive/MyDrive/Colab Notebooks/merged_data_short.csv", index=False)


# Adding additional spatial features

## Create function to count number of facilities surrounding station within 200 m radius

In [None]:
# Work on a copy so the helper columns added below (and later in-place drops) do not
# silently modify `bike_stations` itself.
# NOTE(review): `bike_stations` is not defined in the visible notebook — confirm its source.
bike_stations_df = bike_stations.copy()

# Convert degrees to radians for latitude and longitude for vectorization
bike_stations_df['latitude_rad'] = np.radians(bike_stations_df['latitude'])
bike_stations_df['longitude_rad'] = np.radians(bike_stations_df['longitude'])

# The restaurant radian columns are intentionally not computed here: `restaurants_df` is
# only loaded in the Restaurants section further down, so touching it at this point raises
# NameError on a fresh kernel, and that later load replaces the frame anyway. The
# conversion has to happen after the CSV is read.

def vectorized_haversine(lon1, lat1, lon2, lat2):
    """Great-circle (haversine) distance in metres between two sets of points.

    All four arguments are angles in radians and may be scalars or NumPy arrays;
    they are combined with normal NumPy broadcasting.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point(s), in radians.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point(s), in radians.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in metres on a sphere of radius 6371 km.
    """
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for nearly
    # antipodal points), which would make sqrt/arcsin return NaN; clamp it first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371.0 * c
    return km * 1000  # Return in meters

def count_facilities_within_radius_vectorized(bike_stations, facilities, radius_m=200):
    """Count, for every bike station, the facilities located within ``radius_m`` metres.

    Parameters
    ----------
    bike_stations : pandas.DataFrame
        Must provide 'latitude_rad' and 'longitude_rad' columns (radians).
    facilities : pandas.DataFrame
        Must provide 'Latitude_rad' and 'Longitude_rad' columns (radians).
    radius_m : float, default 200
        Search radius in metres; a facility exactly at this distance is counted.

    Returns
    -------
    numpy.ndarray
        Integer array with one count per row of ``bike_stations``, in row order.
    """
    # Station coordinates do not change between iterations, so extract them once
    station_lon = bike_stations['longitude_rad'].values
    station_lat = bike_stations['latitude_rad'].values

    counts = np.zeros(len(bike_stations), dtype=int)
    # One pass per facility: distances from that facility to all stations at once
    for facility_lat, facility_lon in zip(facilities['Latitude_rad'], facilities['Longitude_rad']):
        distances = vectorized_haversine(station_lon, station_lat, facility_lon, facility_lat)
        counts += (distances <= radius_m)
    return counts


### Restaurants

In [None]:
# Load the New York restaurants data
restaurants_path = '/content/drive/MyDrive/Colab Notebooks/newyork_restaurants.csv'
restaurants_df = pd.read_csv(restaurants_path)

# Convert degrees to radians for latitude and longitude for vectorization.
# This must happen after the load: reading the CSV replaces any earlier frame, and the
# counting helper looks up the 'Latitude_rad' / 'Longitude_rad' columns.
restaurants_df['Latitude_rad'] = np.radians(restaurants_df['Latitude'])
restaurants_df['Longitude_rad'] = np.radians(restaurants_df['Longitude'])

# Count the number of restaurants within 200 meters of each bike station
bike_stations_df['restaurants_count'] = count_facilities_within_radius_vectorized(bike_stations_df, restaurants_df)

# Display the updated bike stations dataframe with the count of nearby restaurants
bike_stations_df.head()

### Rail stations

In [None]:
# Read the rail station locations
rail_stations_path = '/content/drive/MyDrive/Colab Notebooks/newyork_rail_stations.csv'
rail_stations_df = pd.read_csv(rail_stations_path)

# Add radian versions of the coordinates, as required by the counting helper
rail_stations_df = rail_stations_df.assign(
    Latitude_rad=np.deg2rad(rail_stations_df['Latitude']),
    Longitude_rad=np.deg2rad(rail_stations_df['Longitude']),
)

# Per-station number of rail stations found by the helper (default radius)
rail_counts = count_facilities_within_radius_vectorized(bike_stations_df, rail_stations_df)
bike_stations_df['rail_stations_count'] = rail_counts

### Universities

In [None]:
# Read the university locations
universities_path = '/content/drive/MyDrive/Colab Notebooks/newyork_universities.csv'
universities_df = pd.read_csv(universities_path)

# Add radian versions of the coordinates, as required by the counting helper
universities_df = universities_df.assign(
    Latitude_rad=np.deg2rad(universities_df['Latitude']),
    Longitude_rad=np.deg2rad(universities_df['Longitude']),
)

# Per-station number of universities found by the helper (default radius)
university_counts = count_facilities_within_radius_vectorized(bike_stations_df, universities_df)
bike_stations_df['universities_count'] = university_counts


### Bus stations

In [None]:
# Read the bus station locations
bus_stations_path = '/content/drive/MyDrive/Colab Notebooks/newyork_bus_stations.csv'
bus_stations_df = pd.read_csv(bus_stations_path)

# Add radian versions of the coordinates, as required by the counting helper
bus_stations_df = bus_stations_df.assign(
    Latitude_rad=np.deg2rad(bus_stations_df['Latitude']),
    Longitude_rad=np.deg2rad(bus_stations_df['Longitude']),
)

# Per-station number of bus stations found by the helper (default radius)
bus_counts = count_facilities_within_radius_vectorized(bike_stations_df, bus_stations_df)
bike_stations_df['bus_stations_count'] = bus_counts


### Businesses

In [None]:
# Read the business locations
businesses_path = '/content/drive/MyDrive/Colab Notebooks/newyork_businesses.csv'
businesses_df = pd.read_csv(businesses_path)

# Add radian versions of the coordinates, as required by the counting helper
businesses_df = businesses_df.assign(
    Latitude_rad=np.deg2rad(businesses_df['Latitude']),
    Longitude_rad=np.deg2rad(businesses_df['Longitude']),
)

# Per-station number of businesses found by the helper (default radius)
business_counts = count_facilities_within_radius_vectorized(bike_stations_df, businesses_df)
bike_stations_df['businesses_count'] = business_counts


### Metro stations

In [None]:
# Read the metro station locations
metro_stations_path = '/content/drive/MyDrive/Colab Notebooks/newyork_metro_stations.csv'
metro_stations_df = pd.read_csv(metro_stations_path)

# Add radian versions of the coordinates, as required by the counting helper
metro_stations_df = metro_stations_df.assign(
    Latitude_rad=np.deg2rad(metro_stations_df['Latitude']),
    Longitude_rad=np.deg2rad(metro_stations_df['Longitude']),
)

# Per-station number of metro stations found by the helper (default radius)
metro_counts = count_facilities_within_radius_vectorized(bike_stations_df, metro_stations_df)
bike_stations_df['metro_stations_count'] = metro_counts

### Parks

In [None]:
# Read the park locations
parks_path = '/content/drive/MyDrive/Colab Notebooks/newyork_parks.csv'
parks_df = pd.read_csv(parks_path)

# Add radian versions of the coordinates, as required by the counting helper
parks_df = parks_df.assign(
    Latitude_rad=np.deg2rad(parks_df['Latitude']),
    Longitude_rad=np.deg2rad(parks_df['Longitude']),
)

# Per-station number of parks found by the helper (default radius)
park_counts = count_facilities_within_radius_vectorized(bike_stations_df, parks_df)
bike_stations_df['parks_count'] = park_counts

### Merging

In [None]:
# Remove the coordinate helper columns in place before merging on station_id.
# NOTE(review): 'metro_stations_count' is computed in the Metro stations section and
# dropped here, so it never reaches the merged data — confirm this is intended.
bike_stations_df.drop(columns=['latitude', 'longitude', 'latitude_rad', 'longitude_rad', 'metro_stations_count'], inplace=True)

In [None]:
# Attach the remaining per-station columns of bike_stations_df to every row; the inner join
# drops rows whose station_id is missing from either table
merged_data_short = merged_data_short.merge(bike_stations_df, on=['station_id'], how='inner')

## Adding capacity

In [None]:
# Define the URL of the API
url = "https://gbfs.citibikenyc.com/gbfs/en/station_information.json"

# Fetch the data from the API. The timeout stops the cell hanging forever on a stalled
# connection, and raise_for_status() fails loudly on an HTTP error instead of trying to
# parse an error page as station data.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

# Keep only the fields needed from each station record
stations_info = [
    {
        "short_name": station["short_name"],
        "capacity": station["capacity"],
        "name": station["name"],
    }
    for station in data["data"]["stations"]
]

# Convert stations_info into a DataFrame
stations_df = pd.DataFrame(stations_info)

# Left join on the station name: 'station_name' in merged_data_short, 'name' in stations_df.
# Rows without a matching name keep NaN in the added columns.
merged_data_short = pd.merge(merged_data_short, stations_df, left_on='station_name', right_on='name', how='left')

# Drop the duplicate 'name' column from the merged DataFrame
merged_data_short = merged_data_short.drop(columns='name')

# Encoding station ID to integer

In [None]:
# Encode the 'station_id' values as integer labels: fit the encoder on the column,
# then transform that same column
station_id_values = merged_data_short['station_id']
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit(station_id_values).transform(station_id_values)

# Number of distinct stations seen by the encoder
num_stations = label_encoder.classes_.shape[0]


In [None]:

# Swap the raw 'station_id' column for its integer-encoded counterpart
merged_data_short = (
    merged_data_short
    .drop(columns=['station_id'])
    .assign(station_id_encoded=integer_encoded)
)

# Show the first rows together with the number of unique stations
(merged_data_short.head(), num_stations)

In [None]:
# Smallest and largest encoded station id present in the data
encoded_ids = merged_data_short['station_id_encoded']
station_id_min, station_id_max = encoded_ids.min(), encoded_ids.max()

(station_id_min, station_id_max)


(0, 263)

In [None]:
# Put 'station_id_encoded' first and keep every other column in its current order
column_order = ['station_id_encoded']
column_order.extend(col for col in merged_data_short.columns if col != 'station_id_encoded')
merged_data_short = merged_data_short[column_order]

merged_data_short.head()