In [3]:
# Import Python Libraries
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [4]:
path = r"C:\Users\okumb\Downloads\CitiBike-New-York\.venv\Scripts\NewYork_data.csv"
df = pd.read_csv(path, index_col = 0)

In [5]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,DATE,TAVG,_merge
0,CA5837152804D4B5,electric_bike,2022-01-26 18:50:39,2022-01-26 18:51:53,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-26,-2.3,both
1,BA06A5E45B6601D2,classic_bike,2022-01-28 13:14:07,2022-01-28 13:20:23,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-28,0.1,both
2,7B6827D7B9508D93,classic_bike,2022-01-10 19:55:13,2022-01-10 20:00:37,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-10,1.6,both
3,6E5864EA6FCEC90D,electric_bike,2022-01-26 07:54:57,2022-01-26 07:55:22,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-26,-2.3,both
4,E24954255BBDE32D,electric_bike,2022-01-13 18:44:46,2022-01-13 18:45:43,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-13,4.0,both


In [6]:
# Drop the column['_merge']
df = df.drop(columns= ['_merge'])

In [7]:
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
DATE                   object
TAVG                  float64
dtype: object

In [8]:
df['values'] = 1

In [9]:
# Create a value column and group by start and end station 
df_group = df.groupby(['start_station_name', 'end_station_name'])['values'].count().reset_index()

In [10]:
df_group

Unnamed: 0,start_station_name,end_station_name,values
0,11 St & Washington St,11 St & Washington St,1132
1,11 St & Washington St,12 Ave & W 40 St,1
2,11 St & Washington St,12 St & Sinatra Dr N,253
3,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395
4,11 St & Washington St,4 St & Grand St,350
...,...,...,...
6948,York St & Marin Blvd,Van Vorst Park,18
6949,York St & Marin Blvd,Warren St,42
6950,York St & Marin Blvd,Washington St,16
6951,York St & Marin Blvd,Willow Ave & 12 St,1


In [11]:
df_group['values'].sort_index(ascending=False).value_counts()

values
1       676
2       281
3       194
4       187
5       153
       ... 
842       1
1077      1
1213      1
1247      1
1391      1
Name: count, Length: 800, dtype: int64

In [12]:
df_group['values'].describe()

count    6953.000000
mean      128.330361
std       304.091182
min         1.000000
25%         7.000000
50%        28.000000
75%       118.000000
max      5565.000000
Name: values, dtype: float64

In [13]:
df_group.to_csv('df_groupby_final.csv')

In [14]:
df_group.head()

Unnamed: 0,start_station_name,end_station_name,values
0,11 St & Washington St,11 St & Washington St,1132
1,11 St & Washington St,12 Ave & W 40 St,1
2,11 St & Washington St,12 St & Sinatra Dr N,253
3,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395
4,11 St & Washington St,4 St & Grand St,350


In [15]:
df_group.rename(columns = {'values': 'trips'}, inplace = True)

In [16]:
df_group

Unnamed: 0,start_station_name,end_station_name,trips
0,11 St & Washington St,11 St & Washington St,1132
1,11 St & Washington St,12 Ave & W 40 St,1
2,11 St & Washington St,12 St & Sinatra Dr N,253
3,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395
4,11 St & Washington St,4 St & Grand St,350
...,...,...,...
6948,York St & Marin Blvd,Van Vorst Park,18
6949,York St & Marin Blvd,Warren St,42
6950,York St & Marin Blvd,Washington St,16
6951,York St & Marin Blvd,Willow Ave & 12 St,1


# Preparing final Dataframe

In [17]:
df_final = pd.merge(df_group, df, 
                     on=['start_station_name', 'end_station_name'], 
                     how='outer', 
                     indicator='_merge')




In [18]:
df_final.head()

Unnamed: 0,start_station_name,end_station_name,trips,ride_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,DATE,TAVG,values,_merge
0,11 St & Washington St,11 St & Washington St,1132.0,6470EBA34F76D73E,electric_bike,2022-01-15 07:50:19,2022-01-15 08:01:34,HB502,HB502,40.749985,-74.02715,40.749985,-74.02715,casual,2022-01-15,-6.7,1,both
1,11 St & Washington St,11 St & Washington St,1132.0,FB7ED15F7B4250E2,classic_bike,2022-01-02 20:13:24,2022-01-02 20:20:52,HB502,HB502,40.749985,-74.02715,40.749985,-74.02715,member,2022-01-02,11.4,1,both
2,11 St & Washington St,11 St & Washington St,1132.0,93A4268148B62A22,classic_bike,2022-01-14 12:45:38,2022-01-14 12:45:50,HB502,HB502,40.749985,-74.02715,40.749985,-74.02715,member,2022-01-14,4.8,1,both
3,11 St & Washington St,11 St & Washington St,1132.0,356DB0611A5395A5,classic_bike,2022-01-02 09:08:10,2022-01-02 09:19:58,HB502,HB502,40.749985,-74.02715,40.749985,-74.02715,casual,2022-01-02,11.4,1,both
4,11 St & Washington St,11 St & Washington St,1132.0,8864E3B821B88A86,classic_bike,2022-01-06 18:07:23,2022-01-06 18:08:39,HB502,HB502,40.749985,-74.02715,40.749985,-74.02715,member,2022-01-06,4.9,1,both


In [19]:
df_final = df_final[df_final['_merge'] == 'both']

In [20]:
df_final

Unnamed: 0,start_station_name,end_station_name,trips,ride_id,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,DATE,TAVG,values,_merge
0,11 St & Washington St,11 St & Washington St,1132.0,6470EBA34F76D73E,electric_bike,2022-01-15 07:50:19,2022-01-15 08:01:34,HB502,HB502,40.749985,-74.027150,40.749985,-74.027150,casual,2022-01-15,-6.7,1,both
1,11 St & Washington St,11 St & Washington St,1132.0,FB7ED15F7B4250E2,classic_bike,2022-01-02 20:13:24,2022-01-02 20:20:52,HB502,HB502,40.749985,-74.027150,40.749985,-74.027150,member,2022-01-02,11.4,1,both
2,11 St & Washington St,11 St & Washington St,1132.0,93A4268148B62A22,classic_bike,2022-01-14 12:45:38,2022-01-14 12:45:50,HB502,HB502,40.749985,-74.027150,40.749985,-74.027150,member,2022-01-14,4.8,1,both
3,11 St & Washington St,11 St & Washington St,1132.0,356DB0611A5395A5,classic_bike,2022-01-02 09:08:10,2022-01-02 09:19:58,HB502,HB502,40.749985,-74.027150,40.749985,-74.027150,casual,2022-01-02,11.4,1,both
4,11 St & Washington St,11 St & Washington St,1132.0,8864E3B821B88A86,classic_bike,2022-01-06 18:07:23,2022-01-06 18:08:39,HB502,HB502,40.749985,-74.027150,40.749985,-74.027150,member,2022-01-06,4.9,1,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895467,York St & Marin Blvd,York St & Marin Blvd,47.0,4CC3080CAC124169,classic_bike,2022-12-07 18:12:17,2022-12-07 18:12:22,JC097,JC097,40.716615,-74.042412,40.716615,-74.042412,member,2022-12-07,13.8,1,both
895468,York St & Marin Blvd,York St & Marin Blvd,47.0,DD1739970EFF4F5B,classic_bike,2022-12-01 12:06:09,2022-12-01 12:12:26,JC097,JC097,40.716615,-74.042412,40.716615,-74.042412,casual,2022-12-01,5.7,1,both
895469,York St & Marin Blvd,York St & Marin Blvd,47.0,118E972FA0234336,classic_bike,2022-12-20 12:35:08,2022-12-20 12:40:41,JC097,JC097,40.716615,-74.042412,40.716615,-74.042412,casual,2022-12-20,1.9,1,both
895470,York St & Marin Blvd,York St & Marin Blvd,47.0,56D61E68DC4F71E5,classic_bike,2022-12-22 11:09:49,2022-12-22 11:11:27,JC097,JC097,40.716615,-74.042412,40.716615,-74.042412,member,2022-12-22,3.8,1,both


# Create KeplerGl instance

## splitting the dataframe because of Memory size limit of github

In [21]:
np.random.seed(32)
red = np.random.rand(len(df_final)) <= 0.92

In [22]:
small = df_final[~red]

In [23]:
small.shape

(71134, 18)

In [24]:
# Create KeplerGl instance

m = KeplerGl(height = 700, data={"data_1": small})
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':            start_station_name       end_station_name   trips  \
3       11 St & Washi…

# Configuring the Kepler.gl Map in My Project

In my project, I used Kepler.gl to visualize geographical data with distinct starting and ending points connected by arcs. Below are the detailed steps I took to configure the map:

## Data Preparation
Firstly, I ensured that my data was correctly formatted and linked. The dataset included essential columns such as `start_lat`, `start_lon`, `end_lat`, and `end_lon`.

## Setting Start and End Points
- **Start Point Configuration**: 
  - I added the `start_lat` and `start_lon` column names to their respective fields in the **Start Point** tab.
- **End Point Configuration**: 
  - Similarly, I inserted the `end_lat` and `end_lon` column names in the **End Point** tab.

## Color Adjustments
To enhance visual distinction between different points:
- I selected distinct colors for the start and end points within the Kepler.gl point settings, ensuring they were easily distinguishable from one another.

## Arc Configuration
- To depict the routes between start and end points, I added an arc layer:
  - I navigated to the **Arc** layer in the layer selection menu.
  - Under the **Basic** menu, I chose Line option in the **Start -> End Arc** tab.
  - I selected a color that was both visually striking and distinct from the point colors to represent the arcs.

These configurations helped in effectively visualizing the data, making it easy to identify and analyze the patterns of movement between locations in the map.



## Filtering

# Analyzing Common Trips in New York City Using Kepler.gl

In my analysis of trip data in New York City using Kepler.gl, I focused on identifying the most common trips and observing busy zones. Here’s how I approached the analysis and what I discovered:

## Adding a Filter to Identify Common Trips
To find the most common trips, I added a filter to the map to count the number of trips between the same start and end points. This allowed me to visualize and quantify the most frequent routes.

## Observations from the Filtered Data
### Most Common Trips
1. **South Waterfront Walkway - Sinatra Dr & 1 St**
   - **Trip Count**: 5439 trips
   - **Rideable Type**: Classic Bike
   - **Route**: The route starting and ending at South Waterfront Walkway to Sinatra Dr & 1 St is highly popular, suggesting a busy local loop that is preferred for short, convenient trips within the waterfront area.

2. **Hoboken Terminal - Hudson St & Hudson Pl to Hoboken Ave at Monmouth St**
   - **Trip Count**: 5565 trips
   - **Rideable Type**: Classic Bike
   - **Route**: This route connects major transit hubs and residential areas, indicating its popularity among commuters and residents in Hoboken, likely due to its convenience and accessibility.

### Additional Observations
- **Busy Zones**: The zones around Hoboken Terminal and South Waterfront are particularly busy. These areas are key transit points and popular recreational spots, leading to higher traffic volumes.
- **Implications**: The high frequency of trips in these areas can be attributed to the well-established cycling infrastructure and the proximity to major residential and commercial hubs. This suggests that enhancing bike-related facilities could further promote cycling as a convenient mode of transport.

## Conclusion
The filtered analysis on Kepler.gl not only helped identify the most common trips but also provided insights into urban mobility patterns in New York City. Such data can be instrumental in urban planning and improving public transportation networks.


# Creating a config object

In [25]:
config = m.config

In [26]:
config

{}

In [29]:
import json
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

In [31]:
m.save_to_html(file_name = 'New_York_CitiBikeTrips_Aggregated.html', read_only = False, config = config)

Map saved to New_York_CitiBikeTrips_Aggregated.html!
