In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import re

In [2]:
# Load the stacked ride records from CSV into XF (path is relative to the
# notebook's working directory).
XF = pd.read_csv("00_Stacked_Dataframe.csv")

In [4]:
# Sanity check: (rows, columns) of the loaded frame.
XF.shape

(6927165, 16)

In [14]:
# Load the station lookup table; later cells use its 'Station_ID' and 'Ward'
# columns.
Station_shp = pd.read_csv("./XX_Processed_Data/A0_WashingtonDC_StationIDs_01.csv")

In [15]:
# Parse both trip timestamps into datetime64 using one shared, explicit format.
timestamp_format = '%Y-%m-%d %H:%M:%S'
XF['started_at'] = pd.to_datetime(XF['started_at'], format=timestamp_format)
XF['ended_at'] = pd.to_datetime(XF['ended_at'], format=timestamp_format)

In [16]:
# Inspect column dtypes and memory usage after the datetime conversion.
XF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6927165 entries, 0 to 6927164
Data columns (total 16 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    int64         
 6   end_station_name    object        
 7   end_station_id      int64         
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
 13  Hour_(Starting)     int64         
 14  Year                int64         
 15  StationID_YearHour  object        
dtypes: datetime64[ns](2), float64(4), int64(4), object(6)
memory usage: 845.6+ MB


In [17]:
# Station IDs are identifiers, not quantities: store both ID columns as strings.
for id_column in ("start_station_id", "end_station_id"):
    XF[id_column] = XF[id_column].astype(str)

In [18]:
# Copy the start-station ID into 'Station_ID', the key column later used to
# merge XF with Station_shp.
XF["Station_ID"] = XF["start_station_id"]

### Adding Season Column

In [46]:
# Ensure both trip timestamps are datetime64 before using the .dt accessor.
# An earlier cell already performs this same conversion, so on a top-to-bottom
# run this leaves the columns unchanged.
for time_column in ('started_at', 'ended_at'):
    XF[time_column] = pd.to_datetime(XF[time_column], format='%Y-%m-%d %H:%M:%S')

In [47]:
# Label every trip with the season of its start month
# (Dec-Feb Winter, Mar-May Spring, Jun-Aug Summer, Sep-Nov Fall).
#
# One month -> season lookup replaces four mask-and-assign cells plus a
# catch-all fillna. The old "Winter" mask was (month >= 2) & (month == 12),
# which only ever matched December; January and February were labelled Winter
# solely because fillna("Winter") swept up every still-unlabelled row.
month_to_season = {
    12: "Winter", 1: "Winter", 2: "Winter",
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Fall", 10: "Fall", 11: "Fall",
}

# Every valid timestamp gets the same label as before. A row whose start time
# is missing (NaT) now stays NaN instead of being silently counted as Winter.
XF["Season"] = XF["started_at"].dt.month.map(month_to_season)

## Creating Day Type Column

In [52]:
# Classify a single date as falling on a weekday or on the weekend
def categorize_weekday_or_weekend(date):
    """Return 'Weekday' for Monday-Friday and 'Weekend' otherwise.

    `date` must provide a ``weekday()`` method where Monday is 0 and Sunday
    is 6 (e.g. ``datetime.date``, ``datetime.datetime``, ``pandas.Timestamp``).
    """
    # weekday() values 0-4 are Monday to Friday
    return 'Weekday' if date.weekday() < 5 else 'Weekend'

# Create the 'Day_Type' column from each trip's start time.
# The .dt accessor computes every weekday number in one vectorised pass, which
# avoids one Python-level call of categorize_weekday_or_weekend per row while
# producing the same labels: 0-4 (Monday-Friday) -> 'Weekday', else 'Weekend'.
# A missing start time compares False against 5 and is labelled 'Weekend',
# matching what the row-wise version returned.
started_on_weekday = XF['started_at'].dt.weekday < 5
XF['Day_Type'] = started_on_weekday.map({True: 'Weekday', False: 'Weekend'})

## Merging Two DataFrames

In [53]:
# Cast the station table's key to str so it has the same type as
# XF['Station_ID'] (also str) when the two frames are merged.
Station_shp['Station_ID'] = Station_shp['Station_ID'].astype(str)

In [54]:
# Attach the station attributes to every trip by joining on the shared
# 'Station_ID' column. This is an inner join (the pandas default), so trips
# whose Station_ID has no row in Station_shp are dropped.
merged_df = XF.merge(Station_shp, how='inner', on='Station_ID')

In [55]:
# List the merged columns; names present in both inputs get _x / _y suffixes.
merged_df.columns

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat_x', 'start_lng_x', 'end_lat', 'end_lng',
       'member_casual', 'Hour_(Starting)', 'Year', 'StationID_YearHour',
       'Station_ID', 'Season', 'Day_Type', 'start_station', 'start_lat_y',
       'start_lng_y', 'Neighb_Cls', 'Ward'],
      dtype='object')

### Mapping Start and End Station IDs to Wards

In [56]:
# Number of distinct ward values in the station table.
len(Station_shp["Ward"].unique())

8

In [57]:
# Build a Station_ID -> Ward lookup dictionary from the station table
# (if a Station_ID appears more than once, its last row wins).
station_ids = Station_shp['Station_ID']
station_wards = Station_shp['Ward']
mapping = {station_id: ward for station_id, ward in zip(station_ids, station_wards)}

In [58]:
# Create the 'Start Ward' column: the ward of each trip's start station.
# Series.map performs the dictionary lookup in one vectorised pass instead of
# calling a Python lambda per row. Station IDs missing from `mapping` come back
# as NaN and are then filled with the raw station ID, which is what the
# previous mapping.get(x, x) returned for them.
# NOTE(review): this assumes no station in `mapping` has a missing Ward value;
# such a station would now show its raw ID rather than NaN.
start_station_ids = merged_df['start_station_id']
merged_df['Start Ward'] = start_station_ids.map(mapping).fillna(start_station_ids)

In [59]:
# Create the 'End Ward' column: the ward of each trip's end station.
# Series.map performs the dictionary lookup in one vectorised pass instead of
# calling a Python lambda per row. End stations missing from `mapping` come back
# as NaN and are then filled with the raw station ID, which is what the
# previous mapping.get(x, x) returned for them.
# NOTE(review): this assumes no station in `mapping` has a missing Ward value;
# such a station would now show its raw ID rather than NaN.
end_station_ids = merged_df['end_station_id']
merged_df['End Ward'] = end_station_ids.map(mapping).fillna(end_station_ids)

In [60]:
# (rows, columns) after the merge and the two ward columns were added.
merged_df.shape

(6128884, 26)

In [61]:
# Frequency of each 'End Ward' value; entries that are not ward labels are raw
# end-station IDs that had no match in `mapping`.
merged_df["End Ward"].value_counts()

Ward 2    2807477
Ward 6    1612624
Ward 1     763532
Ward 5     380025
Ward 8     136576
           ...   
32251           1
32090           1
32205           1
32241           1
32268           1
Name: End Ward, Length: 338, dtype: int64

In [62]:
# Number of distinct values in 'Start Ward'.
len(merged_df["Start Ward"].unique())

8

In [63]:
# Number of distinct values in 'End Ward' (ward labels plus any unmapped raw
# end-station IDs).
len(merged_df["End Ward"].unique())

338

In [64]:
# Distinct ward labels in order of first appearance in the station table; this
# order defines the rows and columns of the trip matrices built from it.
Ward = Station_shp['Ward'].unique().tolist()
Ward

['Ward 6',
 'Ward 2',
 'Ward 1',
 'Ward 5',
 'Ward 8',
 'Ward 4',
 'Ward 3',
 'Ward 7']

In [65]:
# Keep only trips whose 'End Ward' value starts with "Ward", i.e. whose end
# station was matched to a ward; rows still holding a raw station ID are dropped.
ends_in_known_ward = merged_df['End Ward'].str.startswith('Ward')
filtered_df = merged_df.loc[ends_in_known_ward]

### Based on All Seasons

In [69]:
# Count trips for every (start ward, end ward) pair across all seasons.
ward_pair_groups = filtered_df.groupby(['Start Ward', 'End Ward'])
trips = ward_pair_groups.size()
trips

Start Ward  End Ward
Ward 1      Ward 1      292969
            Ward 2      418332
            Ward 3       27786
            Ward 4       31609
            Ward 5       44195
                         ...  
Ward 8      Ward 4         233
            Ward 5        4232
            Ward 6       63462
            Ward 7        4268
            Ward 8       30143
Length: 64, dtype: int64

In [70]:
# Distinct ward labels in order of first appearance (same list an earlier cell
# builds); it fixes the row/column order of the trip matrix.
Ward = list(Station_shp['Ward'].drop_duplicates())

In [40]:
# Number of wards, i.e. the side length of the square trip matrix.
len(Ward)

8

In [41]:
# Build the ward-to-ward trip matrix: one row per start ward and one column per
# end ward, both in the order of `Ward`. Pairs with no trips count as 0.
matrix = []
for start_ward in Ward:
    trips_from_start = trips.get(start_ward, {})
    matrix.append([trips_from_start.get(end_ward, 0) for end_ward in Ward])

In [42]:
# Show the all-seasons matrix (rows = start ward, columns = end ward, in `Ward` order).
matrix

[[876686, 405610, 47692, 115776, 75286, 3299, 2320, 41197],
 [430729, 1786418, 312402, 86895, 17437, 16043, 29242, 7777],
 [65860, 418332, 292969, 44195, 2511, 31609, 27786, 1311],
 [125335, 96531, 40753, 112529, 5535, 8871, 1008, 5524],
 [63462, 13923, 1603, 4232, 30143, 233, 99, 4268],
 [5646, 26026, 34484, 9655, 244, 33503, 2411, 136],
 [4231, 52335, 32770, 1660, 266, 2236, 59624, 99],
 [40675, 8302, 859, 5083, 5154, 133, 54, 20314]]

### Based on Separate Seasons

In [78]:
# Fall trips, counted per (start ward, end ward) pair.
# A boolean mask selects the season directly. The previous
# groupby(["Season"]).get_group("Fall") grouped the whole frame just to pull
# out one group, and looked it up by a scalar against a length-1 list key,
# which recent pandas versions deprecate in favour of a tuple key.
DF = filtered_df[filtered_df["Season"] == "Fall"]
trips = DF.groupby(['Start Ward', 'End Ward']).size()
trips

Start Ward  End Ward
Ward 1      Ward 1      102382
            Ward 2      137049
            Ward 3        9554
            Ward 4       11288
            Ward 5       15017
                         ...  
Ward 8      Ward 4         101
            Ward 5        1531
            Ward 6       23823
            Ward 7        1656
            Ward 8       11498
Length: 64, dtype: int64

In [79]:
# Fall ward-to-ward matrix: rows are start wards, columns are end wards, both in
# `Ward` order; pairs with no trips count as 0.
# NOTE(review): the rows are wrapped in one extra outer list, unlike the
# all-seasons matrix which is a plain list of rows -- confirm this is intended.
season_rows = []
for start_ward in Ward:
    trips_from_start = trips.get(start_ward, {})
    season_rows.append([trips_from_start.get(end_ward, 0) for end_ward in Ward])
matrix = [season_rows]
matrix

[[[222461, 101565, 12389, 28721, 18237, 802, 553, 10807],
  [108546, 452644, 79722, 21161, 4253, 3861, 7451, 2206],
  [16915, 105436, 69711, 10821, 583, 7621, 6820, 365],
  [31261, 23269, 10040, 26866, 1358, 2143, 245, 1414],
  [14886, 3222, 357, 988, 7189, 32, 25, 1083],
  [1420, 6548, 8722, 2309, 56, 8133, 618, 34],
  [1083, 14470, 8585, 453, 63, 592, 16728, 20],
  [10575, 2335, 240, 1352, 1344, 19, 9, 4648]]]

In [81]:
# Summer trips, counted per (start ward, end ward) pair.
# A boolean mask selects the season directly. The previous
# groupby(["Season"]).get_group("Summer") grouped the whole frame just to pull
# out one group, and looked it up by a scalar against a length-1 list key,
# which recent pandas versions deprecate in favour of a tuple key.
DF = filtered_df[filtered_df["Season"] == "Summer"]
trips = DF.groupby(['Start Ward', 'End Ward']).size()
trips

Start Ward  End Ward
Ward 1      Ward 1      102382
            Ward 2      137049
            Ward 3        9554
            Ward 4       11288
            Ward 5       15017
                         ...  
Ward 8      Ward 4         101
            Ward 5        1531
            Ward 6       23823
            Ward 7        1656
            Ward 8       11498
Length: 64, dtype: int64

In [82]:
# Summer ward-to-ward matrix: rows are start wards, columns are end wards, both
# in `Ward` order; pairs with no trips count as 0.
# NOTE(review): the rows are wrapped in one extra outer list, unlike the
# all-seasons matrix which is a plain list of rows -- confirm this is intended.
season_rows = []
for start_ward in Ward:
    trips_from_start = trips.get(start_ward, {})
    season_rows.append([trips_from_start.get(end_ward, 0) for end_ward in Ward])
matrix = [season_rows]
matrix

[[[297472, 140201, 16341, 38133, 28052, 1212, 855, 14051],
  [149140, 614838, 102854, 28865, 6499, 5540, 10551, 2488],
  [23037, 137049, 102382, 15017, 994, 11288, 9554, 430],
  [41292, 31641, 13430, 38509, 1974, 3050, 355, 1866],
  [23823, 5182, 637, 1531, 11498, 101, 45, 1656],
  [2040, 9028, 12214, 3372, 94, 12528, 818, 55],
  [1589, 18100, 10979, 506, 102, 799, 20500, 33],
  [13902, 2494, 290, 1621, 1904, 54, 16, 7746]]]

In [83]:
# Winter trips, counted per (start ward, end ward) pair.
# A boolean mask selects the season directly. The previous
# groupby(["Season"]).get_group("Winter") grouped the whole frame just to pull
# out one group, and looked it up by a scalar against a length-1 list key,
# which recent pandas versions deprecate in favour of a tuple key.
DF = filtered_df[filtered_df["Season"] == "Winter"]
trips = DF.groupby(['Start Ward', 'End Ward']).size()
trips

Start Ward  End Ward
Ward 1      Ward 1      47738
            Ward 2      68184
            Ward 3       4262
            Ward 4       4525
            Ward 5       6832
                        ...  
Ward 8      Ward 4         28
            Ward 5        564
            Ward 6       8001
            Ward 7        415
            Ward 8       3820
Length: 64, dtype: int64

In [84]:
# Winter ward-to-ward matrix: rows are start wards, columns are end wards, both
# in `Ward` order; pairs with no trips count as 0.
# NOTE(review): the rows are wrapped in one extra outer list, unlike the
# all-seasons matrix which is a plain list of rows -- confirm this is intended.
season_rows = []
for start_ward in Ward:
    trips_from_start = trips.get(start_ward, {})
    season_rows.append([trips_from_start.get(end_ward, 0) for end_ward in Ward])
matrix = [season_rows]
matrix

[[[129223, 51584, 6508, 18105, 9315, 394, 305, 6109],
  [54170, 241030, 47980, 13301, 1684, 2025, 3608, 1024],
  [9078, 68184, 47738, 6832, 230, 4525, 4262, 185],
  [20037, 15168, 6466, 17079, 743, 1301, 117, 809],
  [8001, 1446, 151, 564, 3820, 28, 11, 415],
  [728, 3452, 4868, 1446, 28, 4640, 322, 17],
  [484, 7070, 5168, 263, 29, 271, 8739, 24],
  [5981, 1201, 111, 754, 546, 13, 8, 2695]]]

In [87]:
# Spring trips, counted per (start ward, end ward) pair.
# A boolean mask selects the season directly. The previous
# groupby(["Season"]).get_group("Spring") grouped the whole frame just to pull
# out one group, and looked it up by a scalar against a length-1 list key,
# which recent pandas versions deprecate in favour of a tuple key.
DF = filtered_df[filtered_df["Season"] == "Spring"]
trips = DF.groupby(['Start Ward', 'End Ward']).size()
trips

Start Ward  End Ward
Ward 1      Ward 1       73138
            Ward 2      107663
            Ward 3        7150
            Ward 4        8175
            Ward 5       11525
                         ...  
Ward 8      Ward 4          72
            Ward 5        1149
            Ward 6       16752
            Ward 7        1114
            Ward 8        7636
Length: 64, dtype: int64

In [88]:
# Spring ward-to-ward matrix: rows are start wards, columns are end wards, both
# in `Ward` order; pairs with no trips count as 0.
# NOTE(review): the rows are wrapped in one extra outer list, unlike the
# all-seasons matrix which is a plain list of rows -- confirm this is intended.
season_rows = []
for start_ward in Ward:
    trips_from_start = trips.get(start_ward, {})
    season_rows.append([trips_from_start.get(end_ward, 0) for end_ward in Ward])
matrix = [season_rows]
matrix

[[[227530, 112260, 12454, 30817, 19682, 891, 607, 10230],
  [118873, 477906, 81846, 23568, 5001, 4617, 7632, 2059],
  [16830, 107663, 73138, 11525, 704, 8175, 7150, 331],
  [32745, 26453, 10817, 30075, 1460, 2377, 291, 1435],
  [16752, 4073, 458, 1149, 7636, 72, 18, 1114],
  [1458, 6998, 8680, 2528, 66, 8202, 653, 30],
  [1075, 12695, 8038, 438, 72, 574, 13657, 22],
  [10217, 2272, 218, 1356, 1360, 47, 21, 5225]]]