In [2]:
import h5py
import numpy as np
import pandas as pd

In [6]:
# Latitude/longitude bounding box (decimal degrees) that DataHandler treats as
# "Manhattan": trips are kept only when both endpoints fall inside it, and the
# traffic grid is laid out over exactly this rectangle.
manhattan_dimensions = dict(
    min_latitude=40.69331716,
    max_latitude=40.76915505,
    min_longitude=-74.01713445,
    max_longitude=-73.95381995,
)

# Resolution of the traffic grid: rows split the latitude range, columns split
# the longitude range.
grid_rows, grid_cols = 16, 8


class DataHandler():
    """Convert bike-share trip records into gridded inflow/outflow matrices.

    On construction the trip CSV is loaded and restricted to trips that both
    start and end inside the module-level ``manhattan_dimensions`` bounding
    box, and the list of time windows to aggregate over is precomputed.

    Attributes:
        df: Filtered trips with ``starttime``/``stoptime`` parsed as datetimes.
        data_intervals: List of ``[window_start, window_end]`` timestamp pairs.
    """

    def __init__(self, datapath, starttime, endtime):
        """Load the trips at ``datapath`` and build the windows between ``starttime`` and ``endtime``."""
        self.df = self.load_manhattan_data(datapath)
        self.data_intervals = self.return_timestamp_intervals(pd.to_datetime(starttime), pd.to_datetime(endtime))

    def is_in_manhattan(self, latitude, longitude):
        """Return True when a single point lies inside the bounding box (edges inclusive)."""
        return (manhattan_dimensions['min_latitude'] <= latitude <= manhattan_dimensions['max_latitude'] and
                manhattan_dimensions['min_longitude'] <= longitude <= manhattan_dimensions['max_longitude'])

    def _in_manhattan_mask(self, latitudes, longitudes):
        """Vectorised counterpart of ``is_in_manhattan`` for whole coordinate columns."""
        return ((latitudes >= manhattan_dimensions['min_latitude']) &
                (latitudes <= manhattan_dimensions['max_latitude']) &
                (longitudes >= manhattan_dimensions['min_longitude']) &
                (longitudes <= manhattan_dimensions['max_longitude']))

    def load_manhattan_data(self, datapath):
        """Read the trip CSV and keep only trips that start and end inside the bounding box.

        Args:
            datapath: Path (or file-like object) accepted by ``pd.read_csv``.

        Returns:
            A new DataFrame of the in-bounds trips whose ``starttime`` and
            ``stoptime`` columns are parsed as datetimes; unparseable values
            become ``NaT`` (``errors='coerce'``).
        """
        df = pd.read_csv(datapath)

        # Column-wise comparison instead of a row-wise ``apply``: same
        # inclusive bounds, far faster, and it yields a proper boolean mask
        # even when the file has no rows.
        manhattan_filter = (
            self._in_manhattan_mask(df['start station latitude'], df['start station longitude']) &
            self._in_manhattan_mask(df['end station latitude'], df['end station longitude'])
        )

        # Copy so the assignments below act on an independent frame (no
        # SettingWithCopyWarning), and assign the columns directly so they
        # really take the datetime dtype instead of being written into the
        # existing object-dtype column.
        df = df[manhattan_filter].copy()
        df['starttime'] = pd.to_datetime(df['starttime'], errors='coerce')
        df['stoptime'] = pd.to_datetime(df['stoptime'], errors='coerce')

        return df

    def return_timestamp_intervals(self, start_time, end_time, freq='30min', window=pd.Timedelta(hours=1)):
        """Build the aggregation windows between ``start_time`` and ``end_time``.

        A window starts every ``freq`` from ``start_time`` up to and including
        ``end_time`` and lasts ``window``.

        NOTE(review): with the defaults (1-hour windows every 30 minutes)
        consecutive windows overlap by half, and the inflow/outflow filters
        include both window ends. Confirm this sliding-window behaviour is
        intended; pass ``window=pd.Timedelta(minutes=30)`` for disjoint slots.

        Args:
            start_time: First window start.
            end_time: Latest allowed window start.
            freq: Spacing between window starts (pandas offset alias).
            window: Window length (anything ``pd.Timedelta`` accepts).

        Returns:
            List of ``[window_start, window_end]`` Timestamp pairs.
        """
        window = pd.Timedelta(window)
        # '30min' replaces the deprecated '30T' alias; the spacing is the same.
        return [
            [start_date, start_date + window]
            for start_date in pd.date_range(start=start_time, end=end_time, freq=freq)
        ]

    def create_matrix(self, station_counts):
        """Accumulate per-station counts into a ``grid_rows`` x ``grid_cols`` grid.

        Args:
            station_counts: DataFrame with ``station latitude``,
                ``station longitude`` and ``count`` columns.

        Returns:
            Float array of shape ``(grid_rows, grid_cols)``. Row 0 is the
            northernmost latitude band (the grid is flipped vertically before
            returning); column 0 is the westernmost longitude band.
        """
        min_lat = manhattan_dimensions['min_latitude']
        max_lat = manhattan_dimensions['max_latitude']
        min_lon = manhattan_dimensions['min_longitude']
        max_lon = manhattan_dimensions['max_longitude']

        # Size of one grid cell in degrees.
        lat_step = (max_lat - min_lat) / grid_rows
        lon_step = (max_lon - min_lon) / grid_cols

        grid = np.zeros((grid_rows, grid_cols))

        lats = station_counts['station latitude'].to_numpy(dtype=float)
        lons = station_counts['station longitude'].to_numpy(dtype=float)
        traffic = station_counts['count'].to_numpy(dtype=float)

        # Stations outside the bounding box (or with NaN coordinates) are
        # ignored; the bounds are inclusive to match ``is_in_manhattan``.
        inside = (lats >= min_lat) & (lats <= max_lat) & (lons >= min_lon) & (lons <= max_lon)

        # Compute each station's cell directly instead of scanning every cell.
        # Clipping places stations lying exactly on the max edge in the last
        # row/column rather than dropping them.
        rows = np.clip(np.floor((lats[inside] - min_lat) / lat_step).astype(int), 0, grid_rows - 1)
        cols = np.clip(np.floor((lons[inside] - min_lon) / lon_step).astype(int), 0, grid_cols - 1)

        # ``np.add.at`` accumulates correctly when several stations share a cell.
        np.add.at(grid, (rows, cols), traffic[inside])

        # Flip vertically so the bottom row of the grid is the southern edge.
        return grid[::-1]

    def _generate_flow_matrix(self, time_column, station_prefix, start_interval, stop_interval):
        """Count trips per station within a time window and grid the counts.

        Args:
            time_column: ``'starttime'`` or ``'stoptime'`` - the timestamp tested
                against the window.
            station_prefix: ``'start'`` or ``'end'`` - which station columns to count.
            start_interval: Window start (inclusive).
            stop_interval: Window end (inclusive).

        Returns:
            The grid produced by ``create_matrix`` for the per-station counts.
        """
        window_start = pd.to_datetime(start_interval)
        window_stop = pd.to_datetime(stop_interval)

        timestamps = self.df[time_column]
        in_window = (timestamps >= window_start) & (timestamps <= window_stop)

        # Select just the station columns that are needed (renamed to the
        # generic names ``create_matrix`` expects) rather than dropping a
        # hard-coded list of unrelated columns that may not exist.
        column_map = {
            f"{station_prefix} station {field}": f"station {field}"
            for field in ("id", "name", "latitude", "longitude")
        }
        stations = self.df.loc[in_window, list(column_map)].rename(columns=column_map)

        station_counts = stations.groupby(list(column_map.values())).size().reset_index(name='count')
        return self.create_matrix(station_counts)

    def generate_inflow_matrix(self, start_interval, stop_interval):
        """Grid of trips *ending* (by ``stoptime``) within the window, counted at their end station."""
        return self._generate_flow_matrix('stoptime', 'end', start_interval, stop_interval)

    def generate_outflow_matrix(self, start_interval, stop_interval):
        """Grid of trips *starting* (by ``starttime``) within the window, counted at their start station."""
        return self._generate_flow_matrix('starttime', 'start', start_interval, stop_interval)

    def generate_dataset(self, filepath="data/processed_data.h5"):
        """Build the matrices for every window and write them to an HDF5 file.

        Two datasets are written:
            ``timeslot``: the end of each window as an ISO-8601 byte string,
                e.g. ``b'2019-11-01T01:00:00'``.
            ``trip``: one ``(outflow, inflow)`` pair of grids per window, in
                that order.

        Args:
            filepath: Destination file; its parent directory is created if
                missing and an existing file is overwritten.
        """
        import os  # local import: only needed for creating the output directory

        traffic_data = []
        dates = []

        for start_interval, stop_interval in self.data_intervals:
            inflow_matrix = self.generate_inflow_matrix(start_interval, stop_interval)
            outflow_matrix = self.generate_outflow_matrix(start_interval, stop_interval)

            traffic_data.append((outflow_matrix, inflow_matrix))
            # ``strftime('ISO8601')`` has no % directives and would store the
            # literal text "ISO8601" for every slot; ``isoformat`` emits the
            # actual ISO-8601 timestamp.
            dates.append(stop_interval.isoformat())

        output_dir = os.path.dirname(filepath)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        with h5py.File(filepath, 'w') as hf:
            # Store as fixed-width byte strings: h5py cannot store NumPy
            # unicode ('<U') arrays, which is what a list of ``str`` becomes.
            hf.create_dataset('timeslot', data=np.array(dates, dtype='S'))
            hf.create_dataset('trip', data=np.array(traffic_data))


In [9]:
# Load the November 2019 trip file and aggregate it over the whole month.
data_handler_object = DataHandler(
    datapath="raw_traffic_data/201911-citibike-tripdata.csv",
    starttime="2019-11-01 00:00:00",
    endtime="2019-11-30 23:59:59",
)

# Write the per-window outflow/inflow matrices to HDF5.
data_handler_object.generate_dataset(filepath="data/201911_dataset.h5")