In [2]:
import numpy as np
import pandas as pd
import sys
import os
import re
import time

In [68]:
class timeSpending():
    """Derive stop-to-stop running times for bus routes from ETA snapshot files.

    The stop file (CSV) is read for the columns ``RouteName``, ``Direction``,
    ``StopUID``, ``StopName`` and ``StopSequence``.  Each ETA snapshot (CSV) is
    read for ``RouteName``, ``Direction``, ``StopUID`` and ``EstimateTime(Sec)``.

    Attributes:
        stop_file: path of the stop CSV.
        output_folder: folder the per-route/direction result CSVs are written to.
        df_list: dict mapping ``'<route>-<direction>'`` to the result DataFrame
            built by ``runtime_generator``.
    """

    # Applied to the time part of a snapshot file name (e.g. "0750.csv").
    # Snapshots whose hour is 01..05 are skipped.
    _NIGHT_HOURS = re.compile('0[1-5]..')
    # The time part must have a digit in its fourth position (XX:X0 ~ XX:X9).
    _TIME_SHAPE = re.compile('...[0-9]')

    def __init__(self, stop_file, output_folder):
        self.stop_file = stop_file
        self.output_folder = output_folder
        self.df_list = {}
        self._stops_cache = None

    def _load_stops(self):
        """Return the full stop table, reading ``stop_file`` only once per path."""
        cached = getattr(self, '_stops_cache', None)
        if cached is None or cached[0] != self.stop_file:
            cached = (self.stop_file, pd.read_csv(self.stop_file))
            self._stops_cache = cached
        return cached[1]

    def _snapshot_files(self, route, directory):
        """Return the sorted ETA snapshot file names in ``directory`` for ``route``.

        File names look like ``"672-eta_2020-6-26_1435.csv"``.  The route token
        must not be preceded by another letter or digit, so route ``72`` does
        not pick up the files of route ``672``.  Names without a time part are
        skipped instead of raising ``IndexError``.
        """
        route_token = re.compile(
            r'(?<![0-9A-Za-z]){}-eta'.format(re.escape(str(route))))
        selected = []
        for filename in os.listdir(directory):
            if not route_token.search(filename):
                continue
            parts = filename.split('_')
            if len(parts) < 3:
                continue
            time_part = parts[2]
            if self._NIGHT_HOURS.match(time_part):
                continue
            if self._TIME_SHAPE.match(time_part):
                selected.append(filename)
        return sorted(selected)

    def route_list(self):
        """Return the unique ``RouteName`` values found in the stop file."""
        routes = pd.read_csv(self.stop_file, usecols=['RouteName'])
        return list(routes['RouteName'].unique())

    def route_extractor(self, routeName, direction):
        """Return the de-duplicated stop rows of one route in one direction.

        The route name is first compared as text, which also matches a
        ``RouteName`` column that pandas parsed as integers.  If nothing
        matches, a numeric comparison is tried.  A name that is not numeric
        yields an empty frame instead of raising ``ValueError``.
        """
        stops = self._load_stops()
        in_direction = stops['Direction'] == direction
        name_matches = stops['RouteName'].astype(str) == str(routeName)
        single_route = stops[name_matches & in_direction].drop_duplicates()

        if single_route.shape[0] == 0:
            try:
                numeric_name = int(routeName)
            except (TypeError, ValueError):
                return single_route
            single_route = stops[(stops['RouteName'] == numeric_name) & in_direction].drop_duplicates()

        return single_route

    def replace_dash_time(self, value):
        """Map the ``'--'`` placeholder to ``None``; return other values unchanged."""
        if value == '--':
            return None
        else:
            return value

    def eta_extractor(self, routeName, direction, directory, filename):
        """Load one ETA snapshot and return the rows of one route and direction.

        ``RouteName`` is compared as text.  ``EstimateTime(Sec)`` is converted
        to numbers, with ``'--'`` entries becoming missing values.
        """
        eta = pd.read_csv(os.path.join(directory, filename), index_col=0)
        eta['RouteName'] = eta['RouteName'].astype(str)
        single_eta = eta[(eta['RouteName'] == str(routeName)) & (eta['Direction'] == direction)].copy()
        estimate = single_eta['EstimateTime(Sec)'].apply(self.replace_dash_time)
        single_eta['EstimateTime(Sec)'] = pd.to_numeric(estimate, downcast='integer')
        return single_eta

    def single_runtime(self, routeName, direction, directory, filename):
        """Return the running times between consecutive stops for one snapshot.

        The stops are ordered by descending ``StopSequence`` and the result is
        the negated difference of ``EstimateTime(Sec)`` between each row and
        the row before it (the first value is always missing).

        The route is the left side of the merge, so the result has one value
        per route stop, in the same order as ``route_frame``.  A stop that is
        absent from the snapshot produces missing values rather than shifting
        later values onto the wrong rows.
        """
        single_eta = self.eta_extractor(routeName, direction, directory, filename)
        single_route = self.route_extractor(routeName, direction)
        # Keep one ETA row per stop so the merge cannot add rows to the route.
        eta_per_stop = single_eta[['StopUID', 'Direction', 'RouteName', 'EstimateTime(Sec)']]\
            .drop_duplicates(subset=['StopUID', 'Direction'])
        df = single_route[['StopUID', 'Direction', 'StopSequence']]\
            .merge(eta_per_stop, on=['StopUID', 'Direction'], how='left')\
            .sort_values(by='StopSequence', ascending=False, kind='mergesort')
        running_time = df['EstimateTime(Sec)'].diff() * -1
        return running_time

    def route_frame(self, routeName, direction, directory=None, filename=None):
        """Return the stops of a route/direction ordered by descending ``StopSequence``.

        ``directory`` and ``filename`` are not used; they are kept (now
        optional) so existing callers that pass them keep working.
        """
        single_route = self.route_extractor(routeName, direction)
        result = single_route[['StopName', 'StopUID', 'Direction', 'RouteName', 'StopSequence']]\
            .sort_values(by='StopSequence', ascending=False, kind='mergesort')
        return result

    def replace_nega(self, value):
        """Map negative values to ``None``; return other values unchanged.

        Because ``None`` is returned rather than ``-1``,
        ``refine_time_before_negative`` (which only reacts to ``-1``) leaves
        the output of this method untouched.
        """
        if value < 0:
            return None
        else:
            return value

    def refine_time_before_negative(self, series):
        """Return ``series`` with a 0-based index and ``-1`` copied backwards.

        Every element that directly precedes a ``-1`` in the input is set to
        ``-1`` as well.  A ``-1`` in the first position has no predecessor and
        is left as is (it no longer adds an element to the series).
        """
        series_re_idx = series.reset_index(drop=True)
        flagged = [pos for pos, value in enumerate(series_re_idx.tolist()) if value == -1]
        for pos in flagged:
            if pos > 0:
                series_re_idx.iloc[pos - 1] = -1
        return series_re_idx

    def runtime_generator(self, route_list, directory):
        """Build, store and write the running-time table of every route/direction.

        For each route in ``route_list`` and each direction (0 and 1), one
        column per selected snapshot file is added to the route's stop frame,
        followed by a ``mean`` column averaging those columns.  The frame is
        written to ``<output_folder>/<route>_<direction>X.csv`` and stored in
        ``self.df_list['<route>-<direction>']``.

        Args:
            route_list: iterable of route names.
            directory: folder containing the ETA snapshot CSV files.

        Returns:
            The sorted snapshot file names used for the last route processed,
            or an empty list when ``route_list`` is empty.
        """
        if self.output_folder:
            os.makedirs(self.output_folder, exist_ok=True)

        file_list = []
        for route in route_list:
            # The file selection depends on the route only, not the direction.
            file_list = self._snapshot_files(route, directory)

            for direction in [0, 1]:
                start = time.time()
                frame = self.route_frame(route, direction).reset_index(drop=True)
                # Columns present before any snapshot column is added.
                n_meta_cols = frame.shape[1]

                for filename in file_list:
                    # filename example: "672-eta_2020-6-26_1435.csv"
                    date_part, time_part = filename.split('_')[1:3]
                    extract_time = date_part + '_' + time_part.split('.')[0]
                    sys.stdout.write('\rprocessing route: {}, direction: {}, time: {} {}'
                                     .format(route, direction, date_part, time_part))
                    run_time = self.single_runtime(route, direction, directory, filename)
                    run_time = run_time.apply(self.replace_nega)
                    run_time = self.refine_time_before_negative(run_time)
                    frame[extract_time] = run_time

                frame['mean'] = frame.iloc[:, n_meta_cols:].mean(axis=1)
                frame.to_csv(os.path.join(self.output_folder, '{}_{}X.csv'.format(route, direction)),
                             encoding='utf-8', index=False)
                self.df_list['{}-{}'.format(route, direction)] = frame

                sys.stdout.write(' process time: {}'.format(time.time() - start))
                print('-')

        print('\nDone')

        return file_list



In [69]:
# Routes to process in this run.
route_list = ['672']

# Stop definitions are read from the support file; one CSV per route and
# direction is written to the output folder.
obj = timeSpending(stop_file='support-file/bus-stops.csv', output_folder='output')

# runtime_generator returns the sorted list of ETA file names it used;
# the per-route/direction frames are kept in obj.df_list.
df = obj.runtime_generator(route_list=route_list, directory='bus-data')

  res_values = method(rvalues)


processing route: 672, direction: 0, time: 2020-6-22 0855.csv process time: 1.043247938156128-
processing route: 672, direction: 1, time: 2020-6-22 0855.csv process time: 1.0385751724243164-

Done
