# Data Analytics - Bus Data Understanding and Prep

## Audit

Author: Danning

Last Modified By: Adam

Module: COMP47630

DC:     2021-06-20

DLM:    2021-06-23

Desc:   This file contains an analysis of the historic bus data

Dict:   The Data Dictionary for the Data Set is available in Brightspace

## Table of Contents

00. Introduction

01. Exec Summary and Results

02. Modules

03. Constants

04. Ingestion

05. Cleansing


## 00. Introduction

### 00.01 Background
(here)

### 00.02 Problem Scope
(here)

### 00.03 Data
(here)

### 00.04 Approach
(here)

## 01. Exec Summary

(here)

---
# ------BEGIN---------- #

## 01. Static

### 01.01 Modules
Import all modules here

In [1]:
import sys
print(sys.executable)

/home/team10/comp47360/bin/python3


In [2]:
####--------------------------------------
#00.Import Modules
####--------------------------------------

######---------BEGIN
#      SUPPRESS DEPRECIATION WARNINGS: Applicable to datetime_is_numeric=True
######--------END

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

######---------BEGIN
#      ML
######--------END

#import nltk as nl
import sklearn as sk
import matplotlib as mp
#import xgboost as xg
#import pymc3 as pymc
#import sympy as sym



######---------BEGIN
#      SQL/API
######--------END


#import requests as rq
#import sqlalchemy as sqla
#import pyodbc
#import cx_oracle as cx


######---------BEGIN
#     GENERAL
######--------END

import pandas as pd
import datetime as dt
import numpy as np
import sys
import os
from dask import dataframe as dask_df
#import pyspark as spk
#import json
#import time
#import socket
#import traceback as tb
#import platform
#from psutil import virtual_memory
import pickle as pck


######---------BEGIN
#     VISUALISATIONS
######--------END


import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import matplotlib.dates as mdates


#For showing plots directly in the notebook run the command below
#%matplotlib inline


###HTML Output Hiding
#Install hide input extension
#!pip install jupyter_contrib_nbextensions
#!jupyter contrib nbextension install --user
#!jupyter nbextension enable hide_input_all/main
#!jupyter nbextension enable hide_input/main
#!jupyter nbextension enable codefolding/main

#!jupyter nbextension disable hide_input_all/main
#!jupyter nbextension disable hide_input/main
#!jupyter nbextension disable codefolding/main

#Update with Filename: Run in Terminal, post completion, after hiding all cells for report
#!jupyter nbconvert --to=html bus_Data.ipynb


### 01.02 Constants
Import all Constants here

In [3]:
data_year='2018'
bus_leavetimes_filepath="./data/rt_leavetimes_DB_{}.txt".format(data_year)
bus_trips_filepath="./data/rt_trips_DB_{}.txt".format(data_year)

bus_leavetimes_sep=";"
bus_trips_sep=";"

bus_leavetimes_data_dictionary={
        'DATASOURCE':['Description','category']
        ,'DAYOFSERVICE':['Description','datetime']
        ,'TRIPID':['Description','category']
        ,'PROGRNUMBER':['Description','category']
        ,'STOPPOINTID':['Description','category']
        ,'PLANNEDTIME_ARR':['Description','int64']
        ,'PLANNEDTIME_DEP':['Description','int64']
        ,'ACTUALTIME_ARR':['Description','int64']
        ,'ACTUALTIME_DEP':['Description','int64']
        ,'VEHICLEID':['Description','category']
        ,'PASSENGERS':['Description','float64']
        ,'PASSENGERSIN':['Description','float64']
        ,'PASSENGERSOUT':['Description','float64']
        ,'DISTANCE': ['Description','float64']
        ,'SUPPRESSED':['Description','category']
        ,'JUSTIFICATIONID':['Description','category']
        ,'LASTUPDATE':['Description','datetime']
        ,'NOTE': ['Description','string']
                        }


lt_datetime_columns=[column_headers for column_headers, column_desc_array in bus_leavetimes_data_dictionary.items() if column_desc_array[1] == 'datetime']
lt_categorical_columns=[column_headers for column_headers, column_desc_array in bus_leavetimes_data_dictionary.items() if column_desc_array[1] == 'category']
lt_num_columns=[column_headers for column_headers, column_desc_array in bus_leavetimes_data_dictionary.items() if column_desc_array[1] in ('numeric','int64','float64')]



bus_leavetimes_metadata_dictionary={}

for column_headers, column_desc_array in bus_leavetimes_data_dictionary.items():
    bus_leavetimes_metadata_dictionary[column_headers]=column_desc_array[1]
    
    
bus_trips_data_dictionary={
        'DATASOURCE':['Description','category']
        ,'DAYOFSERVICE':['Description','datetime']
        ,'TRIPID':['Description','category']
        ,'LINEID':['Description','category']
        ,'ROUTEID':['Description','category']
    
        ,'DIRECTION':['Description','category']
        
        ,'PLANNEDTIME_ARR':['Description','int64']
        ,'PLANNEDTIME_DEP':['Description','int64']
        ,'ACTUALTIME_ARR':['Description','int64']
        ,'ACTUALTIME_DEP':['Description','int64']
    
        ,'BASIN':['Description','category']
        ,'TENDERLOT':['Description','float64']
    
        ,'SUPPRESSED':['Description','category']
        ,'JUSTIFICATIONID':['Description','category']
        ,'LASTUPDATE':['Description','datetime']
        ,'NOTE': ['Description','string']
                        }

bus_trips_metadata_dictionary={}

for column_headers, column_desc_array in bus_trips_data_dictionary.items():
    bus_trips_metadata_dictionary[column_headers]=column_desc_array[1]

    
tp_datetime_columns=[column_headers for column_headers, column_desc_array in bus_trips_data_dictionary.items() if column_desc_array[1] == 'datetime']
tp_categorical_columns=[column_headers for column_headers, column_desc_array in bus_trips_data_dictionary.items() if column_desc_array[1] == 'category']
tp_num_columns=[column_headers for column_headers, column_desc_array in bus_trips_data_dictionary.items() if column_desc_array[1] in ('numeric','int64','float64')]




    

#Dates for File Additions if needed
today_date=dt.datetime.now()

#DateTime objects
today_year=today_date.year
today_month=today_date.month
today_day=today_date.day

#Convert to ISO Standard for Filename
str_year=str(today_date.year)

#Month should have two digits
str_month=str(today_date.month)
if len(str_month)==1:
    str_month="0{}".format(str_month)

#Day should have two digits
str_day=str(today_date.day)
if len(str_day)==1:
    str_day="0{}".format(str_day)


str_today_date="{}-{}-{}".format(str_year,str_month,str_day)

datetime_format='%d-%b-%Y %H:%M:%S'

## 02. Exploration

In [1]:
def ingest_data(fp,delim,data_dictionary,chunks=10000000,pandas=False):
    """A function to read in CSV Data and Validate.
    
    Memory error after 50M rows"""

    print("Inside ingest_data({},dictionary)".format(fp))
    
    def print_shape(raw_df):
        """A function to print the shape of a dataframe"""
        #row_column data
        shape_of_df=raw_df.shape
        row_count=shape_of_df[0]
        column_count=shape_of_df[1]

        #print info to user
        row_column_print_statement='Your file contains: \n{} rows x {} columns.\n\n'
        row_column_print_statement=row_column_print_statement.format(row_count,column_count)
        print(row_column_print_statement)
        header_statement='The following columns are present:\n'

        #print the headers
        for header in raw_df.columns:
            header_statement+='"{}"\n'.format(header)

        print(header_statement)
        return
        
    def verify_schema(raw_df,data_dictionary):
        """A function to validate the schema of a dataframe"""
        
        match=False
        #check if the schema is correct
        if set(raw_df.columns)==set(data_dictionary.keys()) and len(raw_df.columns)==len(data_dictionary.keys()):
            print('The columns in this data sample match the schema')
            match=True

        else:
            print('The columns in this data sample do not match the schema')
            
        return match

        

    if pandas==True:
        #Valid Filepath
        if os.path.isfile(fp):

            raw_df = pd.DataFrame()
            chunk_count=0
            
            for chunk in pd.read_csv(fp,sep=delim,dtype=str,chunksize=chunks):
                chunk_count+=1
                print('On Chunk: {}'.format(chunk_count))
                raw_df = pd.concat([raw_df,chunk])
                
            display(raw_df)

            print_shape(raw_df)

            verify_schema(raw_df,data_dictionary)

            print("\n\n\nSample Data:\n\n\n")
            display(raw_df.head())
            
            return raw_df

        #Not Valid Filepath
        else:
            print("Invalid filepath - Correct the filepath and re-ingest")

            return
        
    elif pandas==False:
        
        #Valid Filepath
        if os.path.isfile(fp):

            #read_csv - Do Not Let Pandas Manipulate the Data First - Auto-assign is more memory intensive.
            raw_df=dask_df.read_csv(fp, sep=delim)
            display(raw_df)

            print_shape(raw_df)


            verify_schema(raw_df,data_dictionary)
            
            print("\n\n\nSample Data:\n\n\n")
            display(raw_df.head())

            return raw_df

        #Not Valid Filepath
        else:
            print("Invalid filepath - Correct the filepath and re-ingest")

            return
        
    else:
        print('No opinion on using Dask or Pandas - Defaulting to Dask')
        
        #Valid Filepath
        if os.path.isfile(fp):

            #read_csv - Do Not Let Pandas Manipulate the Data First - Auto-assign is more memory intensive.
            raw_df=dask_df.read_csv(fp, sep=delim)
            display(raw_df)

            print_shape(raw_df)


            verify_schema(raw_df,data_dictionary)

            print("\n\n\nSample Data:\n\n\n")
            display(raw_df.head())
            
            return raw_df
        
        #Not Valid Filepath
        else:
            print("Invalid filepath - Correct the filepath and re-ingest")

            return

In [5]:
def data_convert(df,types,columnlist,dt_format):
    """A function to convert all columns in a list into the appropriate type"""
    
    print("Inside data_convert()")
    
    ###Check if empty
    if len(df.index) != 0:
        
        ##Check if datetime or other
        if types=='datetime':
       
            ###Check if 0
            if len(columnlist)>0:
                print('Converting to {}'.format(types))
                
                for column in columnlist:
                    df[column]=df[column].apply(pd.to_datetime,format=dt_format,errors='ignore')
                
            else:
                print('No need to convert to: {}'.format(types))
              
        ###Numeric type
        elif types=='category':
            ###Check if 0
            if len(categorical_columns)>0:
                print('Converting to {}'.format(types))
                
                for column in columnlist:
                    df[column]=df[column].astype('category')
                
            ###Nothing to convert
            else:
                print('No need to convert')
                
        ###Numeric type
        elif types=='numeric':
            
            ###Check if 0
            if len(num_columns)>0:
                print('Converting to Numerical')
                
                for column in columnlist:
                    df[column]=df[column].apply(pd.to_numeric, errors='ignore')
                
            else:
                print('No need to convert')
                
        ###Other type - e.g. Boolean, string - Dont do anything - force the above types.
        else:
            print('Unknown type')
                
    ###Empty data set          
    else:
        print("Empty dataframe")

In [6]:

def missing_check(row):
    """Highlight rows with potential missing_values"""

    #Configuration Values
    col_to_check=10
    default_colour = 'white'
    flag_colour=''
    high_flag_colour_val='red'
    med_flag_colour_val='orange'
    low_flag_colour_val='yellow'
    val_to_check=0

    #Row length valid
    if len(row)>=col_to_check:

        #
        if row.values[col_to_check] == 'High':
            flag_colour = high_flag_colour_val
            
        elif row.values[col_to_check] == 'Medium':
            flag_colour = med_flag_colour_val
            
        elif row.values[col_to_check] == 'Low':
            flag_colour = low_flag_colour_val

        if flag_colour=='':
            colour=default_colour
        else:
            colour=flag_colour

        return ['background-color: {}'.format(colour)]*len(row.values)

    else:
        print('Row too short - Reconfigure Column Number')
        return ['background-color: {}'.format(default_colour)]*len(row.values)
    
def dt_missing_check(row):
    """Highlight rows with potential missing_values"""

    #Configuration Values
    col_to_check=9
    default_colour = 'white'
    flag_colour=''
    high_flag_colour_val='red'
    med_flag_colour_val='orange'
    low_flag_colour_val='yellow'
    val_to_check=0

    #Row length valid
    if len(row)>=col_to_check:

        #
        if row.values[col_to_check] == 'High':
            flag_colour = high_flag_colour_val
            
        elif row.values[col_to_check] == 'Medium':
            flag_colour = med_flag_colour_val
            
        elif row.values[col_to_check] == 'Low':
            flag_colour = low_flag_colour_val

        if flag_colour=='':
            colour=default_colour
        else:
            colour=flag_colour

        return ['background-color: {}'.format(colour)]*len(row.values)

    else:
        print('Row too short - Reconfigure Column Number')
        return ['background-color: {}'.format(default_colour)]*len(row.values)
    
def group_over_single_categories(df,categorical_columns,pdf_fn, save_fig=True):
    """A function to group over the categories"""
    
    print("Inside group_over_single_categories()")
    row_count=len(df)

    grouping_type={}
    timestamp_now=dt.datetime.timestamp(dt.datetime.now())
    
    #Dataframe is not empty, and there are categorical columns to group over:
    if df.empty==False and len(categorical_columns)>0:
        with PdfPages(pdf_fn) as pp: #lab
            column=''

            #Let's go through the category column type
            for column in categorical_columns:

                #Separator
                print('\n\n----------------------\n\n')
                agg_df=df.groupby([column]).agg({df.columns[0]:"count"})
                print(agg_df)
                agg_df=agg_df.reset_index()
                agg_df=agg_df.rename(columns={df.columns[0]:'Rows'})

                #Note: Could also do value_counts but I prefer that for graphing.
                agg_df['% Frequency']=100*(agg_df['Rows']/row_count)

                #Be explicit over what we're displaying
                print('Grouping over {} results in:\n'.format(column))

                #Display the result
                display(agg_df)
                #Graphing Section:

                figure = (
                            df[column]
                              .value_counts(dropna=True, normalize=True)
                              .plot(kind='bar'
                                    ,title='Count of values for {}'.format(column)
                                    , xlabel='Field Values'
                                    , ylabel='Count of Values'
                                    , figsize=(35,35)
                                   )
                 )
                #This grid style is from the sample Lab5 as I like how it looks
                plt.ylim([0,1])
                plt.grid(b=True, which='major', color='#666666', linestyle='-')
                plt.setp(figure.get_xticklabels(), ha="right", rotation=0)
                plt.minorticks_on()
                plt.grid(b=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
                plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
                plt.show()
                grouping_type[column]=agg_df

                if save_fig:
                    pp.savefig(figure.get_figure())


                





    
    return grouping_type

def group_over_multi_categories(df,categorical_columns,pdf_fn,save_output=False,save_fig=False):
    """A function to group over all pairs of categories
    
    Warning: This can be memory intensive as we have (columnCount)C(2) pairings, so only run this if your device is able!"""
    
    print("Inside group_over_multi_categories()")
    row_count=len(df)
    grouping_type={}
    timestamp_now=dt.datetime.timestamp(dt.datetime.now())
    
    #Try run this
    try:
    
        #Dataframe is not empty, and there are categorical columns to group over:
        if df.empty==False and len(categorical_columns)>0:
            with PdfPages(pdf_fn) as pp:
                column=''
                second_column=''
                
                #Let's go through the category column type
                for column in categorical_columns:

                    #Second index, n^2
                    for second_column in categorical_columns:
                        multi_column=[column]


                        #Create a key to access - pipe delimited as columns contain _
                        grouping_key="{}|{}"

                        #No point in grouping the same column twice
                        if second_column!=column:
                            multi_column+=[second_column]
                            grouping_key=grouping_key.format(column,second_column)

                            #Separator
                            print('\n\n----------------------\n\n')
                            agg_df=df.groupby(multi_column).agg({df.columns[0]:"count"})
                            agg_df=agg_df.reset_index()
                            agg_df=agg_df.rename(columns={df.columns[0]:'Rows'})
                            agg_df['% Frequency']=100*(agg_df['Rows']/row_count)

                            #Be explicit over what we're displaying
                            print('Grouping over {} results in:\n'.format(grouping_key))

                            #Display the result
                            display(agg_df)
                            
                            #Graph
                            figure = (
                                        (df[multi_column]
                                              .dropna()
                                              .value_counts(normalize=True)
                                              .reset_index()
                                              .pivot_table(index=column,columns=second_column)
                                              .fillna(0))[0]
                                                      .plot(kind='bar'
                                                        , stacked=True
                                                        , title='Count of values for {} vs {}'.format(second_column,column)
                                                        , xlabel='Field Values'
                                                        , ylabel='Count of Values'
                                                        , figsize=(35,35)

                                                            
                                                           )
                                     )
                
                            #This grid style is from the sample Lab5 as I like how it looks
                            plt.ylim([0,1])
                            plt.grid(b=True, which='major', color='#666666', linestyle='-')
                            plt.setp(figure.get_xticklabels(), ha="right", rotation=0)
                            plt.minorticks_on()
                            plt.grid(b=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
                            plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
                            plt.show()

                            

                            if save_fig:
                                pp.savefig(figure.get_figure())

                            #Only save if explicitly passed - This could kill your memory.
                            if save_output:
                                grouping_type[grouping_key]=agg_df

                        
    #Catch exceptions
    except Exception as exc:
        print("Function exception:\n")
        #check exception is memory error
        if exc==MemoryError:
            print("Sorry, your device is not able to run this function as you have hit a memory limit")
            
        print(exc)
        

    return grouping_type


def cat_missing_check_cleanse(row):
    """Highlight rows with potential missing_values"""

    #Configuration Values
    col_to_check=8
    default_colour = 'green'
    flag_colour=''
    high_flag_colour_val='red'
    med_flag_colour_val='orange'
    low_flag_colour_val='yellow'
    val_to_check=0

    #Row length valid
    if len(row)>=col_to_check:

        #
        if row.values[col_to_check] == 'High':
            flag_colour = high_flag_colour_val
            
        elif row.values[col_to_check] == 'Medium':
            flag_colour = med_flag_colour_val
            
        elif row.values[col_to_check] == 'Low':
            flag_colour = low_flag_colour_val

        if flag_colour=='':
            colour=default_colour
        else:
            colour=flag_colour

        return ['background-color: {}'.format(colour)]*len(row.values)

    else:
        print('Row too short - Reconfigure Column Number')
        return ['background-color: {}'.format(default_colour)]*len(row.values)
    
def stacked_group_over_multi_categories(df,categorical_columns,pdf_fn,save_output=False,save_fig=False):
    """A function to group over all pairs of categories
    
    Warning: This can be memory intensive as we have (columnCount)C(2) pairings, so only run this if your device is able!"""
    
    print("Inside group_over_multi_categories()")
    row_count=len(df)
    grouping_type={}
    timestamp_now=dt.datetime.timestamp(dt.datetime.now())
    
    #Try run this
    try:
    
        #Dataframe is not empty, and there are categorical columns to group over:
        if df.empty==False and len(categorical_columns)>0:
            with PdfPages(pdf_fn) as pp:
                column=''
                second_column=''
                
                #Let's go through the category column type
                for column in categorical_columns:

                    #Second index, n^2
                    for second_column in categorical_columns:
                        multi_column=[column]


                        #Create a key to access - pipe delimited as columns contain _
                        grouping_key="{}|{}"

                        #No point in grouping the same column twice
                        if second_column!=column:
                            multi_column+=[second_column]
                            grouping_key=grouping_key.format(column,second_column)

                            #Separator
                            print('\n\n----------------------\n\n')
                            agg_df=df.groupby(multi_column).agg({df.columns[0]:"count"})
                            agg_df=agg_df.reset_index()
                            agg_df=agg_df.rename(columns={df.columns[0]:'Rows'})
                            agg_df['% Frequency']=100*(agg_df['Rows']/row_count)

                            #Be explicit over what we're displaying
                            print('Grouping over {} results in:\n'.format(grouping_key))

                            #Display the result
                            display(agg_df)
                            agg_df=agg_df.reset_index()

                            sagg_df=(
                                    df
                                    .groupby([column])
                                    .agg({df.columns[0]:"count"})
                                    .reset_index()
                                    .rename(columns={df.columns[0]:'TotalRows'})
                                    )

                            join_df=agg_df.merge(sagg_df,left_on=column,right_on=column,suffixes=('_subbed','_group'))
                            join_df['% Stacked']=join_df['Rows']/join_df['TotalRows']

                            figure=((join_df
                                  .pivot_table(index=column,columns=second_column,values='% Stacked')
                                  .fillna(0))

                                          .plot(kind='bar'
                                            , stacked=True
                                            , title='Distribution of values for {} vs {}'.format(second_column,column)
                                            , xlabel='Field Values'
                                            , ylabel='Makeup of Values'
                                            , figsize=(35,35)

                            ))

                            #This grid style is from the sample Lab5 as I like how it looks
                            plt.ylim([0,1])
                            plt.grid(b=True, which='major', color='#666666', linestyle='-')
                            plt.setp(figure.get_xticklabels(), ha="right", rotation=0)
                            plt.minorticks_on()
                            plt.grid(b=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
                            plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
                            plt.show()

                            

                            if save_fig:
                                pp.savefig(figure.get_figure())

                            #Only save if explicitly passed - This could kill your memory.
                            if save_output:
                                grouping_type[grouping_key]=agg_df

                        
    #Catch exceptions
    except Exception as exc:
        print("Function exception:\n")
        #check exception is memory error
        if exc==MemoryError:
            print("Sorry, your device is not able to run this function as you have hit a memory limit")
            
        print(exc)
        

    return grouping_type


def stacked_group_over_target_categories(df,categorical_columns,pdf_fn,save_output=False,save_fig=False):
    """A function to group over all pairs of categories and the target death_yn
    
    Warning: This can be memory intensive as we have so only run this if your device is able!"""
    
    print("Inside group_over_multi_categories()")
    row_count=len(df)
    grouping_type={}
    timestamp_now=dt.datetime.timestamp(dt.datetime.now())
    
    #Try run this
    try:
    
        #Dataframe is not empty, and there are categorical columns to group over:
        if df.empty==False and len(categorical_columns)>0:
            with PdfPages(pdf_fn) as pp:
                column=''
                second_column=''
                
                #Let's go through the category column type
                for column in categorical_columns:

                    #Second index, n^2
                    for second_column in ['death_yn']:
                        multi_column=[column]


                        #Create a key to access - pipe delimited as columns contain _
                        grouping_key="{}|{}"

                        #No point in grouping the same column twice
                        if second_column!=column:
                            multi_column+=[second_column]
                            grouping_key=grouping_key.format(column,second_column)

                            #Separator
                            print('\n\n----------------------\n\n')
                            agg_df=df.groupby(multi_column).agg({df.columns[0]:"count"})
                            agg_df=agg_df.reset_index()
                            agg_df=agg_df.rename(columns={df.columns[0]:'Rows'})
                            agg_df['% Frequency']=100*(agg_df['Rows']/row_count)

                            #Be explicit over what we're displaying
                            print('Grouping over {} results in:\n'.format(grouping_key))

                            #Display the result
                            display(agg_df)
                            agg_df=agg_df.reset_index()

                            sagg_df=(
                                    df
                                    .groupby([column])
                                    .agg({df.columns[0]:"count"})
                                    .reset_index()
                                    .rename(columns={df.columns[0]:'TotalRows'})
                                    )

                            join_df=agg_df.merge(sagg_df,left_on=column,right_on=column,suffixes=('_subbed','_group'))
                            join_df['% Stacked']=join_df['Rows']/join_df['TotalRows']

                            figure=((join_df
                                  .pivot_table(index=column,columns=second_column,values='% Stacked')
                                  .fillna(0))

                                          .plot(kind='bar'
                                            , stacked=True
                                            , title='Distribution of values for {} vs {}'.format(second_column,column)
                                            , xlabel='Field Values'
                                            , ylabel='Makeup of Values'
                                            , figsize=(35,35)

                            ))

                            #This grid style is from the sample Lab5 as I like how it looks
                            plt.ylim([0,1])
                            plt.grid(b=True, which='major', color='#666666', linestyle='-')
                            plt.setp(figure.get_xticklabels(), ha="right", rotation=0)
                            plt.minorticks_on()
                            plt.grid(b=True, which='minor', color='#999999', linestyle='-', alpha=0.2)
                            plt.legend(loc='upper left', bbox_to_anchor=(1,1), ncol=1)
                            plt.show()

                            

                            if save_fig:
                                pp.savefig(figure.get_figure())

                            #Only save if explicitly passed - This could kill your memory.
                            if save_output:
                                grouping_type[grouping_key]=agg_df

                        
    #Catch exceptions
    except Exception as exc:
        print("Function exception:\n")
        #check exception is memory error
        if exc==MemoryError:
            print("Sorry, your device is not able to run this function as you have hit a memory limit")
            
        print(exc)
        

    return grouping_type

In [25]:
def group_by_column(df,groupby_columns,agg_dict):
    """A function to group by columns given and aggregate according to a dictionary.
    
    Input: df, columns to group by, agg_dictionary
    """
    
    print("inside group_by_column(df,{},{})".format(groupby_columns,agg_dict))
    
    #Possible Errors
    error_dictionary={0:'No Error'
                     ,1:'The dataframe is empty'
                     ,2:"The columns to group by is empty or not a list"
                     ,3: 'The dictionary is empty'
                     ,4: 'The dataframe does not contain the required columns'
                      ,999: 'Uncaught exception'
                     }
    
    #Set as empty
    summary_df=pd.DataFrame()
    required_columns=[]
    
    error_code=0
    
    try:

        #Dictionary is non-empty
        if len(agg_dict)>0 and type(agg_dict)==dict:

            #df not empty
            if len(df)>0:

                #List and non-empty
                if type(groupby_columns)==list and len(groupby_columns)>0:
                    required_columns=list(df.columns)+list(agg_dict.keys())

                    #Required columns found
                    if set(required_columns).issubset(set(df.columns)):

                        #begin groupby - note: not catching summary issues as they are plentiful
                        summary_df=(df
                                        .groupby(groupby_columns)
                                        .agg(agg_dict)
                                        .reset_index()
                                    )


                    #Required columns not found    
                    else:
                        error_code=4
                        error_message=error_dictionary[error_code]
                        print(error_message)

                #Not a list or empty
                else:
                    error_code=2
                    error_message=error_dictionary[error_code]
                    print(error_message)

            #df is not empty
            else:
                error_code=1
                error_message=error_dictionary[error_code]
                print(error_message)

        #empty Dictionary
        else:
            error_code=3
            error_message=error_dictionary[error_code]
            print(error_message)
            
    except Exception as e:
        error_code=999
        print("Uncaught exception: {}".format(e))
        
    return [error_code,summary_df]

### 02.02 Begin Looking at the Trips dataset

Using Pandas to ingest the dataset and display some summary statistics

The features of the trips dataset are:

From a brief look it looks as if the the columns Ternderlot, Suppressed, justification might be all empty. Empty columns can be removed from the dataset.

The datasource feature could be a constant column so will not include any valuable information for our task, so i suggest it be removed.

There are some missing values in the feature actual departure time, which will raise some issues in the future, this feature might be important depending on what we choose to do, so we may want to find a way to repair the feature.

The feature "basin" could be a constant feature as well, using a similar argument to datasource , we can remove the column.

This would mean that form the "trips" dataset,

[DayOfService

TripID

LineID

ROuteID

Direction

PLannedTime_arr

PlannedTime_DEP

ActualTime_arr

actualtime_dep

last_update

note]

 would be the only columns remaining.

### 02.02 Looking at the Leavetime dataset

I will use dask to look at the dataset as it can load more things and is supposed to be faster for larger datasets.

# opening everything with dask

because using pandas crashes my connection with the server

## Reading and cehcking the data for irregularities

In [2]:
import dask.dataframe as dd

In [3]:
bus_leavetimes_df = dd.read_csv("./data/rt_leavetimes_DB_2018.txt", sep=";")

In [3]:
bus_leavetimes_df

Unnamed: 0_level_0,DATASOURCE,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,PASSENGERS,PASSENGERSIN,PASSENGERSOUT,DISTANCE,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
npartitions=337,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,object,object,int64,int64,int64,int64,int64,int64,int64,int64,float64,float64,float64,float64,float64,float64,object,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [4]:
bus_trip_df = dd.read_csv("./data/rt_trips_DB_2018.txt", sep=";")

In [5]:
bus_trip_df

Unnamed: 0_level_0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
npartitions=7,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
,object,object,int64,object,object,int64,int64,int64,float64,float64,object,float64,float64,float64,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
LIST_CATEGORIES = []

bus_trip_df["DATASOURCE"] = bus_trip_df["DATASOURCE"].astype("category")
bus_trip_df["DIRECTION"] = bus_trip_df["DIRECTION"].astype("category")
bus_trip_df["BASIN"] = bus_trip_df["BASIN"].astype("category")
bus_trip_df["LINEID"] = bus_trip_df["LINEID"].astype("category")
bus_trip_df["ROUTEID"] = bus_trip_df["ROUTEID"].astype("category")

#float data
bus_trip_df["TRIPID"] = bus_trip_df["TRIPID"].astype("int64")
bus_trip_df["PLANNEDTIME_ARR"] = bus_trip_df["PLANNEDTIME_ARR"].astype("int64")
bus_trip_df["PLANNEDTIME_DEP"] = bus_trip_df["PLANNEDTIME_DEP"].astype("int64")
#bus_trip_df["NOTE"] = bus_trip_df["NOTE"].astype("int64")

#bus_trip_df["ACTUALTIME_ARR"] = bus_trip_df["ACTUALTIME_ARR"].astype("int64")
#bus_trip_df["ACTUALTIME_DEP"] = bus_trip_df["ACTUALTIME_DEP"].astype("int64")



From looking at the historical data after looking at the realtime, current data, we see that the tripid is formatted differently, 

In [7]:
# check the data is imported correctly
bus_trip_df.head(10)

Unnamed: 0,DATASOURCE,DAYOFSERVICE,TRIPID,LINEID,ROUTEID,DIRECTION,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,BASIN,TENDERLOT,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE,NOTE
0,DB,07-FEB-18 00:00:00,6253783,68,68_80,1,87245,84600,87524.0,84600.0,BasDef,,,,28-FEB-18 12:05:11,",2967409,"
1,DB,07-FEB-18 00:00:00,6262138,25B,25B_271,2,30517,26460,32752.0,,BasDef,,,,28-FEB-18 12:05:11,",2580260,"
2,DB,07-FEB-18 00:00:00,6254942,45A,45A_70,2,35512,32100,36329.0,32082.0,BasDef,,,,28-FEB-18 12:05:11,",2448968,"
3,DB,07-FEB-18 00:00:00,6259460,25A,25A_273,1,57261,54420,58463.0,54443.0,BasDef,,,,28-FEB-18 12:05:11,",3094242,"
4,DB,07-FEB-18 00:00:00,6253175,14,14_15,1,85383,81600,84682.0,81608.0,BasDef,,,,28-FEB-18 12:05:11,",2526331,"
5,DB,07-FEB-18 00:00:00,6248240,77A,77A_28,2,41648,37200,42019.0,37538.0,BasDef,,,,28-FEB-18 12:05:11,",2966500,"
6,DB,07-FEB-18 00:00:00,6251760,39,39_21,2,34768,28920,35709.0,28929.0,BasDef,,,,28-FEB-18 12:05:11,",2422850,"
7,DB,07-FEB-18 00:00:00,6262909,16,16_20,1,43936,38880,43721.0,38955.0,BasDef,,,,28-FEB-18 12:05:11,",3045261,"
8,DB,07-FEB-18 00:00:00,6251147,40D,40D_102,1,33558,30600,34540.0,31185.0,BasDef,,,,28-FEB-18 12:05:11,",2837123,"
9,DB,07-FEB-18 00:00:00,6249435,27B,27B_34,2,52682,49800,53281.0,49974.0,BasDef,,,,28-FEB-18 12:05:11,",2329338,"


In [8]:
# che k number of unique lines in the trips dataset
bus_trip_df["LINEID"].nunique().compute()

130

In [9]:
len(bus_leavetimes_df)

116949113

In [10]:
# the number of routes,
# contaisn different versions of the same line as the 
bus_trip_df["ROUTEID"].nunique().compute()

588

In [5]:
# collecting the trips dataset with respect to the Line and collecting all of the tripids in a list
filter_result = bus_trip_df.groupby('LINEID')['TRIPID'].apply(list).reset_index()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  filter_result = bus_trip_df.groupby('LINEID')['TRIPID'].apply(list).reset_index()


In [25]:
filter_result["LINEID"]

Unnamed: 0,LINEID,TRIPID
0,111,"[7321397, 6736637, 6736639, 6736647, 6736630, ..."
1,145,"[6636273, 6631234, 6641668, 6643671, 6643699, ..."
2,15D,"[7175345, 8070285, 8052192, 7168575, 8050929, ..."
3,238,"[7146929, 8098733, 7328510, 7328508, 7319958, ..."
4,25X,"[8068486, 8104083, 8052715, 8124060, 6737359, ..."


In [12]:
# list the routes associated with the line
filter_route_line_result = bus_trip_df.groupby('LINEID')['ROUTEID'].apply(list).reset_index()

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  filter_route_line_result = bus_trip_df.groupby('LINEID')['ROUTEID'].apply(list).reset_index()


In [13]:
filter_route_line_result.compute()

Unnamed: 0,LINEID,ROUTEID
0,111,"[111_8, 111_8, 111_9, 111_10, 111_7, 111_8, 11..."
1,145,"[145_105, 145_105, 145_105, 145_102, 145_105, ..."
2,15D,"[15D_63, 15D_62, 15D_65, 15D_62, 15D_62, 15D_6..."
3,238,"[238_11, 238_15, 238_15, 238_15, 238_11, 238_1..."
4,25X,"[25X_10, 25X_11, 25X_11, 25X_11, 25X_12, 25X_1..."
...,...,...
17,66,"[66_11, 66_13, 66_13, 66_11, 66_11, 66_13, 66_..."
18,75,"[75_18, 75_19, 75_17, 75_17, 75_17, 75_19, 75_..."
19,79A,"[79A_27, 79A_27, 79A_27, 79A_27, 79A_27, 79A_2..."
20,7A,"[7A_85, 7A_85, 7A_88, 7A_86, 7A_87, 7A_87, 7A_..."


In [14]:
len(filter_route_line_result)

130

In [15]:
list_routes = list(filter_route_line_result["ROUTEID"].compute())

In [16]:
from collections import Counter

Of the 102 lines it has this number of routes, and the frequencies

In [17]:
print(Counter(list_routes[0]))

Counter({'111_7': 4005, '111_8': 2886, '111_9': 719, '111_10': 238})


In [18]:
%time len(filter_result)

CPU times: user 14.6 s, sys: 1.69 s, total: 16.3 s
Wall time: 10.5 s


130

Tells us that there are 130 unique lines, these include their own routes , the routes differ by the different starting points or different ending points

In [7]:
%time filter_result.head()

CPU times: user 8.62 s, sys: 2.08 s, total: 10.7 s
Wall time: 8.73 s


Unnamed: 0,LINEID,TRIPID
0,102,"[6121819, 6858629, 6855177, 6855178, 6646206, ..."
1,104,"[6856598, 6647566, 6647571, 6276763, 6759749, ..."
2,116,"[7159266, 6291853, 6099533, 6088623, 6291512, ..."
3,118,"[6752981, 6118699, 6665591, 7147145, 6242604, ..."
4,120,"[6115001, 6115928, 6852405, 6860169, 6853319, ..."


In [31]:
# get the list of bus routes
bus_routes = list(filter_result.LINEID.compute())

In [29]:
filter_result.LINEID.compute()

0     111
1     145
2     15D
3     238
4     25X
     ... 
17     66
18     75
19    79A
20     7A
21     84
Name: LINEID, Length: 130, dtype: object

In [32]:
# we have the unique bus routes
print(bus_routes)

['111', '145', '15D', '238', '25X', '27A', '27X', '46E', '49', '51D', '65', '65B', '67', '7', '76A', '77X', '114', '14', '15A', '16C', '185', '25', '27', '270', '33', '39', '39A', '40', '40E', '41', '41B', '45A', '68X', '76', '77A', '83', '104', '116', '120', '220', '236', '31', '31D', '33B', '4', '40B', '42D', '46A', '53', '54A', '66X', '68', '70D', '118', '122', '123', '18', '184', '25B', '31B', '40D', '41A', '42', '43', '56A', '61', '67X', '84X', '1', '13', '150', '151', '161', '25A', '29A', '31A', '33A', '33X', '37', '47', '66A', '66B', '69X', '7D', '9', '102', '130', '142', '14C', '15', '16', '17A', '26', '32', '33D', '33E', '38B', '38D', '44', '44B', '51X', '68A', '69', '70', '79', '7B', '83A', '84A', '11', '140', '15B', '16D', '17', '239', '25D', '27B', '32X', '38', '38A', '39X', '41C', '41D', '41X', '59', '63', '66', '75', '79A', '7A', '84']


In [6]:
#list the trips that are associated iwth route 102, route chosen randomly
# testing for the first line in the list
%time line_102 = list(bus_trip_df["TRIPID"][bus_trip_df["LINEID"] == "102"].compute())

CPU times: user 6.37 s, sys: 959 ms, total: 7.33 s
Wall time: 5.29 s


In [23]:
line_102

[7764193,
 7761736,
 7762172,
 6277493,
 6277492,
 6266653,
 7761744,
 7100366,
 7100367,
 7101507,
 6278971,
 6266657,
 6266656,
 6266658,
 6278972,
 6620297,
 6617210,
 6623766,
 6620258,
 6389409,
 6387381,
 6387383,
 7099681,
 7101512,
 6613665,
 6617208,
 6617209,
 6620257,
 6620294,
 6387380,
 8024207,
 8024212,
 6389406,
 7101518,
 7110511,
 7110509,
 7110513,
 7025238,
 7025241,
 6266654,
 6266652,
 6277494,
 6278969,
 8017278,
 8016530,
 8017981,
 8027705,
 8017277,
 8017980,
 8017651,
 6401158,
 6401168,
 8093343,
 6266649,
 8085867,
 8082702,
 8090455,
 8090453,
 8093348,
 8091211,
 8086416,
 7025103,
 7025120,
 7025109,
 6266648,
 8016526,
 8017979,
 7161974,
 6620306,
 6625125,
 6620304,
 8093353,
 8091214,
 8086419,
 8086410,
 8091204,
 8082697,
 7025094,
 7025147,
 7025105,
 7025123,
 7025111,
 7025154,
 8024211,
 8017653,
 8016529,
 8027703,
 8027712,
 6389450,
 6389451,
 6389411,
 6387385,
 7769069,
 7772164,
 7772170,
 7761743,
 8090447,
 8091202,
 8093342,
 7025226,


In [14]:
#dataframe is the datframe from which the selection is chosen from
# route is the string representaztion of the route
def route_selection(dataframe, route):
    route_leavetimes = dataframe.map_partitions(lambda x: x[x.TRIPID.isin(route)])
    return route_leavetimes

In [8]:
%time line_102 = list(bus_trip_df["TRIPID"][bus_trip_df["LINEID"] == "102"].compute())

CPU times: user 7.07 s, sys: 1.15 s, total: 8.22 s
Wall time: 4.58 s


In [26]:
print(route_102_8)

[6277493, 6277492, 6266653, 7100366, 7101507, 6266656, 6266658, 6278972, 6620297, 6617210, 6623766, 6620258, 6389409, 6387381, 6387383, 6613665, 6617208, 8024212, 8017278, 8016530, 8017981, 6401158, 6401168, 6266649, 8085867, 8082702, 8090455, 8090453, 8093348, 8091211, 8086416, 7025103, 7025120, 7025109, 8016526, 8017979, 7025094, 7025147, 7025105, 7025123, 7025111, 7025154, 8027712, 6389450, 6389451, 6389411, 6387385, 7769069, 7772164, 7772170, 7761743, 6620305, 6625124, 6625126, 6620299, 6389416, 6389414, 6387389, 7148431, 6390169, 6403108, 6613668, 6625122, 6625116, 6390167, 6389407, 7135164, 6266651, 6278964, 6278970, 6277495, 7025155, 7025112, 7025095, 7148438, 7145597, 7148440, 6389453, 6387387, 6230505, 8017652, 8017276, 8016528, 8027704, 6623773, 6236035, 6229364, 8017984, 8016534, 8027710, 8017282, 8017986, 8024214, 8027712, 8017657, 6620303, 7773579, 7145589, 7148433, 8082698, 6230673, 8017284, 8024216, 8017988, 8017659, 8027714, 8024218, 7145593, 7135161, 7148435, 7135162, 

In [15]:
# testing if above function works
%time route_102_df = route_selection(bus_leavetimes_df, line_102)


CPU times: user 24.2 ms, sys: 40 µs, total: 24.3 ms
Wall time: 22.6 ms


In [17]:
%time route_102_df.head()

CPU times: user 747 ms, sys: 99.3 ms, total: 846 ms
Wall time: 948 ms


Unnamed: 0,DAYOFSERVICE,TRIPID,PROGRNUMBER,STOPPOINTID,PLANNEDTIME_ARR,PLANNEDTIME_DEP,ACTUALTIME_ARR,ACTUALTIME_DEP,VEHICLEID,SUPPRESSED,JUSTIFICATIONID,LASTUPDATE
3755,01-JAN-18 00:00:00,5957731,2,935,77743,77743,77715,77715,2868314,,,08-JAN-18 17:21:10
3756,01-JAN-18 00:00:00,5968222,2,935,82843,82843,82844,82844,2534829,,,08-JAN-18 17:21:10
3757,01-JAN-18 00:00:00,5957733,2,935,84643,84643,84643,84643,2868314,,,08-JAN-18 17:21:10
3758,01-JAN-18 00:00:00,5968214,1,4381,42300,42300,42326,42326,1001151,,,08-JAN-18 17:21:10
3759,01-JAN-18 00:00:00,5957726,1,4381,48300,48300,48298,48298,1000198,,,08-JAN-18 17:21:10


In [18]:
%time len(route_102_df)

CPU times: user 5min 51s, sys: 47.7 s, total: 6min 39s
Wall time: 2min 38s


998739

In [None]:
stations_list_102_8 = list(route_102_8_df.STOPPOINTID.compute())


In [None]:
list_route_102_8_stops = list(Counter(stations_list_102_8).keys())
for stop in list_route_102_8_stops:
    print(stop)

In [None]:
#checking if the lines were pulled correctly
print(line_102)

In [10]:
# change datatypes of the leavetimes dataset
bus_leavetimes_df["DATASOURCE"] = bus_leavetimes_df["DATASOURCE"].astype("category")
bus_leavetimes_df["DAYOFSERVICE"] = dd.to_datetime(bus_leavetimes_df["DAYOFSERVICE"])
bus_leavetimes_df["LASTUPDATE"] = dd.to_datetime(bus_leavetimes_df["LASTUPDATE"])


In [11]:
#the datatypes of the features should be correct
bus_leavetimes_df.dtypes

DATASOURCE               category
DAYOFSERVICE       datetime64[ns]
TRIPID                      int64
PROGRNUMBER                 int64
STOPPOINTID                 int64
PLANNEDTIME_ARR             int64
PLANNEDTIME_DEP             int64
ACTUALTIME_ARR              int64
ACTUALTIME_DEP              int64
VEHICLEID                   int64
PASSENGERS                float64
PASSENGERSIN              float64
PASSENGERSOUT             float64
DISTANCE                  float64
SUPPRESSED                float64
JUSTIFICATIONID           float64
LASTUPDATE         datetime64[ns]
NOTE                      float64
dtype: object

# looking at the data and cleaning it accordingly

performing the min and max of the features can give us insight into the features themselves, so since the dataypes should all be correct it will point us towards outliers or irregular data

In [36]:
%time bus_leavetimes_df.max().compute()

CPU times: user 7min 44s, sys: 1min 18s, total: 9min 2s
Wall time: 2min 55s


DAYOFSERVICE       2018-12-31 00:00:00
TRIPID                         8592207
PROGRNUMBER                        109
STOPPOINTID                       7692
PLANNEDTIME_ARR                  91680
PLANNEDTIME_DEP                  91680
ACTUALTIME_ARR                   97177
ACTUALTIME_DEP                   97177
VEHICLEID                      3394131
PASSENGERS                         NaN
PASSENGERSIN                       NaN
PASSENGERSOUT                      NaN
DISTANCE                           NaN
SUPPRESSED                           1
JUSTIFICATIONID            4.84981e+17
LASTUPDATE         2019-01-16 18:27:21
NOTE                               NaN
dtype: object

In [37]:
%time bus_leavetimes_df.min().compute()

CPU times: user 7min 32s, sys: 1min 30s, total: 9min 2s
Wall time: 2min 58s


DAYOFSERVICE       2018-01-01 00:00:00
TRIPID                         5955221
PROGRNUMBER                          1
STOPPOINTID                          2
PLANNEDTIME_ARR                  16200
PLANNEDTIME_DEP                  16200
ACTUALTIME_ARR                   15974
ACTUALTIME_DEP                   15974
VEHICLEID                      1000110
PASSENGERS                         NaN
PASSENGERSIN                       NaN
PASSENGERSOUT                      NaN
DISTANCE                           NaN
SUPPRESSED                           0
JUSTIFICATIONID                 177856
LASTUPDATE         2018-01-08 17:21:10
NOTE                               NaN
dtype: object

In [11]:
# these items are all empty or constant
bus_leavetimes_df = bus_leavetimes_df.drop(["DATASOURCE", "PASSENGERS", "PASSENGERSIN", "PASSENGERSOUT", "DISTANCE", "NOTE"], axis=1)

ValueError: Metadata inference failed in `drop_by_shallow_copy`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
KeyError("['DATASOURCE' 'PASSENGERS' 'PASSENGERSIN' 'PASSENGERSOUT' 'DISTANCE'\n 'NOTE'] not found in axis")

Traceback:
---------
  File "/home/team10/miniconda3/lib/python3.9/site-packages/dask/dataframe/utils.py", line 176, in raise_on_meta_error
    yield
  File "/home/team10/miniconda3/lib/python3.9/site-packages/dask/dataframe/core.py", line 5612, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/home/team10/miniconda3/lib/python3.9/site-packages/dask/dataframe/utils.py", line 710, in drop_by_shallow_copy
    df2.drop(columns=columns, inplace=True, errors=errors)
  File "/home/team10/miniconda3/lib/python3.9/site-packages/pandas/core/frame.py", line 4308, in drop
    return super().drop(
  File "/home/team10/miniconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 4153, in drop
    obj = obj._drop_axis(labels, axis, level=level, errors=errors)
  File "/home/team10/miniconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 4188, in _drop_axis
    new_axis = axis.drop(labels, errors=errors)
  File "/home/team10/miniconda3/lib/python3.9/site-packages/pandas/core/indexes/base.py", line 5592, in drop
    raise KeyError(f"{labels[mask]} not found in axis")


In [12]:
bus_leavetimes_df.dtypes

DAYOFSERVICE        object
TRIPID               int64
PROGRNUMBER          int64
STOPPOINTID          int64
PLANNEDTIME_ARR      int64
PLANNEDTIME_DEP      int64
ACTUALTIME_ARR       int64
ACTUALTIME_DEP       int64
VEHICLEID            int64
SUPPRESSED         float64
JUSTIFICATIONID    float64
LASTUPDATE          object
dtype: object

From the calculating the max values we can see that the max value of the features: [ passeengers* , distance , note] are all none, which means that the columns are empty which means that we can remove them.

The feature "datasource" should be a constant columns as all the data comes form a database.

We can split the leavetimes dataset with respect to the route of the bus, so that only the information of each route is displayed. This will make the dataset smaller and easier to maniupulate and work with.

In [17]:
import sys
!{sys.executable} -m pip install tqdm



In [33]:
from tqdm import tqdm

In [None]:
print(tqdm)

In [None]:
# splitting the data with respect to the lineid, the route id are different versions of the same line
filename = "~/data/leavetimes_split_by_route/route{}.csv"
for i in tqdm(range(130)):
    route_leavetimes = bus_leavetimes_df.map_partitions(lambda x: x[x.TRIPID.isin([filter_result["LINEID"] == bus_routes[i]])])
    route_leavetimes.to_csv(filename.format(bus_routes[i]))

  0%|          | 0/130 [00:00<?, ?it/s]

we loop through each route and get the 

In [34]:
filename = "~/data/leavetimes_split_by_route/route{}.csv"

i = 0
for bus in tqdm(bus_routes):
    print(i)
    i += 1
    line_tripid = list(bus_trip_df["TRIPID"][bus_trip_df["LINEID"] == bus].compute())
    single_route_df = route_selection(bus_leavetimes_df, line_tripid)
    single_route_df.to_csv(filename.format(bus))

  0%|          | 0/130 [00:00<?, ?it/s]

0


  1%|          | 1/130 [02:44<5:54:36, 164.93s/it]

1


  2%|▏         | 2/130 [06:35<7:13:38, 203.27s/it]

2


  2%|▏         | 3/130 [09:11<6:25:21, 182.06s/it]

3


  3%|▎         | 4/130 [11:54<6:05:55, 174.25s/it]

4


  4%|▍         | 5/130 [14:33<5:51:28, 168.70s/it]

5


  5%|▍         | 6/130 [17:24<5:50:54, 169.79s/it]

6


  5%|▌         | 7/130 [20:02<5:40:06, 165.91s/it]

7


  6%|▌         | 8/130 [22:37<5:30:17, 162.43s/it]

8


  7%|▋         | 9/130 [25:32<5:35:10, 166.20s/it]

9


  8%|▊         | 10/130 [28:05<5:24:33, 162.28s/it]

10


  8%|▊         | 11/130 [31:13<5:37:21, 170.09s/it]

11


  9%|▉         | 12/130 [34:31<5:50:54, 178.43s/it]

12


 10%|█         | 13/130 [38:00<6:06:01, 187.71s/it]

13


 11%|█         | 14/130 [41:22<6:11:35, 192.21s/it]

14


 12%|█▏        | 15/130 [44:20<5:59:54, 187.78s/it]

15


 12%|█▏        | 16/130 [47:21<5:53:00, 185.80s/it]

16


 13%|█▎        | 17/130 [50:30<5:51:30, 186.64s/it]

17


 14%|█▍        | 18/130 [54:19<6:12:25, 199.51s/it]

18


 15%|█▍        | 19/130 [57:44<6:12:04, 201.12s/it]

19


 15%|█▌        | 20/130 [1:00:48<5:59:36, 196.15s/it]

20


 16%|█▌        | 21/130 [1:03:56<5:51:49, 193.66s/it]

21


 17%|█▋        | 22/130 [1:07:08<5:47:19, 192.96s/it]

22


 18%|█▊        | 23/130 [1:11:23<6:17:43, 211.81s/it]

23


 18%|█▊        | 24/130 [1:14:32<6:02:07, 204.98s/it]

24


 19%|█▉        | 25/130 [1:17:58<5:59:10, 205.24s/it]

25


 20%|██        | 26/130 [1:21:24<5:56:12, 205.51s/it]

26


 21%|██        | 27/130 [1:24:44<5:49:27, 203.57s/it]

27


 22%|██▏       | 28/130 [1:28:16<5:50:30, 206.18s/it]

28


 22%|██▏       | 29/130 [1:31:25<5:38:41, 201.20s/it]

29


 23%|██▎       | 30/130 [1:35:01<5:42:19, 205.40s/it]

30


 24%|██▍       | 31/130 [1:37:50<5:21:00, 194.56s/it]

31


 25%|██▍       | 32/130 [1:41:09<5:20:02, 195.94s/it]

32


 25%|██▌       | 33/130 [1:44:01<5:05:12, 188.79s/it]

33


 26%|██▌       | 34/130 [1:47:07<5:00:43, 187.95s/it]

34


 27%|██▋       | 35/130 [1:50:51<5:14:44, 198.78s/it]

35


 28%|██▊       | 36/130 [1:55:27<5:47:46, 221.99s/it]

36


 28%|██▊       | 37/130 [1:58:19<5:20:32, 206.80s/it]

37


 29%|██▉       | 38/130 [2:01:11<5:01:22, 196.55s/it]

38


 30%|███       | 39/130 [2:04:10<4:49:53, 191.13s/it]

39


 31%|███       | 40/130 [2:07:00<4:37:08, 184.77s/it]

40


 32%|███▏      | 41/130 [2:09:40<4:22:59, 177.30s/it]

41


 32%|███▏      | 42/130 [2:12:40<4:21:13, 178.11s/it]

42


 33%|███▎      | 43/130 [2:15:10<4:06:10, 169.77s/it]

43


 34%|███▍      | 44/130 [2:17:46<3:57:31, 165.72s/it]

44


 35%|███▍      | 45/130 [2:20:57<4:05:19, 173.17s/it]

45


 35%|███▌      | 46/130 [2:23:36<3:56:24, 168.86s/it]

46


 36%|███▌      | 47/130 [2:26:12<3:48:23, 165.11s/it]

47


 37%|███▋      | 48/130 [2:30:01<4:11:44, 184.20s/it]

48


 38%|███▊      | 49/130 [2:32:37<3:57:17, 175.77s/it]

49


 38%|███▊      | 50/130 [2:35:39<3:56:59, 177.75s/it]

50


 39%|███▉      | 51/130 [2:38:29<3:50:52, 175.34s/it]

51


 40%|████      | 52/130 [2:41:32<3:50:50, 177.57s/it]

52


 41%|████      | 53/130 [2:44:21<3:44:48, 175.17s/it]

53


 42%|████▏     | 54/130 [2:47:07<3:38:21, 172.39s/it]

54


 42%|████▏     | 55/130 [2:51:03<3:59:28, 191.58s/it]

55


 43%|████▎     | 56/130 [2:55:25<4:22:09, 212.56s/it]

56


 44%|████▍     | 57/130 [2:59:30<4:30:24, 222.25s/it]

57


 45%|████▍     | 58/130 [3:04:28<4:54:09, 245.13s/it]

58


 45%|████▌     | 59/130 [3:08:41<4:52:35, 247.26s/it]

59


 46%|████▌     | 60/130 [3:13:20<4:59:32, 256.75s/it]

60


 47%|████▋     | 61/130 [3:19:19<5:30:50, 287.69s/it]

61


 48%|████▊     | 62/130 [3:22:58<5:02:33, 266.97s/it]

62


 48%|████▊     | 63/130 [3:27:11<4:53:33, 262.88s/it]

63


 49%|████▉     | 64/130 [3:30:31<4:28:21, 243.96s/it]

64


 50%|█████     | 65/130 [3:33:38<4:05:39, 226.75s/it]

65


 51%|█████     | 66/130 [3:37:33<4:04:39, 229.37s/it]

66


 52%|█████▏    | 67/130 [3:40:23<3:42:01, 211.45s/it]

67


 52%|█████▏    | 68/130 [3:43:11<3:25:03, 198.44s/it]

68


 53%|█████▎    | 69/130 [3:46:29<3:21:36, 198.30s/it]

69


 54%|█████▍    | 70/130 [3:50:06<3:23:51, 203.86s/it]

70


 55%|█████▍    | 71/130 [3:53:42<3:24:10, 207.64s/it]

71


 55%|█████▌    | 72/130 [3:57:01<3:18:09, 204.98s/it]

72


 56%|█████▌    | 73/130 [4:00:32<3:16:36, 206.95s/it]

73


 57%|█████▋    | 74/130 [4:05:01<3:30:22, 225.39s/it]

74


 58%|█████▊    | 75/130 [4:08:34<3:23:17, 221.77s/it]

75


 58%|█████▊    | 76/130 [4:11:36<3:08:45, 209.72s/it]

76


 59%|█████▉    | 77/130 [4:15:15<3:07:47, 212.59s/it]

77


 60%|██████    | 78/130 [4:19:51<3:20:37, 231.50s/it]

78


 61%|██████    | 79/130 [4:24:57<3:35:50, 253.94s/it]

79


 62%|██████▏   | 80/130 [4:27:56<3:12:48, 231.37s/it]

80


 62%|██████▏   | 81/130 [4:30:30<2:50:05, 208.27s/it]

81


 63%|██████▎   | 82/130 [4:33:26<2:38:45, 198.46s/it]

82


 64%|██████▍   | 83/130 [4:36:04<2:26:05, 186.49s/it]

83


 65%|██████▍   | 84/130 [4:38:45<2:17:09, 178.89s/it]

84


 65%|██████▌   | 85/130 [4:41:49<2:15:07, 180.16s/it]

85


 66%|██████▌   | 86/130 [4:44:34<2:08:49, 175.67s/it]

86


 67%|██████▋   | 87/130 [4:47:39<2:07:55, 178.51s/it]

87


 68%|██████▊   | 88/130 [4:50:03<1:57:42, 168.16s/it]

88


 68%|██████▊   | 89/130 [4:52:34<1:51:23, 163.02s/it]

89


 69%|██████▉   | 90/130 [4:56:16<2:00:32, 180.80s/it]

90


 70%|███████   | 91/130 [4:59:50<2:04:03, 190.85s/it]

91


 71%|███████   | 92/130 [5:02:55<1:59:41, 188.99s/it]

92


 72%|███████▏  | 93/130 [5:05:37<1:51:28, 180.76s/it]

93


 72%|███████▏  | 94/130 [5:08:24<1:46:01, 176.70s/it]

94


 73%|███████▎  | 95/130 [5:10:55<1:38:39, 169.14s/it]

95


 74%|███████▍  | 96/130 [5:13:24<1:32:17, 162.87s/it]

96


 75%|███████▍  | 97/130 [5:15:58<1:28:07, 160.22s/it]

97


 75%|███████▌  | 98/130 [5:18:30<1:24:06, 157.71s/it]

98


 76%|███████▌  | 99/130 [5:21:17<1:22:59, 160.63s/it]

99


 77%|███████▋  | 100/130 [5:23:52<1:19:26, 158.87s/it]

100


 78%|███████▊  | 101/130 [5:26:26<1:16:03, 157.38s/it]

101


 78%|███████▊  | 102/130 [5:29:01<1:13:05, 156.64s/it]

102


 79%|███████▉  | 103/130 [5:31:46<1:11:38, 159.19s/it]

103


 80%|████████  | 104/130 [5:34:32<1:09:56, 161.40s/it]

104


 81%|████████  | 105/130 [5:37:19<1:07:53, 162.95s/it]

105


 82%|████████▏ | 106/130 [5:39:54<1:04:15, 160.65s/it]

106


 82%|████████▏ | 107/130 [5:42:38<1:01:56, 161.59s/it]

107


 83%|████████▎ | 108/130 [5:45:13<58:30, 159.58s/it]  

108


 84%|████████▍ | 109/130 [5:48:15<58:14, 166.42s/it]

109


 85%|████████▍ | 110/130 [5:51:22<57:30, 172.54s/it]

110


 85%|████████▌ | 111/130 [5:54:25<55:36, 175.62s/it]

111


 86%|████████▌ | 112/130 [5:56:59<50:44, 169.14s/it]

112


 87%|████████▋ | 113/130 [5:59:54<48:24, 170.86s/it]

113


 88%|████████▊ | 114/130 [6:02:32<44:34, 167.19s/it]

114


 88%|████████▊ | 115/130 [6:05:06<40:46, 163.12s/it]

115


 89%|████████▉ | 116/130 [6:08:08<39:25, 168.94s/it]

116


 90%|█████████ | 117/130 [6:10:40<35:27, 163.67s/it]

117


 91%|█████████ | 118/130 [6:13:26<32:51, 164.29s/it]

118


 92%|█████████▏| 119/130 [6:16:02<29:42, 162.01s/it]

119


 92%|█████████▏| 120/130 [6:18:37<26:37, 159.79s/it]

120


 93%|█████████▎| 121/130 [6:21:27<24:26, 162.90s/it]

121


 94%|█████████▍| 122/130 [6:24:24<22:16, 167.01s/it]

122


 95%|█████████▍| 123/130 [6:27:26<20:00, 171.48s/it]

123


 95%|█████████▌| 124/130 [6:30:25<17:23, 173.88s/it]

124


 96%|█████████▌| 125/130 [6:33:38<14:58, 179.71s/it]

125


 97%|█████████▋| 126/130 [6:37:05<12:31, 187.78s/it]

126


 98%|█████████▊| 127/130 [6:40:21<09:30, 190.16s/it]

127


 98%|█████████▊| 128/130 [6:43:17<06:11, 185.88s/it]

128


 99%|█████████▉| 129/130 [6:46:14<03:03, 183.31s/it]

129


100%|██████████| 130/130 [6:49:05<00:00, 188.81s/it]


In [24]:
%time filter_result["TRIPID"][filter_result["LINEID" =="102"]]

KeyError: False