# Create a PostgreSQL database -- Jupyter Notebook version

*Converted from add_warnigns_with_status_time_open_to_db.py*

Create a driver_schedule table and, for each month and each route, add all records to that single table.

We may want to sort the records before adding them to the database table.

First, we need to know if it is safe to use vehicle_assignment_id as the primary key for driver schedule records, so we test for uniqueness across all data files: for each VehiclesThatRanRoute file across all routes and months, read vehicle_assignment_id values into an array, count the unique array entries and compare for equality with the array length.

In [1]:
import numpy as np
from os import path, listdir
import pandas as pd
from sqlalchemy import create_engine
import datetime

In [2]:
def preprocess_warning_name(elem):
  """Reduce a raw warning cell to a recognised warning name.

  Everything from ' - StatusTimeOpen:' onward is discarded. The remaining
  text is returned only when it is listed in the module-level
  ``warning_name_list``; any other message yields ``None``.

  Non-string cells (for example the NaN that pandas produces for an empty
  spreadsheet cell) have no ``split`` method, so they are treated as "not a
  warning" and also yield ``None`` instead of raising AttributeError.
  """
  if not isinstance(elem, str):
    return None
  warning_name = elem.split(' - StatusTimeOpen:')[0]
  return warning_name if warning_name in warning_name_list else None

def preprocess_bus_number(elem):
  """Return the last whitespace-separated token of ``elem``.

  Raises IndexError when ``elem`` contains no tokens at all.
  """
  tokens = elem.split()
  return tokens[-1]

# Directory layout. Every entry directly under the warnings folder is assumed
# to be a warning spreadsheet.
project_root_dir = r'\\vntscex.local\DFS\3BC-Share$_Mobileye_Data\Data\Data Integration'
data_root_dir = path.join(project_root_dir, 'warnings')

# Collects one DataFrame per spreadsheet that is read.
warning_data = []

# Warning names that preprocess_warning_name keeps; every other message is
# mapped to None.
warning_name_list = [
  'ME - Pedestrian Collision Warning',
  'ME - Pedestrian In Range Warning',
  'PCW-LF',
  'PCW-LR',
  'PCW-RR',
  'PDZ - Left Front',
  'PDZ-LR',
  'PDZ-R',
  'Safety - Braking - Aggressive',
  'Safety - Braking - Dangerous',
]

In [3]:
# Read every warning spreadsheet in data_root_dir, keep only the recognised
# warning records, and append one cleaned DataFrame per file to warning_data.
StartTime = datetime.datetime.now()

# The one report that is deliberately skipped. It is matched by exact file
# name: a substring test (`x not in '<name>'`) would also skip any file whose
# name happens to be contained in this string.
bad_file_name = 'DASH_Report_2018_04_16_to_2018_04_20.xlsx'

allfiles = listdir(data_root_dir)
usefiles = [x for x in allfiles if x != bad_file_name]

for file_name in usefiles:
    file_path = path.join(data_root_dir, file_name)
    print(file_name)

    # Skip the first 8 rows of the sheet and read six columns by position
    # (0, 1, 3, 4, 5, 6), naming them as listed below; column 0 is parsed as
    # a datetime.
    # NOTE(review): an earlier comment described columns 0, 2, 7, 9, 11, 12
    # and a 7-row Ituran header, which does not match these arguments --
    # confirm against the current report layout.
    df = pd.read_excel(
        file_path,
        skiprows=[0, 1, 2, 3, 4, 5, 6, 7],
        usecols=[0, 1, 3, 4, 5, 6],
        names=['loc_time', 'bus_number', 'address', 'warning_name',
               'latitude', 'longitude'],
        header=None,
        parse_dates=[0])

    # Strip the StatusTimeOpen suffix from warning messages and null out
    # every message that is not a recognised warning.
    # Whole-column assignment (df[col] = ...) is used rather than
    # df.loc[:, col] = ..., because the latter writes into the existing
    # column on recent pandas versions and can keep the old object dtype,
    # silently discarding the uint32 cast below.
    df['warning_name'] = df['warning_name'].apply(preprocess_warning_name)

    df['bus_number'] = df['bus_number'].apply(
        preprocess_bus_number).astype(np.uint32)

    # Discard the records whose message was nulled out above.
    df.dropna(subset=['warning_name'], inplace=True)

    # Report the cumulative elapsed time after each file.
    EndTime = datetime.datetime.now()
    print(EndTime - StartTime)
    warning_data.append(df)

DASH_Report_2018_06_01_to_2018_06_05.xlsx
0:00:24.281876
DASH_Report_2018_06_06_to_2018_06_10.xlsx
0:00:52.018358
DASH_Report_2018_06_11_to_2018_06_14.xlsx
0:01:24.223020
DASH_Report_2018_06_15_to_2018_06_19.xlsx
0:01:50.595656
DASH_Report_2018_06_20_to_2018_06_24.xlsx
0:02:20.681409
DASH_Report_2018_06_25_to_2018_06_29.xlsx
0:02:57.764290
DASH_Report_2018_06_30.xlsx
0:02:59.474831


In [4]:
# Stack the per-file frames into a single DataFrame with a fresh index.
warning_data = pd.concat(
    warning_data, verify_integrity=True, ignore_index=True)

print('init warning_data:\n{}'.format(warning_data.describe()))

# Remove rows that are exact duplicates of an earlier row.
# NOTE(review): presumably duplicates come from overlapping report files --
# confirm.
warning_data = warning_data.drop_duplicates()

print('de-duplicated warning_data:\n{}'.format(warning_data.describe()))
print('\n{}'.format(warning_data.head()))



          bus_number       latitude      longitude
count  783235.000000  783235.000000  783235.000000
mean    14253.054352      34.046502    -118.254150
std      2033.392978       0.008839       0.010047
min      6303.000000      34.018186    -118.291708
25%     12311.000000      34.043236    -118.259266
50%     13327.000000      34.047960    -118.254770
75%     15341.000000      34.052240    -118.249606
max     17312.000000      34.067246    -118.231073
          bus_number       latitude      longitude
count  783232.000000  783232.000000  783232.000000
mean    14253.059203      34.046502    -118.254150
std      2033.395145       0.008839       0.010047
min      6303.000000      34.018186    -118.291708
25%     12311.000000      34.043236    -118.259266
50%     13327.000000      34.047960    -118.254770
75%     15341.000000      34.052240    -118.249606
max     17312.000000      34.067246    -118.231073

             loc_time  bus_number  \
0 2018-06-01 06:09:01       12301   
1 2018-

We temporarily also drop records with missing values to prove our concept.
Key attributes that require values include 1) __, 2) route_id,
3) vehicle_id, 4) arrived_at, 5) departed_at, and 6) stop_time_id. For now,
we exclude the stop_id because many relevant records have missing stop_ids.

TODO: Infer missing values where possible using warning and route data

In [5]:
# Disabled for now: dropping records that lack values in key columns.
# key_column_names = ['route_id', 'vehicle_id', 'arrived_at', 'departed_at']
# warning_data.dropna(subset=key_column_names, inplace=True)

# No assumption is made about the order in which the source files were read,
# so order the records by time and bus number, then renumber the index from
# zero.
warning_data = warning_data.sort_values(by=['loc_time', 'bus_number'])
warning_data = warning_data.reset_index(drop=True)

# Disabled: optional export of the processed records to Excel in chunks.
# NOTE(review): before re-enabling, `ceil` must be imported and the slice end
# should use min(...), not max(...); with max every chunk but the last runs
# to the end of the data.
# excel_writer = pd.ExcelWriter('processed_warnings.xlsx')
# chunk_size = pow(2, 20) - 1
# idx_limit = warning_data.shape[0]
# for i in range(int(ceil(idx_limit / chunk_size))):
#   chunk = warning_data.iloc[i * chunk_size:max((i + 1) * chunk_size, idx_limit)]
#   print('{}_th chunk_data:\n{}\n{}\n'.format(i, chunk.describe(), chunk.head()))
#   chunk.to_excel(excel_writer, 'warnings_{}'.format(i), index=False)
# excel_writer.save()

# Alternative kept for reference: a database file in the working directory.
# db_path = 'sqlite:///ituran_synchromatics_data_DanTest.sqlite'
db_path = 'sqlite:///{}'.format(
    path.join(project_root_dir, 'ituran_synchromatics_data_DanTest.sqlite'))
db = create_engine(db_path)

# Write all records to the 'warning' table, replacing the table if it already
# exists. Rows are sent in batches of at most one million because poor
# performance has been observed with larger batches.
warning_data.to_sql(
    name='warning', con=db, index=False, chunksize=1000000,
    if_exists='replace')
