## Step 1: Connect to the database and read the values

In [356]:
import pandas as pd
from sqlalchemy import create_engine

# Connect to the local Postgres database that holds the shuttle data.
engine = create_engine('postgresql://postgres@localhost/shuttle_database')
conn = engine.connect()

# Read up to one million rows of shuttle_locations into a DataFrame; the
# bare expression on the last line displays it as the cell output.
input_df = pd.read_sql(
    sql="SELECT * FROM shuttle_locations LIMIT 1000000",
    con=conn,
)
input_df

Unnamed: 0,shuttle_id,tech_provider_id,shuttle_company_id,local_timestamp,location,cnn
0,252,2,2,2017-09-19 22:46:22,"(-122.3967933,37.7457141)",5988000.0
1,125,2,2,2017-09-19 22:46:22,"(-122.3963909,37.745213)",
2,115,2,1,2017-09-19 22:52:47,"(-122.3930867,37.7555999)",10336000.0
3,313,2,1,2017-09-19 22:52:47,"(-122.3935873,37.7558359)",
4,242,2,2,2017-09-19 22:58:07,"(-122.396788,37.7456755)",
5,100,2,2,2017-09-19 22:58:06,"(-122.3963553,37.745296)",
6,319,2,2,2017-09-19 22:58:05,"(-122.3966214,37.7456051)",5988000.0
7,21,2,1,2017-09-19 22:58:11,"(-122.3849934,37.7544446)",
8,113,4,5,2017-09-19 17:53:16,"(-122.423787,37.788707)",5823000.0
9,97,2,2,2017-09-19 23:02:52,"(-122.396892,37.745798)",5988000.0


## Step 2: Fill in the blanks for rows that do not have CNN information

In [357]:
import math
import functools
import datetime as dt

def start_time(series):
    """Return the smallest element of ``series``.

    The function used to be defined twice in this cell with identical
    bodies; the second definition silently replaced the first, so only one
    is kept.

    Args:
        series: A non-empty iterable of mutually comparable values.

    Returns:
        The minimum element. When two elements compare equal, the later
        one is returned.

    Raises:
        TypeError: If ``series`` is empty, because ``functools.reduce`` is
            called without an initial value.
    """
    return functools.reduce(lambda x, y: x if x < y else y, series)
        
def new_cnn(series):
    """Report whether a rolling window marks the start of a new CNN run.

    Written as the callback for ``rolling(2, min_periods=1).apply(...)``.

    Args:
        series: The window contents, holding one or two values. May be a
            list, a NumPy array, or a pandas Series with any index.

    Returns:
        True when the window holds fewer than two values, or when its
        first two values differ; a false value when they are equal.
    """
    # Materialise the window so indexing is positional. With a pandas
    # Series, ``series[0]`` is a *label* lookup and raises KeyError for any
    # window whose index does not contain the labels 0 and 1.
    window = list(series)
    if len(window) < 2:
        return True
    return window[0] != window[1]
    
def flatten_aggregations(df):
    """Collapse tuple column labels into single space-separated strings.

    Each column label is joined with a space and stripped of surrounding
    whitespace, e.g. ``('ts', 'min')`` becomes ``'ts min'``.

    Args:
        df: DataFrame whose column labels are tuples of strings. It is
            modified in place.

    Returns:
        The same DataFrame object, with its columns renamed.
    """
    flat_labels = []
    for label_parts in df.columns.values:
        flat_labels.append(' '.join(label_parts).strip())
    df.columns = flat_labels
    return df

def create_cnn_events(df):
    """Number the rows of ``df`` by runs of the ``cnn`` column.

    A two-row rolling window (at least one observation required) over
    ``df['cnn']`` is passed to ``new_cnn``; the per-row results are then
    cumulatively summed, so the number goes up each time ``new_cnn``
    reports a change.

    Args:
        df: DataFrame with a ``cnn`` column.

    Returns:
        The cumulative sum of the ``new_cnn`` results, one value per row.
    """
    cnn_windows = df['cnn'].rolling(2, min_periods=1)
    change_flags = cnn_windows.apply(new_cnn)
    return change_flags.cumsum()

def prep_df_for_summary(df):
    """Summarise each shuttle's location rows into consecutive CNN events.

    For every shuttle, rows are ordered by ``local_timestamp``, missing
    ``cnn`` values are forward-filled from the preceding row, rows are
    numbered into events by ``create_cnn_events``, and each event is
    collapsed into one summary row.

    Args:
        df: DataFrame with at least ``shuttle_id``, ``local_timestamp`` and
            ``cnn`` columns. Assumes ``local_timestamp`` is datetime64 with
            nanosecond resolution, since it is round-tripped through int64
            nanoseconds — TODO confirm against the query result.

    Returns:
        dict mapping each shuttle_id to a DataFrame with the columns
        ``shuttle_id``, ``cnn``, ``start_time``, ``end_time`` and
        ``num_points``, one row per CNN event.
    """
    def to_summary_format(aggregated):
        # Pick the flattened aggregate columns and give them the names
        # used by the summary table.
        return pd.concat([
            aggregated['shuttle_id first'],
            aggregated['cnn first'],
            aggregated['ts min'],
            aggregated['ts max'],
            aggregated['ts count']],
            axis=1,
            keys=['shuttle_id', 'cnn', 'start_time', 'end_time', 'num_points'])

    shuttle_map = {}
    # Group by the bare column name rather than a one-element list so the
    # group keys are scalar shuttle ids on every pandas version.
    for shuttle_id, shuttle_rows in df.groupby('shuttle_id'):
        sorted_df = shuttle_rows.sort_values('local_timestamp')
        # Assign the filled column back instead of a chained in-place
        # fillna, which is not guaranteed to modify sorted_df.
        sorted_df['cnn'] = sorted_df['cnn'].ffill()
        sorted_df['cnn_event'] = create_cnn_events(sorted_df)
        # Integer nanoseconds so min/max/count can be aggregated together.
        sorted_df['ts'] = sorted_df['local_timestamp'].astype('int64')

        cnn_aggregated_df = sorted_df.groupby('cnn_event').agg(
            {'shuttle_id': 'first', 'cnn': 'first', 'ts': ['min', 'max', 'count']})
        cnn_aggregated_df = flatten_aggregations(cnn_aggregated_df)

        summary_facts_df = to_summary_format(cnn_aggregated_df)
        # The module is imported as ``pd``; the name ``pandas`` is unbound.
        summary_facts_df['start_time'] = pd.to_datetime(summary_facts_df['start_time'], unit='ns')
        summary_facts_df['end_time'] = pd.to_datetime(summary_facts_df['end_time'], unit='ns')
        shuttle_map[shuttle_id] = summary_facts_df

    return shuttle_map
# Build the per-shuttle summary frames from the raw location rows; the
# returned dict is keyed by the shuttle_id group name.
result = prep_df_for_summary(input_df)


## Step 3: Insert into the shuttle_summary_facts table

In [358]:
# Append every shuttle's summary frame to the shuttle_summary_facts table.
# NOTE: if_exists='append' means re-running this cell inserts the rows again.
for summary_df in result.values():
    summary_df.to_sql(
        name='shuttle_summary_facts',
        con=conn,
        if_exists='append',
        index=False,
        chunksize=10000,
    )