## Step 1: Connect to the database and read the values

In [None]:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('postgresql://postgres@localhost/shuttle_database')

conn = engine.connect()
input_df = pd.read_sql("SELECT * FROM shuttle_locations WHERE local_timestamp > '2017-09-15' AND local_timestamp < '2017-09-20'", conn)
input_df

Unnamed: 0,shuttle_id,tech_provider_id,shuttle_company_id,local_timestamp,location,cnn
0,252,2,2,2017-09-19 22:46:22,"(-122.3967933,37.7457141)",5988000.0
1,125,2,2,2017-09-19 22:46:22,"(-122.3963909,37.745213)",
2,115,2,1,2017-09-19 22:52:47,"(-122.3930867,37.7555999)",10336000.0
3,313,2,1,2017-09-19 22:52:47,"(-122.3935873,37.7558359)",
4,242,2,2,2017-09-19 22:58:07,"(-122.396788,37.7456755)",
5,100,2,2,2017-09-19 22:58:06,"(-122.3963553,37.745296)",
6,319,2,2,2017-09-19 22:58:05,"(-122.3966214,37.7456051)",5988000.0
7,21,2,1,2017-09-19 22:58:11,"(-122.3849934,37.7544446)",
8,113,4,5,2017-09-19 17:53:16,"(-122.423787,37.788707)",5823000.0
9,97,2,2,2017-09-19 23:02:52,"(-122.396892,37.745798)",5988000.0


## Step 2: fill in the blanks on data that does not have CNN information

In [34]:
import math
import functools
import datetime as dt

def start_time(series):
    return functools.reduce(lambda x, y: x if x < y else y, series)
 
def start_time(series):
    return functools.reduce(lambda x, y: x if x < y else y, series)
        
def new_cnn(series):
    if len(series)<2:
        return True
    else:
        return series[0] != series[1]
    
def flatten_aggregations(df):
    df.columns = [' '.join(col).strip() for col in df.columns.values]
    return df

def create_cnn_events(df):
    return df['cnn'].rolling(2, min_periods=1).apply(new_cnn).cumsum()

def to_summary_format(df):
   return pd.concat([
        df['shuttle_id first'],
        df['cnn first'],
        df['ts min'],
        df['ts max'],
        df['ts count']] ,
        axis=1, 
        keys=['shuttle_id','cnn','start_time', 'end_time','num_points']) 

def aggregate_by_cnn_event(df):
        df['ts'] = df['local_timestamp'].astype('int64')
        df = df.groupby(['cnn_event'])
        df_agg = df.agg({'shuttle_id': 'first', 'cnn': 'first', 'ts': ['min','max','count']})
        
        df_flat = flatten_aggregations(df_agg)        

        res_df = to_summary_format(df_flat)

        res_df['start_time'] = pd.to_datetime(res_df['start_time'], unit='ns')
        res_df['end_time'] = pd.to_datetime(res_df['end_time'], unit='ns')
        return res_df
    
def prep_df_for_summary(df):


    dfs = df.groupby(['shuttle_id'])
    shuttle_map = {}
    for name, grouped in dfs:
        sorted_df = grouped.sort_values('local_timestamp')
        sorted_df['cnn'].fillna(method='ffill',inplace=True)
        print(sorted_df)
        sorted_df['cnn_event'] = create_cnn_events(sorted_df)        
        shuttle_map[name] = aggregate_by_cnn_event(sorted_df)

    return shuttle_map
result = prep_df_for_summary(input_df)
result[8]

     shuttle_id  tech_provider_id  shuttle_company_id     local_timestamp  \
295           1                 1                   1 2017-09-19 22:30:19   
602           1                 1                   1 2017-09-19 22:33:55   
514           1                 1                   1 2017-09-19 22:36:37   
763           1                 1                   1 2017-09-19 22:44:07   
769           1                 1                   1 2017-09-19 22:44:13   
381           1                 1                   1 2017-09-19 22:47:49   
638           1                 1                   1 2017-09-19 23:21:13   
215           1                 1                   1 2017-09-19 23:21:31   
340           1                 1                   1 2017-09-19 23:23:25   
563           1                 1                   1 2017-09-19 23:27:01   
570           1                 1                   1 2017-09-19 23:27:07   
139           1                 1                   1 2017-09-19 23:27:19   

     shuttle_id  tech_provider_id  shuttle_company_id     local_timestamp  \
684          66                 2                   6 2017-09-19 20:53:21   
454          66                 2                   6 2017-09-19 20:53:30   
65           66                 2                   6 2017-09-19 20:53:57   
703          66                 2                   6 2017-09-19 21:00:25   
472          66                 2                   6 2017-09-19 21:06:16   
477          66                 2                   6 2017-09-19 21:06:28   
943          66                 2                   6 2017-09-19 21:18:02   
276          66                 2                   6 2017-09-19 21:18:27   
309          66                 2                   6 2017-09-19 21:18:56   
314          66                 2                   6 2017-09-19 21:19:08   
493          66                 2                   6 2017-09-19 21:25:27   
758          66                 2                   6 2017-09-19 21:25:57   

86   (-122.3987554,37.7159955)  377000.0  
     shuttle_id  tech_provider_id  shuttle_company_id     local_timestamp  \
490         145                 2                   6 2017-09-19 21:25:23   
753         145                 2                   6 2017-09-19 21:25:53   
322         145                 2                   6 2017-09-19 21:26:44   
548         145                 2                   6 2017-09-19 21:33:09   
429         145                 2                   6 2017-09-19 22:06:27   
432         145                 2                   6 2017-09-19 22:06:30   
581         145                 2                   6 2017-09-19 22:10:13   
150         145                 2                   6 2017-09-19 22:10:36   
899         145                 2                   6 2017-09-19 22:20:43   
94          145                 2                   6 2017-09-19 22:21:48   
102         145                 2                   6 2017-09-19 22:22:01   
828         145                 2

Unnamed: 0_level_0,shuttle_id,cnn,start_time,end_time,num_points
cnn_event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


## Step 4: Insert into shuttle_summary_facts Table

In [358]:
for a in result.values():
    a.to_sql(name='shuttle_summary_facts', index=False, if_exists='append', chunksize=10000, con=conn)