# Descriptive analysis of input data
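1. Overall coverage: number of stops, number of devices (users), number of active days, and the time span.
2. Per-user statistics: number of active days, number of stops per active day, and total stop duration per active day.
3. Home locations: number of detected homes per zone compared with the zone population.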

In [1]:
# Reload imported modules automatically before each cell is executed, so edits
# to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): hard-coded, machine-specific working directory. Relative paths
# used later in this notebook (e.g. 'results/...') resolve against it — adjust
# when running on another machine.
%cd D:\mad4abm

D:\mad4abm


In [42]:
# Load libs
import pandas as pd
import geopandas as gpd
import sqlalchemy
import numpy as np
from scipy import stats
from tqdm import tqdm
from lib import preprocess as preprocess

In [3]:
# Data location
# Connection settings are read from the project's key store. The four names
# below stay at notebook level because a later cell passes them on to
# preprocess.dump2db_df.
user = preprocess.keys_manager['database']['user']
password = preprocess.keys_manager['database']['password']
port = preprocess.keys_manager['database']['port']
db_name = preprocess.keys_manager['database']['name']

# Build the URL from its components rather than formatting a string: characters
# such as '@', ':', '/' or '%' in the user name or password would otherwise be
# parsed as URL syntax and corrupt the connection string. URL.create expects the
# raw (unescaped) credentials and requires SQLAlchemy 1.4 or newer.
engine = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        drivername='postgresql',
        username=user,
        password=password,
        host='localhost',
        port=port,
        database=db_name,
    )
)

## 1. Load data

In [4]:
# Read the complete stops table into memory and preview the first rows.
df = pd.read_sql_query("""SELECT * FROM vgr_stops_2019;""", engine)
df.head(3)

Unnamed: 0,sid,geom,device_uid,x,y,arrive_id,arrive_time,arrive_type,depart_id,depart_time,depart_type,duration,data_points,deso_code
0,15153607,0101000020BE0B00006DF1953B330B254170B279971A26...,9bd75715-496e-45f1-ae40-7393ce2a378a,689562,6592618,3394,2019-09-15 13:22:54+02,stop,3397,2019-09-15 13:30:33+02,start,459.0,4,0187C1060
1,15153608,0101000020BE0B000006F86AE8340B254109031A891926...,9bd75715-496e-45f1-ae40-7393ce2a378a,689562,6592614,3399,2019-09-15 13:30:45+02,stop,3400,2019-09-15 13:31:17+02,start,32.0,2,0187C1060
2,15153609,0101000020BE0B000097BDFBDB2E0B25413D9139771826...,9bd75715-496e-45f1-ae40-7393ce2a378a,689559,6592610,3401,2019-09-15 13:31:22+02,stop,3402,2019-09-15 13:31:48+02,start,26.0,2,0187C1060


In [5]:
# Report the number of distinct devices and the number of stop records.
print(
    "Data cover %s devices of %s stop points."
    % (df['device_uid'].nunique(), df.shape[0])
)

Data cover 66487 devices of 7532329 stop points.


In [8]:
# Calendar date of each stop: the part of the `arrive_time` string before the
# first space. The vectorised string accessor replaces a per-row Python lambda,
# which is slow on millions of rows; the resulting values are the same.
df['date'] = df['arrive_time'].str.split(' ', n=1).str[0]
print("Data have %s active days from %s to %s."%(df['date'].nunique(), df['date'].min(), df['date'].max()))

Data have 91 active days from 2019-09-01 to 2019-11-30.


## 2. User statistics

In [10]:
# Sample subset: all stops of the device that appears in the row labelled 0.
df_test = df[df['device_uid'].eq(df['device_uid'][0])]
df_test.head()

Unnamed: 0,sid,geom,device_uid,x,y,arrive_id,arrive_time,arrive_type,depart_id,depart_time,depart_type,duration,data_points,deso_code,date
0,15153607,0101000020BE0B00006DF1953B330B254170B279971A26...,9bd75715-496e-45f1-ae40-7393ce2a378a,689562,6592618,3394,2019-09-15 13:22:54+02,stop,3397,2019-09-15 13:30:33+02,start,459.0,4,0187C1060,2019-09-15
1,15153608,0101000020BE0B000006F86AE8340B254109031A891926...,9bd75715-496e-45f1-ae40-7393ce2a378a,689562,6592614,3399,2019-09-15 13:30:45+02,stop,3400,2019-09-15 13:31:17+02,start,32.0,2,0187C1060,2019-09-15
2,15153609,0101000020BE0B000097BDFBDB2E0B25413D9139771826...,9bd75715-496e-45f1-ae40-7393ce2a378a,689559,6592610,3401,2019-09-15 13:31:22+02,stop,3402,2019-09-15 13:31:48+02,start,26.0,2,0187C1060,2019-09-15
3,15153610,0101000020BE0B0000D1532C02290B25415AEF72251C26...,9bd75715-496e-45f1-ae40-7393ce2a378a,689557,6592625,3403,2019-09-15 13:31:51+02,stop,3422,2019-09-15 14:08:02+02,start,2171.0,20,0187C1060,2019-09-15
4,15153611,0101000020BE0B0000C7B98C184F0B25414369C3072326...,9bd75715-496e-45f1-ae40-7393ce2a378a,689576,6592652,3423,2019-09-15 14:08:06+02,stop,3429,2019-09-15 14:10:00+02,start,114.0,7,0187C1060,2019-09-15


In [24]:
def user_stats(data):
    """Summarise a set of stop records into per-user activity statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Records to summarise. Must provide a 'date' column (the active-day
        key) and a 'duration' column, which is divided by 60 to report minutes
        and by 60*60 to report hours (i.e. it is treated as seconds).

    Returns
    -------
    pandas.Series
        num_days      -- number of distinct dates (active days)
        num_stays     -- number of records (stays)
        num_stays_act -- median number of stays per active day
        dur_total_act -- median, over active days, of the summed duration (h)
        dur_na_share  -- share of stays whose duration is missing
        dur_median    -- median duration of a stay (min)
    """
    dates = data.loc[:, 'date']
    durations = data['duration']

    # Total duration per active day in hours. The sum skips missing values, so
    # a day on which every duration is missing contributes a total of 0.
    hours_per_day = data.groupby('date')['duration'].sum() / 60 / 60

    # Stays without a duration (the "isolated" stays).
    n_missing = int(durations.isna().sum())

    summary = {
        'num_days': dates.nunique(),
        'num_stays': len(data),
        'num_stays_act': dates.value_counts().median(),
        'dur_total_act': np.median(hours_per_day),
        'dur_na_share': n_missing / len(data),
        'dur_median': durations.median() / 60,
    }
    return pd.Series(summary)
# Register tqdm's `progress_apply` on pandas objects, then compute the
# statistics once per device with a progress bar.
tqdm.pandas()
df_user = (
    df.groupby('device_uid')
    .progress_apply(user_stats)
    .reset_index()
)

100%|██████████| 66487/66487 [02:28<00:00, 447.12it/s]


In [25]:
# Distribution (count, mean, std, min, quartiles, max) of each per-device statistic.
df_user.describe()

Unnamed: 0,num_days,num_stays,num_stays_act,dur_total_act,dur_na_share,dur_median
count,66487.0,66487.0,66487.0,66487.0,66487.0,65040.0
mean,14.348083,113.290252,5.018176,2.095566,0.210973,27.599589
std,17.58446,228.665485,7.511723,4.079162,0.235394,137.342562
min,1.0,1.0,1.0,0.0,0.0,0.016667
25%,2.0,6.0,2.0,0.158611,0.038462,1.533333
50%,7.0,29.0,3.0,0.7025,0.137931,7.395833
75%,20.0,117.0,6.0,2.157153,0.285714,28.8
max,91.0,5717.0,271.0,268.994167,1.0,16139.65


In [26]:
# Store the per-device statistics through the project helper.
# NOTE(review): presumably this writes to table 'vgr_stops_2019' inside schema
# 'description' — confirm against lib.preprocess.dump2db_df.
preprocess.dump2db_df(
    df_user, user, password, port, db_name,
    table_name='vgr_stops_2019',
    schema_name='description',
)

## 3. Home locations

In [27]:
# Read the complete homes table into memory and preview the first rows.
df_home = pd.read_sql_query("""SELECT * FROM vgr_homes_2019;""", engine)
df_home.head(3)

Unnamed: 0,sid,geom,device_uid,x,y,duration,count_days,count_stops,deso_code
0,4,0101000020BE0B0000D20BD31991BF1341F74D76F3D371...,0002541e-bcd8-448a-91a4-7ca1e1022f29,323556,6408016,152501,24,58,1480C3730
1,7,0101000020BE0B0000C6989540D8321741B2BC05B30E6B...,00044851-1774-4c16-bef4-e7968dd3e6e5,380086,6401083,25134,3,5,1490C1420
2,9,0101000020BE0B00008100201524C41841CAD7F8293454...,0004804a-e342-4b82-9270-62bb5c51be91,405769,6377681,0,5,7,1452A0020


In [28]:
# Report the number of rows in the homes table.
print("Homes cover %s devices." % (df_home.shape[0],))

Homes cover 27483 devices.


In [39]:
# Number of homes per `deso_code`, as a frame with columns ['deso', 'count'].
# The key and value names are set explicitly rather than renaming the column
# that reset_index() happens to create: before pandas 2.0 that column is called
# 'index', from 2.0 on it is 'deso_code', so renaming 'index' silently does
# nothing there and the later merge on 'deso' fails.
df_home_desc = (
    df_home['deso_code']
    .value_counts()
    .rename_axis('deso')
    .rename('count')
    .reset_index()
)

### 3.1 Population representativeness

In [41]:
# Load the zones and keep only the zone key, 'befolkning' and the geometry.
gdf = gpd.GeoDataFrame.from_postgis("""SELECT * FROM zones;""", engine)[['deso', 'befolkning', 'geom']]
# Default (inner) merge: zones without any home in df_home_desc are dropped.
gdf_home_desc = gdf.merge(df_home_desc, on='deso')

In [47]:
# Write the merged zones (geometry, 'befolkning' and home 'count') to disk;
# the path is relative to the current working directory.
gdf_home_desc.to_file('results/zones_vgr_homes_2019.shp')

In [45]:
# Kendall rank correlation between 'befolkning' (presumably the zone
# population — confirm against the zones table) and the number of homes
# found in each zone.
stats.kendalltau(gdf_home_desc['befolkning'], gdf_home_desc['count'])

KendalltauResult(correlation=0.34531041460408923, pvalue=3.463599678593136e-73)