# Summary of multi-region results

Summarize the geotagged tweets and the model-generated visits for multiple regions.

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import numpy as np
import pandas as pd
import yaml
from scipy import stats

def load_region_tweets(region=None):
    """Count the distinct users and the geotagged tweets stored for one region.

    Reads ``../../dbs/{region}/geotweets.csv`` (relative to the working directory).

    :param region: name of the region folder under ``../../dbs``
    :return: tuple ``(region, number of distinct userid values, number of rows)``
    """
    tweets = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    distinct_users = pd.unique(tweets['userid'])
    return region, len(distinct_users), tweets.shape[0]


def user_proc(data):
    """Collapse the rows of one user into a single record of totals.

    :param data: DataFrame with the columns ``pkt``, ``pkt_inland``, ``num_trip``
        and ``num_trip_inland``
    :return: pd.Series holding the column sums (the trip counts are renamed to
        ``num_trips`` / ``num_trips_inland``) plus ``days``, the number of rows
        whose ``pkt`` is not zero
    """
    summed = {
        target: data[source].sum()
        for target, source in (('pkt', 'pkt'),
                               ('pkt_inland', 'pkt_inland'),
                               ('num_trips', 'num_trip'),
                               ('num_trips_inland', 'num_trip_inland'))
    }
    # Count the rows with a non-zero pkt value.
    summed['days'] = int((data['pkt'] != 0).sum())
    return pd.Series(summed)


def quantify_visits(region=None, runid=3):
    """Total the generated-visit statistics of one region for one model run.

    Reads ``../../dbs/{region}/visits/visits_{runid}_stats.csv``, aggregates it
    per ``userid`` with ``user_proc`` and then sums over all users.

    :param region: name of the region folder under ``../../dbs``
    :param runid: run identifier used in the statistics file name
    :return: tuple ``(region, pkt, pkt_inland, num_trips, num_trips_inland, days)``
    """
    stats_path = f'../../dbs/{region}/visits/visits_{runid}_stats.csv'
    per_user = pd.read_csv(stats_path).groupby('userid').apply(user_proc).reset_index()
    totals = [per_user[name].sum()
              for name in ('pkt', 'pkt_inland', 'num_trips', 'num_trips_inland', 'days')]
    return (region, *totals)

def get_recent_records_gt(data):
    """Keep only the records that belong to the most recent year in ``data``.

    :param data: DataFrame with at least the columns ``Year``, ``Unit`` and ``Value``
    :return: DataFrame with the columns ``Year``, ``Unit`` and ``Value`` restricted
        to the rows of the latest year, re-indexed from 0
    """
    # Series.max() skips NaN; the builtin max() returns NaN whenever the first
    # year is NaN, which would make the equality filter below match no row.
    latest_year = data['Year'].max()
    return data.loc[data['Year'] == latest_year, ['Year', 'Unit', 'Value']].reset_index(drop=True)

# Regions shared by both lists.
region_list2 = [
    'australia', 'austria', 'barcelona', 'capetown', 'cebu',
    'egypt', 'guadalajara', 'jakarta', 'johannesburg', 'kualalumpur',
    'lagos', 'madrid', 'manila', 'mexicocity', 'moscow',
    'nairobi', 'rio', 'saudiarabia', 'stpertersburg', 'surabaya',
]

# Full list: three additional regions followed by the shared ones.
region_list = ['sweden', 'netherlands', 'saopaulo'] + region_list2

# Load the region metadata into `region_manager`. Later cells index it by region
# key and read the fields 'gdp_capita', 'name', 'country' and 'pop_country'.
with open('../../lib/regions.yaml', encoding='utf8') as f:
    # NOTE(review): presumably regions.yaml is a trusted project file containing
    # plain data only; if so, yaml.safe_load would be the stricter choice - confirm.
    region_manager = yaml.load(f, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: No. of geotagged tweets, No. of Twitter users.

Generated visits: No. of trips, PKT

The total number of days is 260; therefore, the results need to be scaled up by a factor of 365/260 to
represent all the weekday trips in a year.

In [26]:
# One row per region: the (region, num_users, num_geotweets) tuple returned by
# load_region_tweets for every entry of region_list.
df = pd.DataFrame(
    list(map(load_region_tweets, region_list)),
    columns=('region', 'num_users', 'num_geotweets'),
)

## 2 Explore the visits generated from the model

In [27]:
runid = 5  # selects which visits_{runid}_stats.csv files are summarised

# Per-region visit totals joined onto the per-region tweet summary.
df_v = pd.merge(
    df,
    pd.DataFrame(
        [quantify_visits(region=name, runid=runid) for name in region_list],
        columns=('region', 'pkt', 'pkt_inland', 'num_trips', 'num_trips_inland', 'days'),
    ),
    on='region',
    how='inner',
)

# Yearly figures: total / 10e3 / days * 365 * num_users, where 'days' is the
# summed count of non-zero-pkt rows over all users of the region.
# NOTE(review): 10e3 equals 10,000 (not 1,000) - confirm this is the intended unit.
df_v = df_v.assign(**{
    f'{name}_yr': df_v[name] / 10e3 / df_v['days'] * 365 * df_v['num_users']
    for name in ('pkt', 'num_trips', 'pkt_inland', 'num_trips_inland')
})

## 3 Combine different dimensions

In [28]:
# Attach region-level attributes looked up in region_manager by the region key.
# Each pair is (new column name, field name in region_manager).
df_v = df_v.assign(**{
    column: df_v['region'].apply(lambda key: region_manager[key][field])
    for column, field in (('gdp_capita', 'gdp_capita'),
                          ('region_name', 'name'),
                          ('country', 'country'),
                          ('pop_country', 'pop_country'))
})

In [29]:
# Per-user yearly figures: divide the yearly totals by the number of Twitter users.
df_v = df_v.assign(
    pkt_yr_capita=df_v['pkt_yr'] / df_v['num_users'],
    pkt_inland_yr_capita=df_v['pkt_inland_yr'] / df_v['num_users'],
)

## 4 Compare with the 'ground truth' of PKT

In [30]:
# Reference table: keep the 'Total inland passenger transport' rows and, for every
# country, only the records of its most recent year (see get_recent_records_gt).
df_p = (
    pd.read_csv('../../dbs/T000_input.txt')
    .loc[lambda frame: frame['Variable'] == 'Total inland passenger transport']
    .groupby('Country')
    .apply(get_recent_records_gt)
    .reset_index()
    # 'level_1' is the inner index level created by the per-group result.
    .drop(columns=['level_1'])
)

In [31]:
# Join the model results with the reference table on the country name; rows that
# exist only in the reference table (no 'region') are dropped again.
df_n = pd.merge(df_v, df_p, left_on='country', right_on='Country', how='outer').dropna(subset=['region'])

# Reference value per capita. Vectorized division already yields NaN wherever
# 'Value' is NaN, so no row-wise apply with an explicit NaN check is needed.
# Original note on the unit: million / capita / yr.
# NOTE(review): 10e3 equals 10,000 (not 1,000) - confirm this is the intended unit.
df_n['pkt_inland_yr_capita_gt'] = df_n['Value'] / df_n['pop_country'] / 10e3

df_n = df_n.drop(columns=['Country', 'Unit', 'Value'])

In [34]:
# Write the combined per-region table for this run. The 'utf-8-sig' encoding
# prefixes the file with a byte-order mark.
df_n.to_csv(
    path_or_buf=f'../../results/multi-region_stats_rid_{runid}.csv',
    index=False,
    encoding='utf-8-sig',
)

## 5 Correlation between gdp_capita and pkt_yr_capita

In [32]:
def _print_gdp_correlation(label, frame, column):
    """Print ``label`` followed by the Pearson correlation of ``column`` with ``gdp_capita``."""
    print(label)
    print(stats.pearsonr(frame.loc[:, column], frame.loc[:, 'gdp_capita']))


_print_gdp_correlation('Total:', df_n, 'pkt_yr_capita')

# Same correlation with the 'australia' row excluded.
df_n_outlier_removed = df_n.loc[df_n.region != 'australia']
_print_gdp_correlation('Total without Australia:', df_n_outlier_removed, 'pkt_yr_capita')

# NOTE(review): the recorded output for 'Inland:' is identical to 'Total:' -
# verify that pkt_inland actually differs from pkt in the input statistics.
_print_gdp_correlation('Inland:', df_n, 'pkt_inland_yr_capita')


Total:
(0.15107099300222424, 0.4914068393109051)
Total without Australia:
(0.018889749848707334, 0.9335048314453946)
Inland:
(0.15107099300222424, 0.4914068393109051)


## 6 Compare inland PKT between model output and iTem source

In [33]:
# Regions that have a reference value, restricted to the columns being compared.
# The explicit .copy() makes df_n2comp an independent frame, so adding the
# 'ratio' column below cannot trigger SettingWithCopyWarning.
df_n2comp = df_n.loc[
    df_n['pkt_inland_yr_capita_gt'].notna(),
    ['region_name', 'num_users', 'pkt_inland_yr_capita', 'pkt_inland_yr_capita_gt', 'Year'],
].copy()

# Model output divided by the reference value.
df_n2comp['ratio'] = df_n2comp['pkt_inland_yr_capita'] / df_n2comp['pkt_inland_yr_capita_gt']
df_n2comp

Unnamed: 0,region_name,num_users,pkt_inland_yr_capita,pkt_inland_yr_capita_gt,Year,ratio
0,Sweden,3961.0,7.237595,1.371046,2018.0,5.278886
4,Australia,3310.0,12.140278,1.32648,2018.0,9.152253
5,Austria,729.0,4.181634,0.907722,1992.0,4.606734
6,"Barcelona, Spain",1891.0,7.322928,0.86134,2016.0,8.501786
7,"Madrid, Spain",3172.0,9.092355,0.86134,2016.0,10.556058
13,"Guadalajara, Mexico",684.0,7.084237,0.428044,2018.0,16.550241
14,"Mexico City, Mexico",15615.0,5.940794,0.428044,2018.0,13.878922
19,"Moscow, Russia",4206.0,14.814421,0.174695,2018.0,84.801723
20,"Saint Petersburg, Russia",1386.0,13.906543,0.174695,2018.0,79.604785
