# Summary of multi-region results

Summarize the geotagged tweets and the generated visits across multiple regions.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import yaml
from scipy import stats

def load_region_tweets(region=None):
    """Count the distinct users and the geotagged tweets stored for one region.

    Reads ``../../dbs/{region}/geotweets.csv``.

    Returns:
        tuple: ``(region, num_users, num_geo)`` where ``num_users`` is the
        number of distinct values in the ``userid`` column and ``num_geo`` is
        the number of rows in the file.
    """
    tweets = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    distinct_users = tweets.userid.unique()
    row_count = tweets.shape[0]
    return region, len(distinct_users), row_count


def user_proc(data):
    """Sum the distance and trip-count columns of one group of visit records.

    Args:
        data: DataFrame with the columns ``pkt``, ``pkt_inland``, ``num_trip``
            and ``num_trip_inland``.

    Returns:
        pd.Series indexed by ``pkt``, ``pkt_inland``, ``num_trips`` and
        ``num_trips_inland`` holding the column sums. Note that the trip
        counts are renamed from singular to plural.
    """
    pkt_total = data.pkt.sum()
    pkt_inland_total = data.pkt_inland.sum()
    trips_total = data.num_trip.sum()
    trips_inland_total = data.num_trip_inland.sum()
    return pd.Series(
        [pkt_total, pkt_inland_total, trips_total, trips_inland_total],
        index=['pkt', 'pkt_inland', 'num_trips', 'num_trips_inland'],
    )


def quantify_visits(region=None, runid=3):
    """Total the generated visits of one region for a given model run.

    Reads ``../../dbs/{region}/visits/visits_{runid}_stats.csv``, sums the
    ``pkt``, ``pkt_inland``, ``num_trip`` and ``num_trip_inland`` columns per
    ``userid`` and then over all users.

    The per-user step uses a vectorised ``groupby(...).sum()`` instead of
    ``groupby.apply`` with a Python callback. Rows whose ``userid`` is missing
    are still excluded, because ``groupby`` drops NaN keys by default.

    Args:
        region: Region key used as the directory name under ``../../dbs``.
        runid: Identifier of the model run whose statistics file is read.

    Returns:
        tuple: ``(region, pkt, pkt_inland, num_trips, num_trips_inland)``;
        the four totals are floats.
    """
    df_v = pd.read_csv(f'../../dbs/{region}/visits/visits_{runid}_stats.csv')
    value_cols = ['pkt', 'pkt_inland', 'num_trip', 'num_trip_inland']
    df_users = (
        df_v.groupby('userid')[value_cols]
        .sum()
        # The stats file names the trip counts in the singular; the outputs use the plural.
        .rename(columns={'num_trip': 'num_trips', 'num_trip_inland': 'num_trips_inland'})
        # Keep the totals as floats even when the trip counts are stored as integers.
        .astype(float)
    )
    totals = df_users.sum()
    return (region, totals['pkt'], totals['pkt_inland'],
            totals['num_trips'], totals['num_trips_inland'])

def get_recent_records_gt(data):
    """Keep only the record(s) of the most recent year in ``data``.

    Args:
        data: DataFrame with at least the columns ``Year``, ``Unit`` and
            ``Value``.

    Returns:
        DataFrame with the columns ``Year``, ``Unit`` and ``Value`` containing
        every row whose ``Year`` equals the largest year present, re-indexed
        from 0.
    """
    # Series.max() skips missing years. The builtin max() is order-dependent
    # when NaN is present and can return NaN, which would match no row at all.
    latest_year = data['Year'].max()
    recent = data.loc[data['Year'] == latest_year, ['Year', 'Unit', 'Value']]
    return recent.reset_index(drop=True)

# Regions shared by both lists. Each name is used verbatim as a directory name
# under ../../dbs and as a key into the region metadata.
# NOTE(review): 'stpertersburg' looks like a misspelling, but it is a lookup
# key, so it must match the directory and metadata entry - left untouched.
region_list2 = [
    'australia', 'austria', 'barcelona', 'capetown', 'cebu',
    'egypt', 'guadalajara', 'jakarta', 'johannesburg', 'kualalumpur',
    'lagos', 'madrid', 'manila', 'mexicocity', 'moscow',
    'nairobi', 'rio', 'saudiarabia', 'stpertersburg', 'surabaya',
]

# Full list of regions: three additional regions followed by the shared ones.
region_list = ['sweden', 'netherlands', 'saopaulo'] + region_list2

# Load the per-region metadata, keyed by region name. Later cells read the
# 'gdp_capita', 'name', 'country' and 'pop_country' entries of each region.
# NOTE(review): yaml.FullLoader is more permissive than the safe loader; if
# regions.yaml only holds plain data, yaml.safe_load would suffice - TODO confirm.
with open('../../lib/regions.yaml', encoding='utf8') as f:
    region_manager = yaml.load(f, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: No. of geotagged tweets, No. of Twitter users.

Generated visits: No. of trips, PKT

The total number of days is 120; the results are therefore scaled up by multiplying by 365/120 to represent all the weekday trips in a year.

In [3]:
# One (region, num_users, num_geotweets) record per region in region_list.
tweet_records = [load_region_tweets(region=name) for name in region_list]
df = pd.DataFrame(tweet_records, columns=['region', 'num_users', 'num_geotweets'])

## 2 Explore the visits generated from the model

In [4]:
runid = 3

# One record of visit totals per region for the selected model run.
visit_records = [quantify_visits(region=name, runid=runid) for name in region_list]
df_v = pd.DataFrame(
    visit_records,
    columns=['region', 'pkt', 'pkt_inland', 'num_trips', 'num_trips_inland'],
)

# Scale each total from the 120 modelled days to a full year (x 365/120) and
# divide by 10e6.
# NOTE(review): 10e6 equals 1e7, not one million (1e6). The same constant is
# used for the ground-truth figures, so it is kept as is - confirm the
# intended unit before changing either place.
for total_col in ('pkt', 'num_trips', 'pkt_inland', 'num_trips_inland'):
    df_v[f'{total_col}_yr'] = df_v[total_col] / 10e6 / 120 * 365

## 3 Combine different dimensions

In [5]:
# Attach region-level metadata to every row: (new column, key in region_manager[region]).
region_attributes = (
    ('gdp_capita', 'gdp_capita'),
    ('region_name', 'name'),
    ('country', 'country'),
    ('pop_country', 'pop_country'),
)
for new_column, meta_key in region_attributes:
    # Bind meta_key as a default argument so each lambda looks up its own key.
    df_v[new_column] = df_v['region'].apply(
        lambda code, meta_key=meta_key: region_manager[code][meta_key]
    )

In [6]:
# Join the tweet counts with the visit totals; only regions present in both are kept.
df_n = df.merge(df_v, on='region', how='inner')

# Yearly PKT divided by the number of Twitter users of the region.
for yearly_col in ('pkt_yr', 'pkt_inland_yr'):
    df_n[f'{yearly_col}_capita'] = df_n[yearly_col] / df_n['num_users']

## 4 Compare with the 'ground truth' of PKT

In [7]:
# Reference statistics: keep the 'Total inland passenger transport' variable
# and, for every country, only the record(s) of its most recent year.
reference_raw = pd.read_csv('../../dbs/T000_input.txt')
is_inland_total = reference_raw['Variable'] == 'Total inland passenger transport'
df_p = (
    reference_raw.loc[is_inland_total]
    .groupby('Country')
    .apply(get_recent_records_gt)
    .reset_index()
    # 'level_1' is the per-country row counter produced by reset_index().
    .drop(columns=['level_1'])
)

In [8]:
# Attach the reference record of each region's country. The outer merge
# followed by dropna(subset=['region']) discards reference countries that have
# no modelled region.
df_n = pd.merge(df_n, df_p, left_on='country', right_on='Country', how='outer').dropna(subset=['region'])

# Reference inland PKT per capita per year, computed column-wise. A missing
# 'Value' propagates as NaN through the division, so no explicit guard or
# row-wise apply is needed.
# NOTE(review): the original comment labels this "million / capita / yr", but
# 10e6 equals 1e7. The divisor is kept because the modelled figures use the
# same constant - confirm the intended unit.
df_n.loc[:, 'pkt_inland_yr_capita_gt'] = df_n['Value'] / df_n['pop_country'] / 10e6

# 'Year' is kept for the comparison table; the other reference columns are no longer needed.
df_n = df_n.drop(columns=['Country', 'Unit', 'Value'])

In [9]:
# Save the combined per-region table for this run. The 'utf-8-sig' encoding
# writes a UTF-8 byte-order mark at the start of the file.
df_n.to_csv(f'../../results/multi-region_stats_rid_{runid}.csv', index=False, encoding='utf-8-sig')

## 5 Correlation between gdp_capita and pkt_yr_capita

In [10]:
# Same table with the 'australia' row excluded, for the outlier check.
df_n_outlier_removed = df_n.loc[df_n.region != 'australia']

# (label, table, per-user PKT column) for each Pearson correlation against gdp_capita.
correlation_cases = (
    ('Total:', df_n, 'pkt_yr_capita'),
    ('Total without Australia:', df_n_outlier_removed, 'pkt_yr_capita'),
    ('Inland:', df_n, 'pkt_inland_yr_capita'),
)
for label, table, pkt_column in correlation_cases:
    print(label)
    print(stats.pearsonr(table.loc[:, pkt_column], table.loc[:, 'gdp_capita']))


Total:
(0.12391439344859402, 0.5732234327645306)
Total without Australia:
(0.012489719298106444, 0.9560075550650615)
Inland:
(0.0782649140934574, 0.7226204217309067)


## 6 Compare inland PKT between model output and iTem source

In [11]:
# Regions for which a reference (ground-truth) inland PKT per capita exists.
has_ground_truth = df_n['pkt_inland_yr_capita_gt'].notna()

# Take an explicit copy so that adding the 'ratio' column below modifies an
# independent table and does not trigger pandas' SettingWithCopyWarning.
df_n2comp = df_n.loc[
    has_ground_truth,
    ['region_name', 'num_users', 'pkt_inland_yr_capita', 'pkt_inland_yr_capita_gt', 'Year'],
].copy()

# Model output divided by the reference value.
df_n2comp['ratio'] = df_n2comp['pkt_inland_yr_capita'] / df_n2comp['pkt_inland_yr_capita_gt']
df_n2comp

Unnamed: 0,region_name,num_users,pkt_inland_yr_capita,pkt_inland_yr_capita_gt,Year,ratio
0,Sweden,3961.0,0.005386,0.001371,2018.0,3.92867
4,Australia,3310.0,0.012546,0.001326,2018.0,9.457843
5,Austria,729.0,0.002343,0.000908,1992.0,2.58073
6,"Barcelona, Spain",1891.0,0.00457,0.000861,2016.0,5.30621
7,"Madrid, Spain",3172.0,0.009057,0.000861,2016.0,10.515147
13,"Guadalajara, Mexico",684.0,0.00793,0.000428,2018.0,18.525857
14,"Mexico City, Mexico",15615.0,0.006066,0.000428,2018.0,14.171953
19,"Moscow, Russia",4206.0,0.019033,0.000175,2018.0,108.94976
20,"Saint Petersburg, Russia",1386.0,0.01903,0.000175,2018.0,108.935418
