# Summary of multi-region results

Summarize the geotagged tweets and the generated visits for multiple regions.

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before each
# cell is executed so that edits to imported project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import yaml
from scipy import stats

def load_region_tweets(region=None):
    """Summarize the geotagged-tweet file of one region.

    Reads ``../../dbs/{region}/geotweets.csv`` and counts its contents.

    Args:
        region: Name of the region folder under ``../../dbs``.

    Returns:
        A tuple ``(region, num_users, num_geo)`` where ``num_users`` is the number
        of distinct values in the ``userid`` column and ``num_geo`` is the number
        of rows in the file.
    """
    tweets = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    distinct_users = tweets['userid'].unique()
    return region, len(distinct_users), len(tweets)


def user_proc(data):
    """Aggregate the visit statistics of a single user.

    Args:
        data: DataFrame with the columns ``pkt``, ``pkt_inland``, ``num_trip``
            and ``num_trip_inland`` (one group of a ``groupby('userid')``).

    Returns:
        A Series with the column sums under the keys ``pkt``, ``pkt_inland``,
        ``num_trips`` and ``num_trips_inland``, plus ``days``: the number of rows
        whose ``pkt`` is not equal to 0.
    """
    totals = {
        'pkt': data['pkt'].sum(),
        'pkt_inland': data['pkt_inland'].sum(),
        'num_trips': data['num_trip'].sum(),
        'num_trips_inland': data['num_trip_inland'].sum(),
    }
    # Only rows with a non-zero pkt count as a day.
    has_travel = data['pkt'] != 0
    totals['days'] = int(has_travel.sum())
    return pd.Series(totals)


def quantify_visits(region=None, runid=3):
    """Total the generated-visit statistics of one region for one model run.

    Reads ``../../dbs/{region}/visits/visits_{runid}_stats.csv``, aggregates it per
    ``userid`` with ``user_proc`` and then sums the per-user values.

    Args:
        region: Name of the region folder under ``../../dbs``.
        runid: Identifier of the run whose statistics file is read.

    Returns:
        A tuple ``(region, pkt, pkt_inland, num_trips, num_trips_inland, days)``
        holding the region name followed by the sums over all users.
    """
    visit_stats = pd.read_csv(f'../../dbs/{region}/visits/visits_{runid}_stats.csv')
    per_user = visit_stats.groupby('userid').apply(user_proc).reset_index()
    summed = [per_user[column].sum()
              for column in ('pkt', 'pkt_inland', 'num_trips', 'num_trips_inland', 'days')]
    return (region, *summed)

def get_recent_records_gt(data):
    """Keep only the records of the most recent year.

    Args:
        data: DataFrame with at least the columns ``Year``, ``Unit`` and ``Value``.

    Returns:
        A new DataFrame with the columns ``Year``, ``Unit`` and ``Value`` containing
        the rows whose ``Year`` equals the maximum ``Year``, re-indexed from 0.
    """
    latest_year = max(data['Year'])
    is_latest = data['Year'] == latest_year
    return data.loc[is_latest, ['Year', 'Unit', 'Value']].reset_index(drop=True)

# Region identifiers; they are used as folder names under ../../dbs and as keys
# into the region metadata loaded below.
region_list = [
    'sweden', 'netherlands', 'saopaulo',
    'australia', 'austria', 'barcelona', 'capetown', 'cebu',
    'egypt', 'guadalajara', 'jakarta', 'johannesburg', 'kualalumpur',
    'lagos', 'madrid', 'manila', 'mexicocity', 'moscow',
    'nairobi', 'rio', 'saudiarabia', 'stpertersburg', 'surabaya',
]

# Same identifiers as region_list, without 'sweden', 'netherlands' and 'saopaulo'.
region_list2 = [
    'australia', 'austria', 'barcelona', 'capetown', 'cebu',
    'egypt', 'guadalajara', 'jakarta', 'johannesburg', 'kualalumpur',
    'lagos', 'madrid', 'manila', 'mexicocity', 'moscow',
    'nairobi', 'rio', 'saudiarabia', 'stpertersburg', 'surabaya',
]

# Per-region metadata; later cells read the keys 'gdp_capita', 'name', 'country'
# and 'pop_country' of each region entry.
with open('../../lib/regions.yaml', encoding='utf8') as regions_file:
    region_manager = yaml.load(regions_file, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: No. of geotagged tweets, No. of Twitter users.

Generated visits: No. of trips, PKT

The data cover 260 days in total; the totals therefore need to be scaled up by a factor of
365/260 to represent all the weekday trips in a year.

In [5]:
# One row per region: region name, number of distinct users, number of geotagged tweets.
tweet_summaries = [load_region_tweets(region=name) for name in region_list]
df = pd.DataFrame(tweet_summaries, columns=('region', 'num_users', 'num_geotweets'))

## 2 Explore the visits generated from the model

In [4]:
# Identifier of the model run whose visit statistics are summarized.
runid = 7

# One row per region with the totals returned by quantify_visits.
visit_columns = ('region', 'pkt', 'pkt_inland', 'num_trips', 'num_trips_inland', 'days')
visit_totals = [quantify_visits(region=name, runid=runid) for name in region_list]
df_v = pd.DataFrame(visit_totals, columns=visit_columns)

NameError: name 'df' is not defined

In [6]:
# Attach the tweet summary (num_users, num_geotweets) to the visit totals.
# Only the raw visit columns are taken from df_v so that this cell can be re-run:
# merging an already merged df_v would duplicate num_users/num_geotweets with
# _x/_y suffixes and break the 'num_users' lookups below.
raw_visit_cols = ['region', 'pkt', 'pkt_inland', 'num_trips', 'num_trips_inland', 'days']
df_v = pd.merge(df, df_v.loc[:, raw_visit_cols], on='region', how='inner')

# Yearly totals: total / 10e3 / days * 365 * num_users, where `days` is the summed
# number of non-zero-pkt rows over all users of the region.
# NOTE(review): 10e3 equals 10,000 (not 1,000) - confirm this is the intended unit.
yearly_columns = (('pkt_yr', 'pkt'),
                  ('num_trips_yr', 'num_trips'),
                  ('pkt_inland_yr', 'pkt_inland'),
                  ('num_trips_inland_yr', 'num_trips_inland'))
for yearly_col, total_col in yearly_columns:
    df_v.loc[:, yearly_col] = (df_v.loc[:, total_col] / 10e3 / df_v.loc[:, 'days']
                               * 365 * df_v.loc[:, 'num_users'])

## 3 Combine different dimensions

In [7]:
# Copy per-region metadata from region_manager into df_v.
# Each pair is (new df_v column, key inside the region's metadata entry).
metadata_columns = (('gdp_capita', 'gdp_capita'),
                    ('region_name', 'name'),
                    ('country', 'country'),
                    ('pop_country', 'pop_country'))
for target_col, meta_key in metadata_columns:
    df_v.loc[:, target_col] = df_v.loc[:, 'region'].apply(
        lambda region, meta_key=meta_key: region_manager[region][meta_key])

In [8]:
# Per-user values: divide the yearly totals by the number of Twitter users.
per_capita_columns = (('pkt_yr_capita', 'pkt_yr'),
                      ('pkt_inland_yr_capita', 'pkt_inland_yr'))
for capita_col, yearly_col in per_capita_columns:
    df_v.loc[:, capita_col] = df_v.loc[:, yearly_col] / df_v.loc[:, 'num_users']

# city is 1 when the region's display name contains a comma, otherwise 0.
df_v.loc[:, 'city'] = df_v.loc[:, 'region_name'].apply(lambda name: int(',' in name))

In [9]:
# Write the per-region statistics of this run, without the index, as UTF-8 with a byte-order mark.
region_stats_path = f'../../results/multi-region_stats_rid_{runid}.csv'
df_v.to_csv(region_stats_path, index=False, encoding='utf-8-sig')

## 4 Process the 'ground truth' of PKT vs GDP

In [10]:
# Column headers (one per country) to extract from the sheets of pkt_gdp.xlsx.
country_list = ['Australia', 'Canada', 'China', 'Austria', 'Belgium',
       'Bulgaria', 'Czech Republic', 'Denmark', 'Finland', 'France', 'Germany',
       'Greece', 'Hungary', 'Italy', 'Lithuania', 'Malta', 'Netherlands',
       'Poland', 'Portugal', 'Slovak Republic', 'Slovenia', 'Spain', 'Sweden',
       'United Kingdom', 'India', 'Japan', 'Albania', 'Armenia', 'Azerbaijan',
       'Belarus', 'Georgia', 'Iceland', 'Montenegro, Republic of',
       'North Macedonia', 'Norway', 'Serbia, Republic of', 'Switzerland',
       'Turkey', 'Argentina', 'Russian Federation',
       "Korea, Democratic People's Republic of", 'United States of America']

# Sheet header -> name written to the 'country' column; unlisted headers are kept as is.
# NOTE(review): the "Democratic People's Republic of" header is mapped to 'South Korea';
# presumably the sheet header is mislabeled - confirm against the data source.
country_name_mapping = {'United Kingdom': 'U.K.', 'United States of America': 'U.S.',
                        'Montenegro, Republic of': 'Montenegro',
                        'Netherlands': 'The Netherlands',
                        'Serbia, Republic of': 'Serbia', 'Russian Federation': 'Russia',
                        "Korea, Democratic People's Republic of": 'South Korea'}


def _load_indicator_long(sheet_name, value_name):
    """Read one wide sheet of pkt_gdp.xlsx and reshape it to long format.

    The sheet is expected to have a 'Country' column, which is renamed to 'year',
    and one column per entry of ``country_list``.

    Args:
        sheet_name: Name of the sheet to read from ``../../dbs/pkt_gdp.xlsx``.
        value_name: Column name given to the per-country values.

    Returns:
        DataFrame with the columns ``year``, ``value_name`` and ``country``; the
        per-country frames are stacked in ``country_list`` order and ``country``
        holds the name after applying ``country_name_mapping``.

    Raises:
        KeyError: If the sheet lacks 'Country' or one of the country columns.
    """
    wide = pd.read_excel('../../dbs/pkt_gdp.xlsx', sheet_name=sheet_name)
    frames = []
    for header in country_list:
        frame = wide.loc[:, ['Country', header]].rename(columns={'Country': 'year',
                                                                 header: value_name})
        frame.loc[:, 'country'] = country_name_mapping.get(header, header)
        frames.append(frame)
    return pd.concat(frames)


df_pkt = _load_indicator_long('Sheet6', 'pkt_inland_yr_capita')
df_gdp = _load_indicator_long('Sheet7', 'gdp_capita')

# Keep only the country-year pairs present in both sheets with no missing values.
df_gt = pd.merge(df_pkt, df_gdp, on=['country', 'year'])
df_gt = df_gt.dropna()


In [11]:
# Countries that also appear in the Twitter-based estimations.
gt_list = ['Australia', 'Austria', 'The Netherlands', 'Spain', 'Sweden', 'Russia']

# tw is 1 for the countries listed above and 0 for all others.
in_twitter_estimates = df_gt.loc[:, 'country'].isin(gt_list)
df_gt.loc[:, 'tw'] = in_twitter_estimates.astype('int64')
df_gt.head()

Unnamed: 0,year,pkt_inland_yr_capita,country,gdp_capita,tw
0,1980,10.494643,Australia,31.382796,1
1,1981,10.547323,Australia,32.015521,1
2,1982,10.915987,Australia,32.63492,1
3,1983,10.876303,Australia,31.456766,1
4,1984,11.25259,Australia,32.411512,1


In [12]:
# Write the ground-truth table, without the index, as UTF-8 with a byte-order mark.
ground_truth_path = '../../results/multi-region_stats_gt.csv'
df_gt.to_csv(ground_truth_path, index=False, encoding='utf-8-sig')

## 5 Correlation between gdp_capita and pkt_yr_capita

In [13]:
# Pearson correlation between pkt_yr_capita and gdp_capita, first for all regions
# and then with the 'australia' region left out.
df_n_outlier_removed = df_v.loc[df_v.region != 'australia']
for label, frame in (('Total:', df_v),
                     ('Total without Australia:', df_n_outlier_removed)):
    print(label)
    print(stats.pearsonr(frame.loc[:, 'pkt_yr_capita'], frame.loc[:, 'gdp_capita']))

Total:
(0.14019227607774684, 0.523468749512717)
Total without Australia:
(0.009881197080280756, 0.9651894411871331)
