# Descriptive analysis for the manuscript

Summarize the geotagged tweets of the regions used for the experiment and the application.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import yaml
import scipy.stats as stats

def load_region_tweets(region=None, db_dir='../../dbs'):
    """Summarise the geotagged tweets stored for one region.

    Reads ``{db_dir}/{region}/geotweets.csv``. The file must provide the
    columns ``userid`` and ``createdat``; ``createdat`` is a timestamp string
    whose calendar day is the part before the first space.

    Parameters
    ----------
    region : str
        Name of the region sub-folder below ``db_dir``. The ``None`` default
        is kept for backward compatibility only; a value has to be passed,
        otherwise the path points to a folder literally named ``None``.
    db_dir : str, optional
        Root folder that holds one sub-folder per region.

    Returns
    -------
    tuple
        ``(region, time_span, num_users, num_geo, num_days, num_geo_freq)``

        - ``time_span``: ``'<earliest createdat> - <latest createdat>'``.
        - ``num_users``: number of distinct ``userid`` values.
        - ``num_geo``: number of rows (tweets) in the file.
        - ``num_days``: median over users of the number of distinct days
          on which the user tweeted.
        - ``num_geo_freq``: median over users of (tweets / distinct days).
    """
    df = pd.read_csv(f'{db_dir}/{region}/geotweets.csv')

    # Take the day from the raw string *before* converting to datetime so
    # the grouping key is exactly what is written in the file.
    df['day'] = df['createdat'].str.split(' ', n=1).str[0]

    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference
    # is the default there) and only triggers a warning, so it is not passed.
    df['createdat'] = pd.to_datetime(df['createdat'])

    t_max, t_min = df['createdat'].max(), df['createdat'].min()
    time_span = f'{t_min} - {t_max}'
    num_users = len(df['userid'].unique())
    num_geo = len(df)

    # Group once and reuse the per-user aggregates for both medians.
    per_user = df.groupby('userid')
    days_per_user = per_user['day'].nunique()
    tweets_per_user = per_user.size()
    num_days = np.median(days_per_user)
    num_geo_freq = np.median(tweets_per_user / days_per_user)
    return region, time_span, num_users, num_geo, num_days, num_geo_freq

# Region identifiers to summarise. Each one is used both as the sub-folder
# name when loading the tweets and as the lookup key into `region_manager`.
# NOTE(review): 'stpertersburg' looks misspelled, but it has to match the
# folder name and the regions.yaml key -- confirm before renaming.
region_list = [
    'sweden',
    'netherlands',
    'saopaulo',
    'australia',
    'austria',
    'barcelona',
    'capetown',
    'cebu',
    'egypt',
    'guadalajara',
    'jakarta',
    'johannesburg',
    'kualalumpur',
    'lagos',
    'madrid',
    'manila',
    'mexicocity',
    'moscow',
    'nairobi',
    'rio',
    'saudiarabia',
    'stpertersburg',
    'surabaya',
]

# Per-region metadata keyed by region identifier; the cells below read the
# 'name', 'country', 'gdp_capita' and 'pop' entries of each region.
# NOTE(review): FullLoader can build more than plain YAML types; if
# regions.yaml only holds plain mappings, yaml.safe_load would be enough.
with open('../../lib/regions.yaml', encoding='utf8') as regions_file:
    region_manager = yaml.load(regions_file, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: Time span, No. of Twitter users, No. of geotagged tweets,
Days covered/user, No. of geotagged tweets/day/user

In [7]:
# One summary row per region, in the order of `region_list`.
df = pd.DataFrame(
    [load_region_tweets(region=region_key) for region_key in region_list],
    columns=('region', 'time_span', 'num_users', 'num_geo', 'num_days', 'num_geo_freq'),
)

# All keyword values are evaluated against the frame as built above, so the
# metadata lookups still see the raw region keys even though `region` itself
# is replaced by the display name in the same call.
df = df.assign(
    gdp_capita=df['region'].apply(lambda key: region_manager[key]['gdp_capita']),
    country=df['region'].apply(lambda key: region_manager[key]['country']),
    pop=df['region'].apply(lambda key: region_manager[key]['pop']),
    # Keep only the date part of both ends of the time span.
    time_span=df['time_span'].apply(
        lambda span: ' - '.join([stamp.split(' ')[0] for stamp in span.split(' - ')])
    ),
    region=df['region'].apply(lambda key: region_manager[key]['name']),
)
df

Unnamed: 0,region,time_span,num_users,num_geo,num_days,num_geo_freq,gdp_capita,country,pop
0,Sweden,2010-09-15 - 2019-03-31,3961,1248158,111.0,1.432292,54.61,Sweden,10.23
1,The Netherlands,2010-09-12 - 2019-04-22,5375,1479674,100.0,1.402878,53.02,The Netherlands,17.28
2,"São Paulo, Brazil",2010-09-15 - 2019-06-07,10943,3513796,96.0,1.519231,27.13,Brazil,12.18
3,Australia,2010-09-11 - 2019-05-09,3310,847668,84.0,1.457786,57.4,Australia,24.99
4,Austria,2010-09-13 - 2019-09-23,729,168364,88.0,1.357143,51.5,Austria,8.56
5,"Barcelona, Spain",2010-09-14 - 2019-06-24,1891,482357,95.0,1.371429,33.1,Spain,5.56
6,"Cape Town, South Africa",2011-01-20 - 2019-06-05,1092,193993,62.0,1.517549,6.2,South Africa,0.43
7,"Cebu, Philippines",2010-09-16 - 2019-09-22,1486,400094,83.0,1.593162,3.1,Philippines,0.92
8,Egypt,2010-10-06 - 2019-09-26,1464,234322,47.0,1.642324,2.55,Egypt,98.42
9,"Guadalajara, Mexico",2011-02-10 - 2019-09-19,684,212681,102.5,1.490107,17.88,Mexico,1.5


In [8]:
# Copy the summary table (without the index column) to the system clipboard.
df.to_clipboard(index=False)

## 2 Merge ODMs for visualisation
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

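The three separate ODM files per region (ground truth, calibration, validation) are merged into a single `odms.csv`. The separate files will be deleted afterwards; the cell below only writes the merged file and does not remove them itself.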

In [7]:
# Columns that identify one origin-destination pair in every ODM file.
od_keys = ['ozone', 'dzone']

for region in ['sweden', 'netherlands', 'saopaulo']:
    ground_truth = pd.read_csv(f'../../dbs/{region}/odm_gt.csv')
    calibrated = pd.read_csv(f'../../dbs/{region}/calibration_odm.csv')
    validated = pd.read_csv(f'../../dbs/{region}/validation_odm.csv')

    # Inner joins (the pandas default) keep only the OD pairs present in all
    # three files. Each `model` column is renamed right after its merge so
    # the next merge does not collide with it.
    df = (
        ground_truth
        .merge(calibrated, on=od_keys)
        .rename(columns={'model': 'model_c'})
        .merge(validated, on=od_keys)
        .rename(columns={'model': 'model_v'})
    )
    df[od_keys + ['gt', 'model_c', 'model_v']].to_csv(f'../../dbs/{region}/odms.csv', index=False)

## 3 Quantify the OD-pair similarity
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

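The overall similarity between the ground-truth (GT) OD flows and the modelled OD flows (calibration and validation), measured with Kendall's tau.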

In [6]:
# (printed label, model column) pairs compared against the ground truth.
comparisons = (
    ('GT vs. calibrated model', 'model_c'),
    ('GT vs. validated model', 'model_v'),
)

for region in ['sweden', 'netherlands', 'saopaulo']:
    df = pd.read_csv(f'../../dbs/{region}/odms.csv')
    for label, model_column in comparisons:
        # Kendall's tau between ground-truth and modelled values.
        print(region, label, stats.kendalltau(df['gt'], df[model_column]))

sweden GT vs. calibrated model KendalltauResult(correlation=0.12707912414478398, pvalue=0.0)
sweden GT vs. validated model KendalltauResult(correlation=0.12001211410870506, pvalue=0.0)
netherlands GT vs. calibrated model KendalltauResult(correlation=0.20725705094829874, pvalue=0.0)
netherlands GT vs. validated model KendalltauResult(correlation=0.21088191229229744, pvalue=0.0)
saopaulo GT vs. calibrated model KendalltauResult(correlation=0.3129177858473001, pvalue=0.0)
saopaulo GT vs. validated model KendalltauResult(correlation=0.3081899966684245, pvalue=0.0)


## 4 Check the domestic PKT of The Netherlands