# Descriptive analysis for the manuscript

Summarize the geotagged tweets of the regions used in the model experiments.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd
import yaml
import scipy.stats as stats
from  tqdm import tqdm
import helpers as hp

In [3]:
def load_region_tweets(region=None):
    """Summarise the geotagged tweets of one region.

    Reads ``../../dbs/{region}/geotweets.csv``. The file must provide the
    columns ``userid`` and ``createdat``; the latter is a timestamp string
    whose date part is everything before the first space.

    Parameters
    ----------
    region : str
        Name of the region folder under ``../../dbs``.

    Returns
    -------
    tuple
        ``(region, time_span, num_users, num_geo, num_days, num_geo_freq)``:

        * ``region`` -- the value passed in.
        * ``time_span`` -- ``'<earliest createdat> - <latest createdat>'``.
        * ``num_users`` -- number of distinct ``userid`` values.
        * ``num_geo`` -- number of rows (geotagged tweets).
        * ``num_days`` -- median over users of the number of distinct days
          with at least one tweet.
        * ``num_geo_freq`` -- median over users of tweets per distinct day.
    """
    df = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    # Keep the date part as a string; it is only used to count distinct days.
    df['day'] = df['createdat'].str.split(' ').str[0]
    # `infer_datetime_format` is deprecated (pandas >= 2.0 infers the format
    # by default and warns when the argument is passed), so it is not used.
    df['createdat'] = pd.to_datetime(df['createdat'])

    t_max, t_min = df.createdat.max(), df.createdat.min()
    time_span = f'{t_min} - {t_max}'
    num_users = len(df.userid.unique())
    num_geo = len(df)

    # Group once and reuse the grouping for every per-user statistic.
    by_user = df.groupby('userid')
    days_per_user = by_user['day'].nunique()
    tweets_per_user = by_user.size()

    num_days = np.median(days_per_user)
    num_geo_freq = np.median(tweets_per_user / days_per_user)
    return region, time_span, num_users, num_geo, num_days, num_geo_freq

def user_stats_cal(data):
    """Compute activity statistics for the tweets of a single user.

    Parameters
    ----------
    data : pandas.DataFrame
        Tweets of one user with a datetime column ``createdat`` and a
        ``day`` column identifying the calendar day of each tweet.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``time_span`` (whole days elapsed between
        the first and last tweet, at least 1), ``num_days`` (distinct days
        with a tweet), ``num_geo`` (number of tweets), ``geo_freq`` (tweets
        per distinct day) and ``share_active`` (``num_days / time_span``).

    Notes
    -----
    NOTE(review): ``time_span`` counts whole elapsed days while ``num_days``
    counts distinct calendar days, so ``share_active`` can exceed 1 (e.g.
    two tweets a few hours apart on either side of midnight give
    ``time_span == 1`` and ``num_days == 2``).
    """
    elapsed_days = (data.createdat.max() - data.createdat.min()).days
    # A span shorter than one full day is counted as one day so that the
    # share below never divides by zero.
    elapsed_days = elapsed_days + 1 if elapsed_days == 0 else elapsed_days

    active_days = data['day'].nunique()
    tweet_count = len(data)

    record = {
        'time_span': [elapsed_days],
        'num_days': [active_days],
        'num_geo': [tweet_count],
        'geo_freq': [tweet_count / active_days],
        'share_active': [active_days / elapsed_days],
    }
    return pd.DataFrame.from_dict(record)

def region_tweets_stats_per_user(region=None):
    """Compute per-user tweet statistics for one region.

    Reads ``../../dbs/{region}/geotweets.csv`` (columns ``userid`` and
    ``createdat`` are required) and applies ``user_stats_cal`` to the tweets
    of every user, showing a tqdm progress bar labelled with the region.

    Parameters
    ----------
    region : str
        Name of the region folder under ``../../dbs``.

    Returns
    -------
    pandas.DataFrame
        One row per user: ``userid``, the columns produced by
        ``user_stats_cal`` and a constant ``region`` column.
    """
    df = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    # Keep the date part as a string; it is only used to count distinct days.
    df['day'] = df['createdat'].str.split(' ').str[0]
    # `infer_datetime_format` is deprecated (pandas >= 2.0 infers the format
    # by default and warns when the argument is passed), so it is not used.
    df['createdat'] = pd.to_datetime(df['createdat'])

    tqdm.pandas(desc=region)  # registers `progress_apply` on groupby objects
    per_user = df.groupby('userid').progress_apply(user_stats_cal)

    # The applied function returns a one-row frame per user, so the result is
    # indexed by (userid, row number); the row number surfaces as `level_1`
    # after `reset_index` and carries no information.
    return (
        per_user
        .reset_index()
        .drop(columns=['level_1'])
        .assign(region=region)
    )

# Keys of the regions analysed in this notebook; each key names a folder under
# ../../dbs and an entry in regions.yaml.
region_list = [
    'sweden',
    'netherlands',
    'saopaulo',
]

# Region metadata keyed by region; the summary table reads the fields
# 'name', 'country', 'gdp_capita' and 'pop' from it.
with open('../../lib/regions.yaml', encoding='utf8') as regions_file:
    region_manager = yaml.load(regions_file, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: Time span, No. of Twitter users, No. of geotagged tweets,
Days covered/user, No. of geotagged tweets/day/user

In [4]:
# One summary row per region, computed from the raw geotagged tweets.
df = pd.DataFrame(
    [load_region_tweets(region=key) for key in region_list],
    columns=('region', 'time_span', 'num_users', 'num_geo', 'num_days', 'num_geo_freq'),
)

# Every value below is derived from the original frame, where `region` still
# holds the yaml key; the key is replaced by the display name in the same step.
df = df.assign(
    **{
        field: df['region'].map(lambda key, field=field: region_manager[key][field])
        for field in ('gdp_capita', 'country', 'pop')
    },
    # Reduce 'start-timestamp - end-timestamp' to 'start-date - end-date'.
    time_span=df['time_span'].map(
        lambda span: ' - '.join(stamp.split(' ')[0] for stamp in span.split(' - '))
    ),
    region=df['region'].map(lambda key: region_manager[key]['name']),
)
df

Unnamed: 0,region,time_span,num_users,num_geo,num_days,num_geo_freq,gdp_capita,country,pop
0,Sweden,2010-09-15 - 2019-03-31,3961,1248158,111.0,1.432292,54.61,Sweden,10.23
1,The Netherlands,2010-09-12 - 2019-04-22,5375,1479674,100.0,1.402878,53.02,The Netherlands,17.28
2,"São Paulo, Brazil",2010-09-15 - 2019-06-07,10943,3513796,96.0,1.519231,27.13,Brazil,12.18


In [None]:
# Copy the summary table (without the index) to the system clipboard.
df.to_clipboard(index=False)

## 2 Merge ODMs for visualisation
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

The separate per-source ODM files are deleted once they have been merged into `odms.csv`.

In [4]:
# Merge the five per-source ODM files of each region into a single `odms.csv`
# (one row per origin-destination pair present in every source) and delete the
# separate files afterwards.
for region in ['sweden', 'netherlands', 'saopaulo']:
    merged_path = f'../../dbs/{region}/odms.csv'
    # (source file, value column inside that file, column name in the merged table)
    sources = [
        (f'../../dbs/{region}/odm_gt.csv', 'gt', 'gt'),
        (f'../../dbs/{region}/odm_calibration.csv', 'model', 'model_c'),
        (f'../../dbs/{region}/odm_validation.csv', 'model', 'model_v'),
        (f'../../dbs/{region}/odm_benchmark_c.csv', 'benchmark', 'benchmark_c'),
        (f'../../dbs/{region}/odm_benchmark_v.csv', 'benchmark', 'benchmark_v'),
    ]
    source_paths = [path for path, value_col, merged_col in sources]

    # The sources are removed after a successful merge, so on a re-run they are
    # gone while `odms.csv` exists. Skip such regions instead of failing with
    # FileNotFoundError, which keeps the cell safe under Restart & Run All.
    if os.path.exists(merged_path) and not all(os.path.exists(path) for path in source_paths):
        continue

    odm = None
    for path, value_col, merged_col in sources:
        part = pd.read_csv(path).rename(columns={value_col: merged_col})
        # Keep only the keys and the value so unrelated columns cannot clash.
        part = part.loc[:, ['ozone', 'dzone', merged_col]]
        odm = part if odm is None else pd.merge(odm, part, on=['ozone', 'dzone'])

    odm.loc[:, ['ozone', 'dzone',
                'gt', 'model_c', 'model_v',
                'benchmark_c', 'benchmark_v']].to_csv(merged_path, index=False)

    # Remove the separate files only after the merged file has been written.
    for path in source_paths:
        os.remove(path)

## 3 Quantify the od-pair similarity
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

Overall similarity between the ground-truth ODM and the model and benchmark ODMs, measured with Kendall's tau and the SSI.

In [5]:
# For every region, compare the ground-truth flows with the model and the
# benchmark flows on both the calibration ('c') and validation ('v') data.
quant_list = []
for region in ['sweden', 'netherlands', 'saopaulo']:
    odms = pd.read_csv(f'../../dbs/{region}/odms.csv')
    for split, model_col, benchmark_col in (('c', 'model_c', 'benchmark_c'),
                                            ('v', 'model_v', 'benchmark_v')):
        # Kendall's tau uses only the OD pairs that are non-zero in the ground
        # truth, the model and the benchmark of this split.
        keep = (odms['gt'] != 0) & (odms[model_col] != 0) & (odms[benchmark_col] != 0)
        nonzero = odms.loc[keep, :]
        for kind, column in (('model', model_col), ('benchmark', benchmark_col)):
            tau = stats.kendalltau(nonzero.loc[:, 'gt'], nonzero.loc[:, column])
            # The SSI is computed on the full, unfiltered table.
            quant_list.append((region, kind, split, tau.correlation, tau.pvalue,
                               hp.ssi_dataframe(odms, 'gt', column)))
df_stats = pd.DataFrame(quant_list, columns=['region', 'type', 'data', 'cor', 'p', 'ssi'])
df_stats

Unnamed: 0,region,type,data,cor,p,ssi
0,sweden,model,c,0.18727,6.653739999999999e-246,0.301279
1,sweden,benchmark,c,0.220507,9.577697000000001e-277,0.301584
2,sweden,model,v,0.18407,2.009274e-128,0.311876
3,sweden,benchmark,v,0.266851,3.4163179999999996e-215,0.327934
4,netherlands,model,c,0.439008,0.0,0.431704
5,netherlands,benchmark,c,0.331307,0.0,0.391578
6,netherlands,model,v,0.432798,0.0,0.432729
7,netherlands,benchmark,v,0.40465,0.0,0.387912
8,saopaulo,model,c,0.414933,0.0,0.510122
9,saopaulo,benchmark,c,0.296695,0.0,0.446349


In [6]:
# Mean correlation and SSI per region and type, averaged over the rows of the
# 'data' column. Several columns are selected with a list: a bare tuple of keys
# is deprecated on a groupby and raises on pandas >= 2.0.
df_stats.groupby(['region', 'type'])[['cor', 'ssi']].mean()

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,cor,ssi
region,type,Unnamed: 2_level_1,Unnamed: 3_level_1
netherlands,benchmark,0.367978,0.389745
netherlands,model,0.435903,0.432216
saopaulo,benchmark,0.326389,0.450639
saopaulo,model,0.447656,0.543193
sweden,benchmark,0.243679,0.314759
sweden,model,0.18567,0.306577


In [16]:
# Same per-region, per-type averages as a flat table. Several columns are
# selected with a list: a bare tuple of keys is deprecated on a groupby and
# raises on pandas >= 2.0.
stats_av = df_stats.groupby(['region', 'type'])[['cor', 'ssi']].mean().reset_index()

  """Entry point for launching an IPython kernel.


In [19]:
# Reshape to one row per region with a (metric, type) column hierarchy and copy
# the result to the system clipboard.
stats_av.pivot_table(
    index='region',
    columns=['type'],
    values=['cor', 'ssi'],
).to_clipboard()