# Descriptive analysis for the manuscript

Summarize geotagged tweets of the multiple regions used for the experiment and the application.

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np
import pandas as pd
import yaml
import scipy.stats as stats
from  tqdm import tqdm

def load_region_tweets(region=None):
    df = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    df['day'] = df['createdat'].apply(lambda x: x.split(' ')[0])
    df['createdat'] = pd.to_datetime(df['createdat'], infer_datetime_format=True)
    t_max, t_min = df.createdat.max(), df.createdat.min()
    time_span = f'{t_min} - {t_max}'
    num_users = len(df.userid.unique())
    num_geo = len(df)
    num_days = np.median(df.groupby(['userid'])['day'].nunique())
    num_geo_freq = np.median(df.groupby(['userid']).size() / df.groupby(['userid'])['day'].nunique())
    return region, time_span, num_users, num_geo, num_days, num_geo_freq

def user_stats_cal(data):
    time_span = data.createdat.max() - data.createdat.min()
    time_span = time_span.days
    if time_span == 0:
        time_span += 1
    num_days = data['day'].nunique()
    num_geo = len(data)
    geo_freq = num_geo / num_days
    share_active = num_days / time_span
    return pd.DataFrame.from_dict({'time_span': [time_span],
            'num_days': [num_days],
            'num_geo': [num_geo],
            'geo_freq': [geo_freq],
            'share_active': [share_active]
            })

def region_tweets_stats_per_user(region=None):
    df = pd.read_csv(f'../../dbs/{region}/geotweets.csv')
    df['day'] = df['createdat'].apply(lambda x: x.split(' ')[0])
    df['createdat'] = pd.to_datetime(df['createdat'], infer_datetime_format=True)
    tqdm.pandas(desc=region)
    df_users = df.groupby('userid').progress_apply(user_stats_cal).reset_index()
    df_users.loc[:, 'region'] = region
    df_users.drop(columns=['level_1'], inplace=True)
    return df_users

region_list = ['sweden', 'netherlands', 'saopaulo', 'australia', 'austria', 'barcelona',
               'capetown', 'cebu', 'egypt', 'guadalajara', 'jakarta',
               'johannesburg', 'kualalumpur', 'lagos', 'madrid', 'manila', 'mexicocity', 'moscow', 'nairobi',
               'rio', 'saudiarabia', 'stpertersburg', 'surabaya']

with open('../../lib/regions.yaml', encoding='utf8') as f:
    region_manager = yaml.load(f, Loader=yaml.FullLoader)

## 1 Summarize the geotagged tweets used as input to the model
Geotagged tweets: Time span, No. of Twitter users, No. of geotagged tweets,
Days covered/user, No. of geotagged tweets/day/user

In [6]:
df = pd.DataFrame([load_region_tweets(region=x) for x in region_list],
                  columns=('region', 'time_span', 'num_users', 'num_geo', 'num_days', 'num_geo_freq'))
df.loc[:, 'gdp_capita'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['gdp_capita'])
df.loc[:, 'country'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['country'])
df.loc[:, 'pop'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['pop'])
df.loc[:, 'time_span'] = df.loc[:, 'time_span'].apply(lambda x: ' - '.join([x_t.split(' ')[0] for x_t in x.split(' - ')]))
df.loc[:, 'region'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['name'])
df

KeyboardInterrupt: 

In [None]:
df.to_clipboard(index=False)

## 1-extra Summarize the geotagged tweets used as input to the model - by user
This is for dissertation presentation - sparsity issue.

Geotagged tweets: Time span, No. of Twitter users, No. of geotagged tweets,
Days covered/user, No. of geotagged tweets/day/user

In [25]:
df = pd.concat([region_tweets_stats_per_user(region=x) for x in region_list])
df.loc[:, 'gdp_capita'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['gdp_capita'])
df.loc[:, 'country'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['country'])
df.loc[:, 'pop'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['pop'])
df.loc[:, 'region'] = df.loc[:, 'region'].apply(lambda x: region_manager[x]['name'])
df.to_csv(f'../../dbs/regional_stats.csv', index=False)

sweden: 100%|██████████| 3961/3961 [00:05<00:00, 778.29it/s]
netherlands: 100%|██████████| 5375/5375 [00:06<00:00, 793.24it/s]
saopaulo: 100%|██████████| 10943/10943 [00:14<00:00, 766.58it/s] 
australia: 100%|██████████| 3310/3310 [00:04<00:00, 756.92it/s]
austria: 100%|██████████| 729/729 [00:00<00:00, 820.02it/s]
barcelona: 100%|██████████| 1891/1891 [00:02<00:00, 728.01it/s]
capetown: 100%|██████████| 1092/1092 [00:01<00:00, 760.97it/s]
cebu: 100%|██████████| 1486/1486 [00:01<00:00, 754.36it/s]
egypt: 100%|██████████| 1464/1464 [00:01<00:00, 779.08it/s]
guadalajara: 100%|██████████| 684/684 [00:00<00:00, 767.69it/s]
jakarta: 100%|██████████| 13088/13088 [00:17<00:00, 754.31it/s]
johannesburg: 100%|██████████| 1268/1268 [00:01<00:00, 820.29it/s]
kualalumpur: 100%|██████████| 4663/4663 [00:05<00:00, 838.44it/s]
lagos: 100%|██████████| 812/812 [00:01<00:00, 795.90it/s]
madrid: 100%|██████████| 3172/3172 [00:03<00:00, 868.21it/s]
manila: 100%|██████████| 11997/11997 [00:14<00:00, 817.76

## 2 Merge ODMs for visualisation
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

Separate files will be deleted.

In [4]:
for region in ['sweden', 'netherlands', 'saopaulo']:
    df = pd.read_csv(f'../../dbs/{region}/odm_gt.csv')
    df_c = pd.read_csv(f'../../dbs/{region}/odm_calibration.csv')
    df_v = pd.read_csv(f'../../dbs/{region}/odm_validation.csv')
    df_cb = pd.read_csv(f'../../dbs/{region}/odm_benchmark_c.csv')
    df_vb = pd.read_csv(f'../../dbs/{region}/odm_benchmark_v.csv')
    df = pd.merge(df, df_c, on=['ozone', 'dzone'])
    df = df.rename(columns={'model': 'model_c'})
    df = pd.merge(df, df_v, on=['ozone', 'dzone'])
    df = df.rename(columns={'model': 'model_v'})
    df = pd.merge(df, df_cb, on=['ozone', 'dzone'])
    df = df.rename(columns={'benchmark': 'benchmark_c'})
    df = pd.merge(df, df_vb, on=['ozone', 'dzone'])
    df = df.rename(columns={'benchmark': 'benchmark_v'})
    df.loc[:, ['ozone', 'dzone',
               'gt', 'model_c', 'model_v',
               'benchmark_c', 'benchmark_v']].to_csv(f'../../dbs/{region}/odms.csv', index=False)
    os.remove(f'../../dbs/{region}/odm_gt.csv')
    os.remove(f'../../dbs/{region}/odm_calibration.csv')
    os.remove(f'../../dbs/{region}/odm_validation.csv')
    os.remove(f'../../dbs/{region}/odm_benchmark_c.csv')
    os.remove(f'../../dbs/{region}/odm_benchmark_v.csv')

## 3 Quantify the od-pair similarity
This part applies to Sweden, The Netherlands, and Sao Paulo, Brazil.

The overall similarity.

In [9]:
quant_list = []
for region in ['sweden', 'netherlands', 'saopaulo']:
    df = pd.read_csv(f'../../dbs/{region}/odms.csv')
    df_c = df.loc[(df.gt != 0) & (df.model_c != 0) & (df.benchmark_c != 0), :]
    mc = stats.kendalltau(df_c.loc[:, 'gt'], df_c.loc[:, 'model_c'])
    quant_list.append((region, 'model', 'c', mc.correlation, mc.pvalue))

    bc = stats.kendalltau(df_c.loc[:, 'gt'], df_c.loc[:, 'benchmark_c'])
    quant_list.append((region, 'benchmark', 'c', bc.correlation, bc.pvalue))

    df_v = df.loc[(df.gt != 0) & (df.model_v != 0) & (df.benchmark_v != 0), :]
    mv = stats.kendalltau(df_v.loc[:, 'gt'], df_v.loc[:, 'model_v'])
    quant_list.append((region, 'model', 'v', mv.correlation, mv.pvalue))

    bv = stats.kendalltau(df_v.loc[:, 'gt'], df_v.loc[:, 'benchmark_v'])
    quant_list.append((region, 'benchmark', 'v', bv.correlation, bv.pvalue))
df_stats = pd.DataFrame(quant_list, columns=['region', 'type', 'data', 'cor', 'p'])
df_stats

Unnamed: 0,region,type,data,cor,p
0,sweden,model,c,0.18727,6.653739999999999e-246
1,sweden,benchmark,c,0.220507,9.577697000000001e-277
2,sweden,model,v,0.18407,2.009274e-128
3,sweden,benchmark,v,0.266851,3.4163179999999996e-215
4,netherlands,model,c,0.439008,0.0
5,netherlands,benchmark,c,0.331307,0.0
6,netherlands,model,v,0.432798,0.0
7,netherlands,benchmark,v,0.40465,0.0
8,saopaulo,model,c,0.414933,0.0
9,saopaulo,benchmark,c,0.296695,0.0


In [10]:
df_stats.groupby(['region', 'type'])['cor'].mean()



region       type     
netherlands  benchmark    0.367978
             model        0.435903
saopaulo     benchmark    0.326389
             model        0.447656
sweden       benchmark    0.243679
             model        0.185670
Name: cor, dtype: float64