In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from shapely.geometry import MultiPoint
from geopy.distance import great_circle
import requests
import json
import time
import duckdb
import awscli

from tqdm import tqdm

In [None]:
# City names as they appear in the data directory and file names
# (../Data/Netmob/<city>/...).
cities = [
    'Bordeaux', 'Clermont-Ferrand', 'Dijon', 'Grenoble',
    'Lille', 'Lyon', 'Mans', 'Marseille',
    'Metz', 'Montpellier', 'Nancy', 'Nantes',
    'Nice', 'Orleans', 'Paris', 'Rennes',
    'Saint-Etienne', 'Strasbourg', 'Toulouse', 'Tours',
]

# Application names, also used as directory / file name components.
apps = ['Web_Adult', 'Tor', 'YouTube']

# Traffic direction suffix of the input files (presumably 'DL' = downlink — confirm).
traffic_dir = ['DL']

### Sum tile-level activity across time

In [None]:
# Indices 0-28 followed by 88-96.
# NOTE(review): presumably column indices of night-time slots in the daily files;
# not referenced by any other cell visible in this notebook — confirm it is still needed.
nighttime = [*range(29), *range(88, 97)]

In [None]:
# Build one record per tile, app and city holding the tile's traffic summed and
# averaged over all daily files from 16 March to 31 May 2019.
# Per-combination frames are collected in a list and concatenated once at the
# end instead of repeatedly concatenating onto a growing (initially empty) frame.
summaries = []

for city_str in cities:
    for app_str in apps:
        for rate_str in traffic_dir:

            # Wide table: one row per tile, one column per day.
            daily_wide = pd.DataFrame()

            for month in range(3, 6):
                print("Month", month, "in", city_str, "for", app_str, rate_str)

                # March starts on the 16th; April has 30 days, March and May 31.
                first_day = 16 if month == 3 else 1
                last_day = 30 if month == 4 else 31

                for day in tqdm(range(first_day, last_day + 1)):
                    day_str = f'2019{month:02d}{day:02d}'

                    # Space-separated file without header: column 0 is used as
                    # the tile id, all remaining columns are summed per row.
                    day_raw = pd.read_csv(f'../Data/Netmob/{city_str}/{app_str}/{day_str}/{city_str}_{app_str}_{day_str}_{rate_str}.txt', sep = " ", header=None)

                    value_cols = [col for col in day_raw.columns if col != 0]
                    day_total = pd.DataFrame({
                        'tile_id': day_raw[0],
                        day_str: day_raw[value_cols].sum(axis=1),
                    })

                    # The first day defines the set of tiles; later days are
                    # left-joined onto it.
                    if daily_wide.empty:
                        daily_wide = day_total
                    else:
                        daily_wide = daily_wide.merge(day_total, how = 'left', on = 'tile_id')

            day_cols = [col for col in daily_wide.columns if col != 'tile_id']

            summaries.append(pd.DataFrame({
                'tile_id': daily_wide['tile_id'],
                'traffic_sum': daily_wide[day_cols].sum(axis=1),
                # mean(axis=1) skips days on which a tile is missing (NaN after the left join)
                'traffic_mean': daily_wide[day_cols].mean(axis=1),
                'traffic_dir': rate_str,
                'apps': app_str,
                'cities': city_str,
            }))

# The index is not reset here, so index labels repeat across combinations.
df = pd.concat(summaries) if summaries else pd.DataFrame()
                
            


In [None]:
# Preview the first rows of the combined per-tile traffic summary.
df.head()

In [None]:
# (rows, columns) of the combined per-tile traffic summary.
df.shape

### Identify top active tiles

Add cpc correction factor

In [None]:
# Tile layer saved by an earlier step; the merge below reads its
# tile_id, code_com, name_com and cities columns.
map_df = gpd.read_file("../midsave/map_crime.gpkg")

In [None]:
# Per-commune correction factor (column `c`), keyed by `code_com`.
# code_com is forced to string (presumably to keep leading zeros — confirm).
cpc = pd.read_csv("../midsave/cpc_com.csv", dtype={'code_com': str})

In [None]:
# Preview the tile layer.
map_df.head()

In [None]:
# Preview the correction-factor table.
cpc.head()

In [None]:
# Attach the commune code/name of each tile, then the commune's correction
# factor `c`. Both joins are left joins, so tiles without a match keep NaN.
# Note: this cell rebinds `df`; running it a second time without rebuilding
# `df` would merge the commune columns in again.
tile_communes = map_df[['tile_id', 'code_com', 'name_com', 'cities']]
commune_factor = cpc[['code_com', 'c']]

df = df.merge(tile_communes, how = 'left', on = ['tile_id', 'cities'])
df = df.merge(commune_factor, how = 'left', on = ['code_com'])

In [None]:
# Mean daily traffic multiplied by the commune-level factor `c`.
# Despite its name, the column is computed for every app in `df`; the
# Web_Adult and YouTube rankings further down use it as well.
df['Tor_scaled'] = df['traffic_mean']*df['c']

Number of top tiles inspected (or alternatively top X%)

In [None]:
# 0.1 % of the number of rows in `df`, truncated to an integer.
# `df` holds rows for all apps; `n` is then used as the number of top tiles kept per app.
n = int(df.shape[0]*0.001)

In [None]:
# Show how many top tiles will be inspected.
n

In [None]:
# Tor download rows only, then the n rows with the largest scaled mean traffic.
tor_dl = df.query("apps == 'Tor'").query("traffic_dir == 'DL'")
top10 = tor_dl.nlargest(n, 'Tor_scaled')

In [None]:
# Cities that contain at least one of the top Tor tiles.
top10.cities.unique()

In [None]:
# Attach tile geometries to the top Tor tiles, one city at a time.
# Per-city results are collected and concatenated once at the end.
city_frames = []

for city_str in top10.cities.unique():

    # Tile polygons of this city. `shape` stays bound to the last city read;
    # its CRS is reused when the GeoDataFrame is built in the next cell.
    shape = gpd.read_file(f'../Data/Netmob/{city_str}/{city_str}.geojson')
    shape['cities'] = city_str

    # Inner join: only top tiles of this city that have a geometry survive.
    city_frames.append(top10.merge(shape, how = 'inner', on = ['tile_id','cities']))

top_tiles = pd.concat(city_frames) if city_frames else pd.DataFrame()


In [None]:
# Turn the merged table into a GeoDataFrame (CRS taken from the last tile layer
# read), order it by scaled traffic and give it a clean 0..k-1 index, which the
# positional `.loc[i, ...]` lookups further down rely on.
top_tiles = gpd.GeoDataFrame(top_tiles, geometry=top_tiles['geometry'], crs=shape.crs)
top_tiles = top_tiles.nlargest(n, 'Tor_scaled').reset_index(drop = True)

### Querying Overture maps for POI

#### For Tor

Uncomment the following line and run it once (as a shell command) to download the Overture data; keep it commented out afterwards.

In [None]:
# One-off download of the Overture "places" theme (release 2023-07-26-alpha.0).
# This is a shell command: when uncommented in a notebook cell it needs a leading `!`.
# NOTE(review): the destination is an absolute, user-specific path, while the query
# cell below reads from ../Data/Overture — adjust the target directory before running.
# aws s3 cp --region us-west-2 --no-sign-request --recursive s3://overturemaps-us-west-2/release/2023-07-26-alpha.0/theme=places/ /Users/tillkoebe/Documents/Data/Overture

In [None]:
# DuckDB connection opened with default arguments; used to query the Overture parquet files.
db = duckdb.connect()

In [None]:
# Select every Overture place whose bounding box lies strictly inside the
# lon/lat window given in the WHERE clause, and turn each result row into a
# {column name: value} dictionary.
cursor = db.execute(
    """
select *
from read_parquet('../Data/Overture/type=place/*')
where
    bbox.minx > -4.9857056141 
    AND bbox.maxx < 8.4615600109 
    AND bbox.miny > 42.1448396402 
    AND bbox.maxy < 51.2187257569
"""
)
rows = cursor.fetchall()

# The first element of each description entry is the column name.
columns = [column_meta[0] for column_meta in cursor.description]
dicts = [
    {column: value for column, value in zip(columns, row)}
    for row in rows
]

In [None]:
# Inspect one raw record to see the nested structure used in the next cell.
dicts[0]

In [None]:
# Flatten each raw Overture record to id, name, main category and the centre
# of its bounding box.
# NOTE(review): the name lookup assumes every record has at least one name
# entry in exactly this nested layout — confirm against the Overture release used.
poi_records = (
    {
        'place_id': d['id'],
        'name': d['names']['value'][0][0]['value'][0],
        'category': d['categories']['main'],
        # bounding-box midpoint
        'place_lon': (d['bbox']['maxx'] + d['bbox']['minx']) / 2,
        'place_lat': (d['bbox']['maxy'] + d['bbox']['miny']) / 2,
    }
    for d in dicts
)

overture_df = pd.DataFrame(data=list(poi_records))

In [None]:
# Point geometry per POI, built from the bounding-box midpoints.
# The CRS is copied from `shape`, i.e. the last tile layer read in an earlier cell.
# NOTE(review): assumes that layer's CRS matches the lon/lat coordinates of the
# Overture bounding boxes — confirm.
poi_points = gpd.points_from_xy(overture_df.place_lon, overture_df.place_lat)
overture_gdf = gpd.GeoDataFrame(overture_df, geometry=poi_points, crs=shape.crs)

In [None]:
# Persist the POI layer as a GeoPackage (layer name 'overture').
overture_gdf.to_file("../midsave/overture.gpkg", layer='overture', driver="GPKG")

In [None]:
# Distinct main categories present in the selected POIs.
overture_df['category'].unique()

In [None]:
# (number of POIs, number of columns)
overture_df.shape

In [None]:
# Start from an empty frame; the next cell fills it with the POIs found in the top Tor tiles.
overture_tor = pd.DataFrame()

In [None]:
# For every top Tor tile, select the POIs lying within its polygon and tag them
# with the tile's id, city and scaled traffic.
# Matches are collected in a list and concatenated once, so re-running this
# cell rebuilds `overture_tor` from scratch instead of appending duplicates.
tile_matches = []

for i in tqdm(range(top_tiles.shape[0])):
    # `.loc[i, ...]` relies on top_tiles having a 0..k-1 index.
    tile_geom = top_tiles.loc[i, 'geometry']
    tile_matches.append(
        overture_gdf.loc[overture_gdf.within(tile_geom)].assign(
            tile_id=top_tiles.loc[i, 'tile_id'],
            cities=top_tiles.loc[i, 'cities'],
            traffic_mean_tor=top_tiles.loc[i, 'Tor_scaled'],
        )
    )

overture_tor = pd.concat(tile_matches) if tile_matches else pd.DataFrame()

In [None]:
def sum_divided_by_count(series):
    """Return the sum of ``series`` divided by its number of non-null values.

    This is the mean over the non-null entries. When there are no non-null
    entries (empty or all-NaN input) NaN is returned explicitly rather than
    evaluating a 0 / 0 division.

    Parameters
    ----------
    series : pandas.Series
        Values to aggregate; used as an aggregation function in ``groupby().agg``.

    Returns
    -------
    float
        ``series.sum() / series.count()``, or NaN if ``series.count()`` is 0.
    """
    non_null = series.count()
    if non_null == 0:
        return float("nan")
    return series.sum() / non_null

In [None]:
# Split each tile's scaled traffic evenly across the POIs found in it.
# All rows of a tile carry the same traffic value, so the group mean is that
# value and the group count is the number of POIs in the tile.
tor_by_tile = overture_tor.groupby(['cities', 'tile_id'])['traffic_mean_tor']
overture_tor['traffic_mean_tor_per_poi'] = tor_by_tile.transform('mean') / tor_by_tile.transform('count')

In [None]:
# Rank POI categories by their average per-POI Tor traffic:
#   1. keep each place once,
#   2. per category, average the per-POI traffic (sum / non-null count) and count the places,
#   3. sort descending, keep categories with at least 3 places, show the top 10.
(overture_tor
 .drop_duplicates(subset=['place_id'])
 .groupby(['category'])
 .agg({'traffic_mean_tor_per_poi': sum_divided_by_count, 'place_id': 'count'})
 .reset_index()
 .rename(columns={'place_id': 'category_count'})
 .sort_values(by=['traffic_mean_tor_per_poi'], ascending = False)
 .query('category_count >= 3')
 .head(10))

#### For Web Adult

In [None]:
# Web_Adult download rows only, then the n rows with the largest scaled mean
# traffic (the column is named 'Tor_scaled' but holds the value for every app).
web_adult_dl = df.query("apps == 'Web_Adult'").query("traffic_dir == 'DL'")
top10_wa = web_adult_dl.nlargest(n, 'Tor_scaled')

In [None]:
# Attach tile geometries to the top Web_Adult tiles, one city at a time.
# Per-city results are collected and concatenated once at the end.
city_frames_wa = []

for city_str in top10_wa.cities.unique():

    # Tile polygons of this city. `shape` stays bound to the last city read;
    # its CRS is reused when the GeoDataFrame is built in the next cell.
    shape = gpd.read_file(f'../Data/Netmob/{city_str}/{city_str}.geojson')
    shape['cities'] = city_str

    # Inner join: only top tiles of this city that have a geometry survive.
    city_frames_wa.append(top10_wa.merge(shape, how = 'inner', on = ['tile_id','cities']))

top_tiles_wa = pd.concat(city_frames_wa) if city_frames_wa else pd.DataFrame()

In [None]:
# GeoDataFrame of the top Web_Adult tiles (CRS from the last tile layer read),
# ordered by scaled traffic and re-indexed 0..k-1 for the `.loc[i, ...]` lookups below.
top_tiles_wa = gpd.GeoDataFrame(top_tiles_wa, geometry=top_tiles_wa['geometry'], crs=shape.crs)
top_tiles_wa = top_tiles_wa.nlargest(n, 'Tor_scaled').reset_index(drop = True)

In [None]:
# Show the highest-ranked Web_Adult tile.
top_tiles_wa.head(1)

In [None]:
# Start from an empty frame; the next cell fills it with the POIs found in the top Web_Adult tiles.
overture_wa = pd.DataFrame()

In [None]:
# For every top Web_Adult tile, select the POIs lying within its polygon and
# tag them with the tile's id, city and scaled traffic.
# Matches are collected in a list and concatenated once, so re-running this
# cell rebuilds `overture_wa` from scratch instead of appending duplicates.
tile_matches_wa = []

for i in tqdm(range(top_tiles_wa.shape[0])):
    # `.loc[i, ...]` relies on top_tiles_wa having a 0..k-1 index.
    tile_geom = top_tiles_wa.loc[i, 'geometry']
    tile_matches_wa.append(
        overture_gdf.loc[overture_gdf.within(tile_geom)].assign(
            tile_id=top_tiles_wa.loc[i, 'tile_id'],
            cities=top_tiles_wa.loc[i, 'cities'],
            traffic_mean_wa=top_tiles_wa.loc[i, 'Tor_scaled'],
        )
    )

overture_wa = pd.concat(tile_matches_wa) if tile_matches_wa else pd.DataFrame()

In [None]:
# Split each tile's scaled traffic evenly across the POIs found in it.
# All rows of a tile carry the same traffic value, so the group mean is that
# value and the group count is the number of POIs in the tile.
wa_by_tile = overture_wa.groupby(['cities', 'tile_id'])['traffic_mean_wa']
overture_wa['traffic_mean_wa_per_poi'] = wa_by_tile.transform('mean') / wa_by_tile.transform('count')

In [None]:
# Rank POI categories by their average per-POI Web_Adult traffic:
#   1. keep each place once,
#   2. per category, average the per-POI traffic (sum / non-null count) and count the places,
#   3. sort descending, keep categories with at least 3 places, show the top 10.
(overture_wa
 .drop_duplicates(subset=['place_id'])
 .groupby(['category'])
 .agg({'traffic_mean_wa_per_poi': sum_divided_by_count, 'place_id': 'count'})
 .reset_index()
 .rename(columns={'place_id': 'category_count'})
 .sort_values(by=['traffic_mean_wa_per_poi'], ascending = False)
 .query('category_count >= 3')
 .head(10))

#### For YouTube

In [None]:
# YouTube download rows only, then the n rows with the largest scaled mean
# traffic (the column is named 'Tor_scaled' but holds the value for every app).
youtube_dl = df.query("apps == 'YouTube'").query("traffic_dir == 'DL'")
top10_yt = youtube_dl.nlargest(n, 'Tor_scaled')

In [None]:
# Attach tile geometries to the top YouTube tiles, one city at a time.
# Per-city results are collected and concatenated once at the end.
city_frames_yt = []

for city_str in top10_yt.cities.unique():

    # Tile polygons of this city. `shape` stays bound to the last city read;
    # its CRS is reused when the GeoDataFrame is built in the next cell.
    shape = gpd.read_file(f'../Data/Netmob/{city_str}/{city_str}.geojson')
    shape['cities'] = city_str

    # Inner join: only top tiles of this city that have a geometry survive.
    city_frames_yt.append(top10_yt.merge(shape, how = 'inner', on = ['tile_id','cities']))

top_tiles_yt = pd.concat(city_frames_yt) if city_frames_yt else pd.DataFrame()

In [None]:
# GeoDataFrame of the top YouTube tiles (CRS from the last tile layer read),
# ordered by scaled traffic and re-indexed 0..k-1 for the `.loc[i, ...]` lookups below.
top_tiles_yt = gpd.GeoDataFrame(top_tiles_yt, geometry=top_tiles_yt['geometry'], crs=shape.crs)
top_tiles_yt = top_tiles_yt.nlargest(n, 'Tor_scaled').reset_index(drop = True)

In [None]:
# Start from an empty frame; the next cell fills it with the POIs found in the top YouTube tiles.
overture_yt = pd.DataFrame()

In [None]:
# For every top YouTube tile, select the POIs lying within its polygon and tag
# them with the tile's id, city and scaled traffic.
# Matches are collected in a list and concatenated once, so re-running this
# cell rebuilds `overture_yt` from scratch instead of appending duplicates.
tile_matches_yt = []

for i in tqdm(range(top_tiles_yt.shape[0])):
    # `.loc[i, ...]` relies on top_tiles_yt having a 0..k-1 index.
    tile_geom = top_tiles_yt.loc[i, 'geometry']
    tile_matches_yt.append(
        overture_gdf.loc[overture_gdf.within(tile_geom)].assign(
            tile_id=top_tiles_yt.loc[i, 'tile_id'],
            cities=top_tiles_yt.loc[i, 'cities'],
            traffic_mean_yt=top_tiles_yt.loc[i, 'Tor_scaled'],
        )
    )

overture_yt = pd.concat(tile_matches_yt) if tile_matches_yt else pd.DataFrame()

In [None]:
# Split each tile's scaled traffic evenly across the POIs found in it.
# All rows of a tile carry the same traffic value, so the group mean is that
# value and the group count is the number of POIs in the tile.
yt_by_tile = overture_yt.groupby(['cities', 'tile_id'])['traffic_mean_yt']
overture_yt['traffic_mean_yt_per_poi'] = yt_by_tile.transform('mean') / yt_by_tile.transform('count')

In [None]:
# Rank POI categories by their average per-POI YouTube traffic:
#   1. keep each place once,
#   2. per category, average the per-POI traffic (sum / non-null count) and count the places,
#   3. sort descending, keep categories with at least 3 places, show the top 10.
(overture_yt
 .drop_duplicates(subset=['place_id'])
 .groupby(['category'])
 .agg({'traffic_mean_yt_per_poi': sum_divided_by_count, 'place_id': 'count'})
 .reset_index()
 .rename(columns={'place_id': 'category_count'})
 .sort_values(by=['traffic_mean_yt_per_poi'], ascending = False)
 .query('category_count >= 3')
 .head(10))