In [1]:
import os
import gc
from tqdm import tqdm
import pandas as pd
import geopandas as gpd
from shapely.geometry import LineString
import matplotlib.pyplot as plt

In [9]:
def _metadata_or_global(value, name):
    """Return ``value``, or the module-level variable ``name`` when ``value`` is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        # Same exception type the bare global lookup would have produced.
        raise NameError(f"name '{name}' is not defined") from None


def _align_crs(frame, crs):
    """Reproject ``frame`` to ``crs`` when its CRS is known and differs; otherwise return it unchanged."""
    if frame.crs is not None and frame.crs != crs:
        return frame.to_crs(crs)
    return frame


def get_ookla_nuts_level(nuts, read_root, folder, file, eu_range,
                         year=None, quarter=None, network_type=None):
    """Aggregate one Ookla tile file to area-weighted mean speeds per NUTS region.

    The tiles are read from ``read_root/folder/file``, reprojected to
    EPSG:3035, reduced to their centroids, clipped to ``eu_range`` and then
    spatially joined to ``nuts``. For every NUTS region the download and upload
    speeds of the matching centroids are averaged, weighted by the area of the
    tile each centroid came from: ``sum(speed * area) / sum(area)``.

    Parameters
    ----------
    nuts : geopandas.GeoDataFrame
        Regions to aggregate to; must have a ``NUTS_ID`` column.
    read_root, folder, file : str
        Path components of the tile file to read.
    eu_range : geopandas.GeoDataFrame
        Mask passed to ``gpd.clip`` to discard centroids outside it.
    year, quarter, network_type : optional
        Labels copied verbatim into every output row. When left as ``None``
        each one is looked up as a module-level variable of the same name,
        which is how the function originally obtained them.

    Returns
    -------
    dict
        ``{NUTS_ID: [quarter, network_type, unit, freq, avg_download,
        avg_upload, year]}`` with one entry per distinct ``NUTS_ID``, in order
        of first appearance in ``nuts``. Regions that contain no centroid get
        NaN for both averages.

    Raises
    ------
    NameError
        If a label is neither passed in nor defined at module level.
    """
    # Resolve the labels first so a missing one fails before the expensive I/O.
    year = _metadata_or_global(year, 'year')
    quarter = _metadata_or_global(quarter, 'quarter')
    network_type = _metadata_or_global(network_type, 'network_type')
    unit = 'kbps'
    freq = 'quarter'

    # read ookla data, reproject and calculate the area of each tile
    tile_path = os.path.join(read_root, folder, file)
    print(tile_path)
    ookla_global = gpd.read_file(tile_path)
    print('ookla loaded')
    ookla_global.to_crs('EPSG:3035', inplace=True)
    print('transformed')
    ookla_global['Shape_Area'] = ookla_global.area
    print('area calculated')

    # The clip and the join compare raw coordinates, so both helper layers
    # have to share the tiles' CRS. Layers without CRS metadata are left as
    # they are (assumed to already be in EPSG:3035 - TODO confirm).
    eu_range = _align_crs(eu_range, ookla_global.crs)
    nuts = _align_crs(nuts, ookla_global.crs)

    # replace each ookla polygon by its centroid
    ookla_global.geometry = ookla_global['geometry'].centroid

    # mask the centroids by the EU boundary
    ookla = gpd.clip(ookla_global, eu_range)
    del ookla_global
    gc.collect()

    # One spatial join for all regions instead of one join per region; only
    # the columns needed for the weighted mean are carried through.
    points = ookla[['avg_d_kbps', 'avg_u_kbps', 'Shape_Area', ookla.geometry.name]]
    regions = nuts[['NUTS_ID', nuts.geometry.name]]
    joined = gpd.sjoin(points, regions, how='inner')
    del ookla, points
    gc.collect()

    tile_area = joined['Shape_Area']
    weighted = pd.DataFrame({
        'NUTS_ID': joined['NUTS_ID'],
        'area': tile_area,
        'area*d': joined['avg_d_kbps'] * tile_area,
        'area*u': joined['avg_u_kbps'] * tile_area,
    })
    del joined
    totals = weighted.groupby('NUTS_ID', sort=False).sum()

    # Reindex so that regions without any centroid are still reported (as NaN).
    region_ids = nuts['NUTS_ID'].drop_duplicates()
    avg_d_kbps = (totals['area*d'] / totals['area']).reindex(region_ids)
    avg_u_kbps = (totals['area*u'] / totals['area']).reindex(region_ids)

    speed_dict = {
        nuts_id: [quarter, network_type, unit, freq, down, up, year]
        for nuts_id, down, up in zip(region_ids, avg_d_kbps, avg_u_kbps)
    }

    return speed_dict

In [None]:
if __name__ == "__main__":
    # Driver: aggregate every Ookla tile shapefile to NUTS level and write one
    # long-format Excel sheet per source folder.
    nuts = gpd.read_file('/data/xiang/1-Data/NUTS/NUTS_RG_01M_2021_3035.shp')
    eu_range = gpd.read_file('/data/xiang/1-Data/NUTS/eu.shp')

    read_root = '/data/xiang/1-Data/Ookla'
    save_path = '/data/xiang/1-Data/Ookla_eu.xlsx'
    out_dir = '/data/xiang/3-case studies/0-ookla data'
    os.makedirs(out_dir, exist_ok=True)

    # Only the first `folder_limit` folders are processed; set it to None to
    # process all of them. Sorting makes the selection reproducible, because
    # os.listdir returns entries in arbitrary order.
    folder_limit = 2
    folders = sorted(
        entry for entry in os.listdir(read_root)
        if os.path.isdir(os.path.join(read_root, entry))
    )[:folder_limit]

    for folder in tqdm(folders):
        # get the input variables from the folder name
        # (presumably shaped like '<year>-<month>-<day>_..._<network type>_<suffix>' - verify against the data)
        # NOTE: get_ookla_nuts_level picks up `year`, `quarter` and
        # `network_type` from module scope, so they must be assigned before the call.
        year = folder.split('-')[0]
        # (month - 1) // 3 + 1 maps months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
        quarter = (int(folder.split('-')[1]) - 1) // 3 + 1
        network_type = folder.split('_')[-2]

        folder_path = os.path.join(read_root, folder)
        for file in os.listdir(folder_path):
            if not file.endswith('.shp'):
                continue

            # aggregate this quarter's tiles and turn the result into a table with one row per NUTS region
            speed_dict = get_ookla_nuts_level(nuts, read_root, folder, file, eu_range)
            speed_df = pd.DataFrame(speed_dict).T.reset_index()
            speed_df.columns = ['geo', 'quarter', 'network_type', 'unit', 'freq', 'download', 'upload', 'obsTime']

            # reshape to long format (one row per region and direction) and save it
            reshap_df = pd.melt(
                speed_df,
                id_vars=['geo', 'quarter', 'unit', 'freq', 'obsTime', 'network_type'],
                value_vars=['download', 'upload'],
                var_name='direction',
                value_name='obsValue',
            )
            # One workbook per source folder, written inside out_dir.
            reshap_df.to_excel(os.path.join(out_dir, folder + '.xlsx'))

  0%|                                                    | 0/42 [00:00<?, ?it/s]