In [None]:
# Enable IPython's autoreload extension in mode 2, so that the imported modules
# (notably the project's `src` package) are reloaded before executing each
# cell, and edits made to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import cProfile
import multiprocessing as mp

import numpy as np
import pandas as pd
import geopandas as geopd
try:
    import cld3
except ModuleNotFoundError:
    pass
import pycld2
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
from dotenv import load_dotenv

import src.utils.geometry as geo
import src.utils.places_to_cells as places_to_cells
import src.data.shp_extract as shp_extract
import src.data.tweets_cells_counts as tweets_counts
import src.data.text_process as text_process
import src.data.access as data_access
import src.visualization.grid_viz as grid_viz
import src.data.user_filters as ufilters
import src.data.user_agg as uagg
# Load the environment variables defined in a .env file, if there is one. The
# notebook reads DATA_DIR, IFISC_DOMAIN and IFISC_USERNAME from the environment.
load_dotenv()

# Restore pandas' default value for the maximum number of rows displayed.
pd.reset_option("display.max_rows")

In [None]:
# `display` and `HTML` belong to the public `IPython.display` API; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a (currently empty) CSS rule for the notebook's `.container` element.
display(HTML("<style>.container </style>"))
# Uncomment to make the notebook span the whole width of the browser window:
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Location of the raw data and credentials used to reach it over SSH, all read
# from the environment.
data_dir_path = os.environ['DATA_DIR']
tweets_files_format = 'tweets_2015_2018_{}.json.gz'
places_files_format = 'places_2015_2018_{}.json.gz'
ssh_domain = os.environ['IFISC_DOMAIN']
ssh_username = os.environ['IFISC_USERNAME']

# Data directories of the project, relative to the notebook's directory.
project_data_dir = os.path.join('..', 'data')
external_data_dir, interim_data_dir, processed_data_dir = (
    os.path.join(project_data_dir, sub_dir)
    for sub_dir in ('external', 'interim', 'processed'))
# Template of the path of the cell data files, to be formatted with the level
# of the study, the country code and the cell size.
cell_data_path_format = os.path.join(
    processed_data_dir, '{}_cell_data_cc={}_cell_size={}m.geojson')
latlon_proj = 'epsg:4326'
# Maps every language code known to pycld2 to its capitalized name.
LANGS_DICT = {lang[1]: lang[0].lower().capitalize()
              for lang in pycld2.LANGUAGES}

country_codes = ('BO', 'CA', 'CH', 'EE', 'ES', 'FR', 'HK', 'ID', 'LT', 'LV',
                 'MY', 'PE', 'RO', 'SG', 'TN', 'UA')
# Per-country study parameters, indexed by country code.
countries_file_path = os.path.join(external_data_dir, 'countries.json')
with open(countries_file_path) as f:
    countries_study_data = json.load(f)

# Country specific parameters
cc = 'CH'
fig_dir = os.path.join('..', 'reports', 'figures', cc)
# Create the sub-directories in which the figures are saved. `os.makedirs`
# also creates the missing parent directories (including `fig_dir` itself),
# and `exist_ok=True` makes this cell safe to re-run.
for fig_sub_dir in ('counts', 'prop'):
    os.makedirs(os.path.join(fig_dir, fig_sub_dir), exist_ok=True)
# Projection and languages to plot are read from the study data of the country.
xy_proj = countries_study_data[cc]['xy_proj']
plot_langs_list = countries_study_data[cc]['local_langs']
# File in which the uids of the users kept after filtering are saved.
valid_uids_path = os.path.join(interim_data_dir, f'valid_uids_{cc}.csv')

**TODO:** get rid of bots and company accounts (e.g. careerarc, tweetmyjobs). If a company cannot be identified from the source of its tweets, as is possible for careerarc, then how can we detect it?

## Getting the data

### Places, area and grid

In [None]:
shapefile_name = 'CNTR_RG_01M_2016_4326.shp'
# The shapefile is stored in a directory bearing the same name.
shapefile_path = os.path.join(external_data_dir, shapefile_name, shapefile_name)
shape_df = geopd.read_file(shapefile_path)
# Keep only the shape of the country under study.
in_country = shape_df['FID'] == cc
shape_df = shape_df.loc[in_country]
country_name = shape_df['NAME_ENGL'].iloc[0]

A place can also be a single point (its bounding box then has zero area); in this case, we treat it like a tweet with coordinates.

In [None]:
def geo_from_bbox(bbox):
    '''
    Builds the geometry of a place from its bounding box `bbox`, a mapping whose
    'coordinates' entry is a sequence, the first element of which is the
    sequence of the corners of the box.

    Returns the tuple `(geometry, area)`: `geometry` is the Polygon defined by
    the corners, or the Point located at the first corner when this polygon has
    a null area; `area` is the area of the polygon, in the units of the
    coordinates squared.
    '''
    corners = bbox['coordinates'][0]
    polygon = Polygon(corners)
    area = polygon.area
    # A degenerate box (all corners at the same location) is actually a point.
    if area == 0:
        return Point(corners[0]), area
    return polygon, area

places_file_path = os.path.join(data_dir_path, places_files_format.format(cc))
raw_places_df = data_access.return_json(places_file_path,
    ssh_domain=ssh_domain, ssh_username=ssh_username, compression='gzip')
places_df = raw_places_df[['id', 'bounding_box', 'name', 'place_type']].copy()
# `geo_from_bbox` returns a (geometry, area) tuple for every place, which we
# unzip into two columns.
places_df['geometry'], places_df['area'] = zip(
    *places_df['bounding_box'].apply(geo_from_bbox))
# The 'area' column computed above is carried over to the GeoDataFrame, so it
# does not need to be assigned again.
places_geodf = geopd.GeoDataFrame(
    places_df, crs=latlon_proj, geometry=places_df['geometry'])
# 'id' is kept as a column too, because the overlay below resets the index.
places_geodf = places_geodf.set_index('id', drop=False)
# Since the places' bbox can stretch outside of the whole shape, we need to 
# take the intersection between the two. However, we only use the overlay to 
# calculate the area, so that places_to_cells distributes the whole population 
# of a place to the different cells within it. However we don't need the actual 
# geometry from the intersection, which is more complex and thus slows down 
# computations later on.
poly_mask = places_geodf['area'] > 0
polygons_in_shape = geopd.overlay(
    shape_df[['geometry']], places_geodf.loc[poly_mask], how='intersection')
polygons_in_shape = polygons_in_shape.set_index('id')
# The area of the polygons is replaced by the area of their intersection with
# the shape, calculated in the `xy_proj` projection. The assignment is aligned
# on the place ids.
places_geodf.loc[poly_mask, 'area'] = polygons_in_shape.to_crs(xy_proj).area
places_geodf = places_geodf.drop(columns=['bounding_box', 'id'])
places_in_xy = places_geodf.geometry.to_crs(xy_proj)
places_geodf.head()

In [None]:
# Size of the side of a cell, in meters.
cell_size = 5000
max_place_area = 1e9 # linked to cell size and places data
cells_df, cells_in_area_df = geo.create_grid(
    shape_df, cell_size, latlon_proj, xy_proj, intersect=True)
# To check the grid visually, we plot it with a constant dummy metric.
grid_test_df = cells_in_area_df.assign(metric=1)
save_path = os.path.join(fig_dir, f'grid_cc={cc}_cell_size={cell_size}m.pdf')
plot_kwargs = {
    'alpha': 0.7, 'edgecolor': 'w', 'linewidths': 0.5, 'cmap': 'Purples'}
ax = grid_viz.plot_grid(
    grid_test_df, shape_df, metric_col='metric', show=True,
    save_path=save_path, xy_proj=xy_proj, **plot_kwargs)

### Reading the data

In [None]:
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))

def read_data(tweets_file_path, chunk_start, chunk_size):
    '''
    Reads the chunk of the file located at `tweets_file_path` delimited by
    `chunk_start` and `chunk_size`, through `data_access.read_json_wrapper` and
    with the SSH credentials defined at the notebook level, and returns the
    result.
    '''
    return data_access.read_json_wrapper(
        tweets_file_path, chunk_start, chunk_size,
        ssh_domain=ssh_domain, ssh_username=ssh_username)

def profile_pre_process(tweets_file_path, chunk_start, chunk_size):
    '''
    Runs `read_data` on the given chunk under cProfile, which prints the
    profiling report.
    '''
    # The statement is evaluated with the local variables of this function, so
    # the names it contains must match the parameters above.
    profiled_statement = 'read_data(tweets_file_path, chunk_start, chunk_size)'
    cProfile.runctx(profiled_statement, globals(), locals())

tweets_access_res = []
with mp.Pool(8) as pool:
    chunks = data_access.chunkify(
        tweets_file_path, size=1e9, ssh_domain=ssh_domain,
        ssh_username=ssh_username)
    # Every chunk is read asynchronously by one of the workers of the pool.
    for chunk_start, chunk_size in chunks:
        async_res = pool.apply_async(
            read_data, (tweets_file_path, chunk_start, chunk_size))
        tweets_access_res.append(async_res)

    # We have to wait for every result before leaving the `with` block,
    # otherwise the pool would be shut down before all the chunks have been
    # processed.
    for res in tweets_access_res:
        res.get()

In [None]:
# Monthly counts of all tweets, and of the "problematic" ones: those whose GPS
# coordinates are not within the place they were tagged in.
tweeted_months = None
tweets_pb_months = None
for res in tweets_access_res:
    tweets_df = res.get().copy()
    tweets_df['month'] = tweets_df['created_at'].dt.to_period('M')
    has_gps = tweets_df['coordinates'].notnull()
    gps_coords = tweets_df.loc[has_gps, 'coordinates']
    geometry = gps_coords.apply(lambda x: Point(x['coordinates']))
    tweets_coords = geopd.GeoSeries(
        geometry, crs=latlon_proj, index=gps_coords.index)
    # The join adds the geometry of the place of every tweet.
    tweets_df = tweets_df.join(places_geodf, on='place_id', how='left')
    places_geometry = geopd.GeoSeries(tweets_df.loc[has_gps, 'geometry'])
    coords_in_place = tweets_coords.within(places_geometry)

    tweeted_months = tweets_counts.increment_counts(
        tweeted_months, tweets_df, ['month'])
    tweets_outside_df = tweets_df.loc[has_gps].loc[~coords_in_place]
    tweets_pb_months = tweets_counts.increment_counts(
        tweets_pb_months, tweets_outside_df, ['month'])

In [None]:
# Proportion, for every month, of the tweets whose coordinates are outside of
# the place they were tagged in.
months_counts = tweeted_months.join(tweets_pb_months, rsuffix='_pb', how='left')
months_counts['prop'] = months_counts['count_pb'] / months_counts['count']
ax = months_counts['prop'].plot.bar()
# We only keep one tick label out of five to keep the axis readable. The ticks
# are derived from the number of bars actually plotted, so that this also
# works for datasets not spanning exactly 48 months.
tick_labels = ax.get_xticklabels()
ticks = np.arange(0, len(tick_labels), 5)
_ = ax.set_xticks(ticks)
_ = ax.set_xticklabels([tick_labels[i] for i in ticks])
_ = ax.set_ylabel('proportion')
_ = ax.set_title('Proportion of tweets with coords outside of place')

### Filtering out users

User-based filters imply a loop over all the `raw_tweets_df` chunks, and must be applied before getting `tweets_lang_df` and even `tweets_loc_df`, because the tweets of the filtered-out users don't interest us at all.

This step is incremental, so it can't be parallelized. It's also rather fast, so parallelizing it wouldn't be worth the time anyway.

In [None]:
# Counts of the activity of every user in every month, incremented chunk by
# chunk.
tweeted_months_users = None
for res in tweets_access_res:
    raw_tweets_df = res.get()
    nr_users = raw_tweets_df['uid'].unique().size
    print(f'There are {nr_users} distinct users in this chunk.')
    tweeted_months_users = ufilters.inc_months_activity(
        tweeted_months_users, raw_tweets_df)

tweeted_months_users = tweeted_months_users['count']
total_nr_users = len(tweeted_months_users.index.levels[0])
print(f'In total, there are {total_nr_users} distinct users in the whole dataset.')
local_uids = ufilters.consec_months(tweeted_months_users)
bot_uids = ufilters.bot_activity(tweeted_months_users)
# local_uids is indexed by uid and only contains True values, while bot_uids is
# indexed by uid and only contains False values. In their product, the uids of
# local_uids absent from bot_uids get a NaN, and the ones present in bot_uids
# get False. As NaN is converted to True when casting to the boolean type, only
# the local users who are not bots end up with a True.
uids_product = local_uids * bot_uids
valid_uids = uids_product.astype('bool').rename('valid')
valid_uids = valid_uids[valid_uids]
print(f'This leaves us with {len(valid_uids)} valid users in the whole dataset.')

In [None]:
def speed_filter(raw_tweets_df, valid_uids, places_in_xy, max_distance):
    '''
    Restricts `raw_tweets_df` to the tweets of the users in `valid_uids`
    (through an inner join on 'uid'), and returns the result of
    `ufilters.too_fast` applied to these tweets, with `places_in_xy` and
    `max_distance` passed on unchanged.
    '''
    valid_tweets_df = raw_tweets_df.join(valid_uids, on='uid', how='inner')
    return ufilters.too_fast(valid_tweets_df, places_in_xy, max_distance)

too_fast_uids_series = pd.Series([])
area_bounds = shape_df.to_crs(xy_proj).geometry.iloc[0].bounds
# Get an upper limit of the distance that can be travelled inside the area: the
# length of the diagonal of its bounding box.
min_x, min_y, max_x, max_y = area_bounds
max_distance = np.sqrt((min_x - max_x)**2 + (min_y - max_y)**2)

with mp.Pool(8) as pool:
    cols = ['uid', 'created_at', 'place_id', 'coordinates']
    map_parameters = [
        (res.get().loc[:, cols], valid_uids, places_in_xy, max_distance)
        for res in tweets_access_res]
    print('entering the loop')
    too_fast_uids_list = pool.starmap(speed_filter, map_parameters)

# The results of all the chunks are merged: in the product, the uids present in
# only one of the two series get a NaN, which is then replaced by False.
for too_fast_uids in too_fast_uids_list:
    uids_product = too_fast_uids_series * too_fast_uids
    too_fast_uids_series = uids_product.fillna(False)

print(f'In total, there are {len(too_fast_uids_series)} too fast users left to '
      'filter out in the whole dataset.')
# Same trick as for the bots: NaN (uid not too fast) is cast to True, while
# False (uid too fast) stays False.
valid_uids = (valid_uids * too_fast_uids_series).astype('bool').rename('valid')
valid_uids = valid_uids[valid_uids]
print(f'This leaves us with {len(valid_uids)} valid users in the whole dataset.')
valid_uids.index = valid_uids.index.rename('uid')
valid_uids.to_csv(valid_uids_path, header=True)

**TODO:** to assign a user as local, consider requiring that most of their tweets in a given month were made in that country.

### Processing

We don't filter out tweets with a useless place (one too large) here, because these tweets can still be useful for language detection. So this filter is only applied later on. Similarly, we keep tweets with insufficient text to make a reliable language detection, because they can still be useful for residence attribution.

In [None]:
valid_uids = pd.read_csv(valid_uids_path, index_col='uid', header=0)

def process(raw_tweets_df, valid_uids, places_geodf, text_col='text', 
            min_nr_words=4, cld='pycld2'):
    '''
    Keeps the tweets of `raw_tweets_df` made by the users in `valid_uids`,
    attaches a geometry and an area to each of them, and passes the result to
    `text_process.lang_detect`, whose output is returned.

    The geometry of a tweet with GPS coordinates is the corresponding Point,
    with a null area. The geometry and area of the other tweets are the ones
    of their place, looked up in `places_geodf` by 'place_id'.

    `text_col` is the name of the column containing the text of the tweets.
    `text_col`, `min_nr_words` and `cld` are passed on to
    `text_process.lang_detect`.
    '''
    cols = [text_col, 'id', 'lang', 'place_id', 'coordinates', 'uid', 
            'created_at', 'source']
    # We work on a copy, so that the assignments below never write into (a view
    # of) the caller's DataFrame.
    tweets_loc_df = raw_tweets_df.loc[:, cols].copy()
    print('- starting geo join')
    tweets_loc_df = tweets_loc_df.join(valid_uids, on='uid', how='inner')
    has_gps = tweets_loc_df['coordinates'].notnull()
    tweets_places_df = tweets_loc_df.loc[~has_gps].join(
        places_geodf[['geometry', 'area']], on='place_id', how='left')
    # The geometry of the tweets with GPS coordinates is the Point associated 
    # to them.
    tweets_loc_df.loc[has_gps, 'geometry'] = tweets_loc_df.loc[has_gps, 'coordinates'].apply(
        lambda x: Point(x['coordinates']))
    # We assign the area of points to 0, and at the same time initialize the 
    # whole column, whose values will change for tweets without GPS coordinates.
    tweets_loc_df['area'] = 0
    # We add the geometry of the place to the tweets without GPS coordinates
    tweets_loc_df.loc[~has_gps, 'geometry'] = tweets_places_df['geometry']
    tweets_loc_df.loc[~has_gps, 'area'] = tweets_places_df['area']
    tweets_loc_df = (tweets_loc_df.rename(columns={'lang': 'twitter_lang'})
                                  .drop(columns=['valid', 'coordinates']))
    tweets_loc_df = geopd.GeoDataFrame(tweets_loc_df, crs=latlon_proj)
    print('starting lang detect')
    # The parameters received by `process` are forwarded, instead of
    # hard-coded values which silently ignored them.
    tweets_lang_df = text_process.lang_detect(
        tweets_loc_df, text_col=text_col, min_nr_words=min_nr_words, cld=cld)
    print('chunk done')
    return tweets_lang_df


def profile_process(raw_tweets_df, valid_uids, places_geodf):
    '''
    Runs `process` with its default optional parameters under cProfile, which
    prints the profiling report.
    '''
    # The statement is evaluated with the local variables of this function, so
    # the names it contains must match the parameters above.
    profiled_statement = 'process(raw_tweets_df, valid_uids, places_geodf)'
    cProfile.runctx(profiled_statement, globals(), locals())


# Every chunk is processed by one of the workers of the pool; the results are
# collected in the order of the chunks.
with mp.Pool(8) as pool:
    map_parameters = [
        (res.get(), valid_uids, places_geodf) for res in tweets_access_res]
    print('entering the loop')
    tweets_process_res = pool.starmap(process, map_parameters)

## Study at the tweet level

### Make tweet counts data

In [None]:
tweet_level_label = 'tweets in {}'

# For every language to plot, we gather the names of the columns holding its
# counts and proportions in the cells, and the labels used in the plots.
plot_langs_dict = {}
for plot_lang in plot_langs_list:
    readable_lang = LANGS_DICT[plot_lang]
    level_lang_label = tweet_level_label.format(readable_lang)
    plot_langs_dict[plot_lang] = {
        'prop_col': f'cell_prop_{plot_lang}',
        'count_col': f'cell_count_{plot_lang}',
        'count_label': f'Number of {level_lang_label} in the cell',
        'prop_label': f'Proportion of {level_lang_label} in the cell',
        'readable': readable_lang}

Why is `sjoin` so slow? It tests every cell, even though the match is exclusive: if one cell matches, no other will. Possible solution: loop over the cells, ordered by the counts obtained from the places, and stop at the first match. This would greatly reduce the number of 'within' operations.

In [None]:
places_langs_counts = None
cells_langs_counts = None
for res in tweets_process_res:
    tweets_df = res.copy()
    # Tweets tagged in a place which is too large are discarded.
    tweets_df = tweets_df.loc[tweets_df['area'] < max_place_area]
    has_gps = tweets_df['area'] == 0
    gps_tweets_df = tweets_df.loc[has_gps]
    # Because of the inner join, the tweets whose coordinates are outside the
    # grid are left out here.
    tweets_cells_df = geopd.sjoin(
        gps_tweets_df, cells_in_area_df, op='within', rsuffix='cell',
        how='inner')
    nr_out_tweets = len(gps_tweets_df) - len(tweets_cells_df)
    print(f'{nr_out_tweets} tweets have been found outside of the grid and'
         ' filtered out as a result.')
    # geopd adds an underscore by itself to the suffix
    tweets_cells_df = tweets_cells_df.rename(columns={'index_cell': 'cell_id'})
    # Tweets with coordinates are counted by cell...
    cells_langs_counts = tweets_counts.increment_counts(
        cells_langs_counts, tweets_cells_df, ['cld_lang', 'cell_id'])
    # ... and the others by place.
    places_langs_counts = tweets_counts.increment_counts(
        places_langs_counts, tweets_df.loc[~has_gps], ['cld_lang', 'place_id'])

places_counts = places_langs_counts.groupby('place_id').sum()
cells_langs_counts = cells_langs_counts['count']
cells_counts = cells_langs_counts.groupby('cell_id').sum().rename('total_count')

Places -> cells

In [None]:
# We initialize cell_plot_df with the counts from the tweets attached to a place
cell_plot_df = places_to_cells.get_counts(
    places_counts, places_langs_counts, places_geodf, cells_in_area_df,
    plot_langs_dict, xy_proj=xy_proj)

# We add the counts from the tweets with coordinates
cell_plot_df = tweets_counts.increment_join(
    cell_plot_df, cells_counts, count_col='total_count')
# Cells without any tweet are left out.
has_tweets = cell_plot_df['total_count'] > 0
cell_plot_df = cell_plot_df.loc[has_tweets]

for plot_lang, lang_dict in plot_langs_dict.items():
    lang_count_col = lang_dict['count_col']
    lang_prop_col = lang_dict['prop_col']
    cells_lang_counts = cells_langs_counts.xs(plot_lang).rename(lang_count_col)
    cell_plot_df = tweets_counts.increment_join(
        cell_plot_df, cells_lang_counts, count_col=lang_count_col)

    sum_lang = cell_plot_df[lang_count_col].sum()
    level_lang_label = tweet_level_label.format(lang_dict['readable'])
    print(f'There are {sum_lang:.0f} {level_lang_label}.')

    cell_plot_df[lang_prop_col] = cell_plot_df[lang_count_col].div(
        cell_plot_df['total_count'])

# The index is saved as a column to be able to retrieve it when reading the file
cell_plot_df['cell_id'] = cell_plot_df.index
cell_data_path = cell_data_path_format.format('tweets', cc, cell_size)
cell_plot_df.to_file(cell_data_path, driver='GeoJSON')

### Plots

In [None]:
# Uncomment to read the data generated with another cell size:
# cell_size = 10000
cell_data_path = cell_data_path_format.format('tweets', cc, cell_size)
# The cells are indexed by their id, which is also kept as a column.
cell_plot_df = (geopd.read_file(cell_data_path)
                     .set_index('cell_id', drop=False))

In [None]:
for plot_lang, plot_dict in plot_langs_dict.items():
    readable_lang = plot_dict['readable']

    # Map of the counts of the language, in logarithmic scale.
    count_lang_col = plot_dict['count_col']
    counts_file_name = (
        f'tweet_counts_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    save_path = os.path.join(fig_dir, 'counts', counts_file_name)
    plot_title = f'Distribution of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['count_label']
    plot_kwargs = {'edgecolor': 'w', 'linewidths': 0.2, 'cmap': 'Purples'}
    ax_count = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=count_lang_col,
        save_path=save_path, show=False, log_scale=True, title=plot_title,
        cbar_label=cbar_label, xy_proj=xy_proj, **plot_kwargs)

    # Map of the proportion of the language.
    prop_lang_col = plot_dict['prop_col']
    prop_file_name = (
        f'tweets_prop_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    save_path = os.path.join(fig_dir, 'prop', prop_file_name)
    plot_title = f'{readable_lang} predominance in {country_name}'
    cbar_label = plot_dict['prop_label']
    # Avoid sequential colormaps starting or ending with white, as white is
    # reserved for an absence of data
    plot_kwargs = {'edgecolor': 'w', 'linewidths': 0.2, 'cmap': 'plasma'}
    ax_prop = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=prop_lang_col, save_path=save_path,
        title=plot_title, cbar_label=cbar_label, vmax=1, xy_proj=xy_proj,
        **plot_kwargs)

In [None]:
# Interactive version of the proportion maps, saved as an HTML file.
html_file_name = f'tweets_prop_cc={cc}_cell_size={cell_size}m.html'
save_path = os.path.join(fig_dir, html_file_name)

fig = grid_viz.plot_interactive(
    cell_plot_df, shape_df, plot_langs_dict, save_path=save_path,
    plotly_renderer='iframe_connected', show=True)

## Study at the user level

### Make user counts data

Users who have tagged their tweets with gps coordinates seem to do it regularly, as the median of the proportion of tweets they geo tag is at more than 75% on the first chunk -> it's worth it to try and get their cell of residence

In [None]:
# On the first chunk, we calculate for every user who geo-tagged at least one
# tweet the proportion of their tweets which have GPS coordinates.
first_chunk_df = tweets_process_res[0].copy()
first_chunk_df['has_gps'] = first_chunk_df['area'] == 0
gps_uids = first_chunk_df.loc[first_chunk_df['has_gps'], 'uid'].unique()
from_gps_user = first_chunk_df['uid'].isin(gps_uids)
# Number of tweets of every user, with and without coordinates...
gps_users_counts = (first_chunk_df.loc[from_gps_user]
                                  .groupby(['uid', 'has_gps'])
                                  .size()
                                  .rename('count')
                                  .to_frame())
# ... alongside their total number of tweets.
user_sums = gps_users_counts.groupby('uid')['count'].sum().rename('sum')
gps_users_counts = gps_users_counts.join(user_sums)
gps_ratios = gps_users_counts.reset_index()
gps_ratios = gps_ratios.loc[gps_ratios['has_gps']]
gps_ratios['ratio'] = gps_ratios['count'] / gps_ratios['sum']
gps_ratios['ratio'].describe()

If there's one or more cells where a user tweeted in proportion more than relevant_th of the time, we take among these cells the one where they tweeted the most outside work hours. Otherwise, we take the relevant place where they tweeted the most outside work hours, or we default to the place where they tweeted the most.

In [None]:
user_level_label = '{}-speaking users'
# Minimum proportion above which a language or a cell is deemed relevant for a
# user.
relevant_th = 0.1

# For every language to plot, we gather the names of the columns holding its
# counts and proportions in the cells, and the labels used in the plots.
plot_langs_dict = {}
for plot_lang in plot_langs_list:
    readable_lang = LANGS_DICT[plot_lang]
    level_lang_label = user_level_label.format(readable_lang)
    plot_langs_dict[plot_lang] = {
        'prop_col': f'cell_prop_{plot_lang}',
        'count_col': f'cell_count_{plot_lang}',
        'count_label': f'Number of {level_lang_label} in the cell',
        'prop_label': f'Proportion of {level_lang_label} in the cell',
        'readable': readable_lang}

Here we get rid of users whose language we couldn't identify

In [None]:
# Residence attribution is the longest to run, and by a long shot, so we'll start
# with language to filter out uids in tweets_df before doing it
groupby_cols = ['uid', 'cld_lang']
user_langs_counts = None
for res in tweets_process_res:
    tweets_lang_df = res.copy()
    # Here we don't filter out based on max_place_area, because these tweets
    # are still useful for language attribution.
    tweets_lang_df = tweets_lang_df.loc[tweets_lang_df['cld_lang'].notnull()]
    user_langs_counts = tweets_counts.increment_counts(
        user_langs_counts, tweets_lang_df, groupby_cols)
    
# For every user, we calculate the proportion of their tweets made in each
# language, and only keep the languages above the relevance threshold.
total_per_user = user_langs_counts.groupby('uid')['count'].sum().rename('user_count')
user_langs_agg = user_langs_counts.join(total_per_user).assign(
    prop_lang=lambda df: df['count'] / df['user_count'])
user_langs_agg = user_langs_agg.loc[user_langs_agg['prop_lang'] > relevant_th]
# The levels of a MultiIndex are not pruned when rows are filtered out, so we
# have to remove the unused ones explicitly. Otherwise the users left without
# any language after the filter above would still be listed here.
uid_with_lang = user_langs_agg.index.remove_unused_levels().levels[0].values
print(f'We were able to attribute at least one language to {len(uid_with_lang)}'
      ' users')

user_places_habits = None
user_cells_habits = None
for res in tweets_process_res:
    tweets_df = res.copy()
    # We filter out users to which we couldn't attribute even one language
    uid_mask = tweets_df['uid'].isin(uid_with_lang)
    relevant_area_mask = tweets_df['area'] < max_place_area
    tweets_df = tweets_df.loc[uid_mask & relevant_area_mask].copy()
    # Both the hour and the day of the week are taken from the same local
    # timestamp, so that they are always consistent with each other.
    # NOTE(review): the 'CET' time zone is hard-coded, which is only right for
    # the countries of the study located in this time zone.
    local_created_at = (tweets_df['created_at'].dt.tz_localize('UTC')
                                               .dt.tz_convert('CET'))
    tweets_df['hour'] = local_created_at.dt.hour
    # Tweets are considered in work hours if they were made between 8 and 18
    # outside of the week-end (weekday goes from 0 (Monday) to 6 (Sunday)).
    tweets_df['isin_workhour'] = (
        (tweets_df['hour'] > 7) 
        & (tweets_df['hour'] < 18)
        & (local_created_at.dt.weekday < 5))
    
    groupby_cols = ['uid', 'place_id', 'isin_workhour']
    # We first count the number of times a user has tweeted in each place inside
    # and outside work hours.
    user_places_habits = tweets_counts.increment_counts(
        user_places_habits, tweets_df, groupby_cols)
    
    has_gps = tweets_df['area'] == 0
    tweets_cells_df = geopd.sjoin(tweets_df.loc[has_gps], cells_in_area_df, 
        op='within', rsuffix='cell', how='inner')
    # geopd adds an underscore by itself to the suffix
    tweets_cells_df = tweets_cells_df.rename(columns={'index_cell': 'cell_id'})
    groupby_cols = ['uid', 'cell_id', 'isin_workhour']
    # Then we do the same thing except in each cell, using the tweets with
    # coordinates.
    user_cells_habits = tweets_counts.increment_counts(user_cells_habits,
        tweets_cells_df, groupby_cols)

Here we count the number of speakers: whether users are multilingual or monolingual, if they speak a language, they count as one in that language's count.

Other possibility: pass the places counts to cells counts here, and then do the whole residence attribution solely
on a cell basis. Problem: if a user always tags themselves in the same city, which overlaps multiple cells, we'll have more than one cell with approximately the same count, and one cell takes it all in the end. This is a typical 'winner takes all' problem.

In [None]:
# We calculate the total number of tweets of each user, in order to be able
# to calculate the proportion of their tweets made in each cell.
user_counts = user_places_habits.groupby('uid')['count'].sum().rename('user_count')
user_home_cell = user_cells_habits.join(user_counts, how='inner')
# NOTE(review): the proportion is calculated for every
# (uid, cell_id, isin_workhour) row, so separately inside and outside work
# hours, and not on the total number of tweets in the cell -- confirm that
# this is intended.
user_home_cell['prop_in_cell'] = user_home_cell['count'] / user_home_cell['user_count']
# Among the rows above the relevance threshold, we only keep the counts outside
# work hours. After sorting by count, the last cell of every user is the one
# with their highest count, which we take as their cell of residence.
user_home_cell = (user_home_cell.loc[user_home_cell['prop_in_cell'] > relevant_th]
                                .xs(False, level='isin_workhour')
                                .reset_index()
                                .sort_values(by=['uid', 'count'])
                                .groupby('uid')['cell_id']
                                .last())

# The users for whom no cell of residence was found only get a place of
# residence.
users_with_cell = user_home_cell.index.values
user_home_place = uagg.get_residence(user_places_habits, place_id_col='place_id')
user_only_place = user_home_place.drop(users_with_cell)
# Number of residents in every place and in every cell.
places_counts = user_only_place.to_frame().groupby('place_id').size().rename('count')
cells_counts = user_home_cell.to_frame().groupby('cell_id').size().rename('total_count')

# Number of residents speaking each language in every place and in every cell:
# every row of user_langs_agg, i.e. every (user, language) pair, counts as one.
places_langs_counts = (user_langs_agg.join(user_only_place, how='inner')
                                     .groupby(['cld_lang', 'place_id'])
                                     .size()
                                     .rename('count'))
cells_langs_counts = (user_langs_agg.join(user_home_cell, how='inner')
                                     .groupby(['cld_lang', 'cell_id'])
                                     .size()
                                     .rename('count'))

# cell_plot_df is first filled with the counts from the users for which only a
# place of residence could be found
cell_plot_df = places_to_cells.get_counts(
    places_counts, places_langs_counts, places_geodf, cells_in_area_df,
    plot_langs_dict, xy_proj=xy_proj)
# The total counts of the users with a cell of residence are then added.
cell_plot_df = tweets_counts.increment_join(
    cell_plot_df, cells_counts, count_col='total_count')

# Cells without any resident are left out.
has_users = cell_plot_df['total_count'] > 0
cell_plot_df = cell_plot_df.loc[has_users]

for plot_lang, lang_dict in plot_langs_dict.items():
    lang_count_col = lang_dict['count_col']
    lang_prop_col = lang_dict['prop_col']
    cells_lang_counts = cells_langs_counts.xs(plot_lang).rename(lang_count_col)
    # The counts per language are incremented with the users with a cell of
    # residence as well.
    cell_plot_df = tweets_counts.increment_join(
        cell_plot_df, cells_lang_counts, count_col=lang_count_col)

    level_lang_label = user_level_label.format(lang_dict['readable'])
    sum_lang = cell_plot_df[lang_count_col].sum()
    print(f'There are {sum_lang:.0f} {level_lang_label}.')
    cell_plot_df[lang_prop_col] = cell_plot_df[lang_count_col].div(
        cell_plot_df['total_count'])

# The index is saved as a column to be able to retrieve it when reading the file
cell_plot_df['cell_id'] = cell_plot_df.index
cell_data_path = cell_data_path_format.format('users', cc, cell_size)
cell_plot_df.to_file(cell_data_path, driver='GeoJSON')

### Plots

In [None]:
# Uncomment to read the data generated with another cell size:
# cell_size = 10000
cell_data_path = cell_data_path_format.format('users', cc, cell_size)
# The cells are indexed by their id, which is also kept as a column.
cell_plot_df = (geopd.read_file(cell_data_path)
                     .set_index('cell_id', drop=False))

In [None]:
for plot_lang, plot_dict in plot_langs_dict.items():
    readable_lang = plot_dict['readable']

    # Map of the number of speakers of the language, in logarithmic scale.
    count_lang_col = plot_dict['count_col']
    counts_file_name = (
        f'users_counts_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    save_path = os.path.join(fig_dir, 'counts', counts_file_name)
    plot_title = f'Distribution of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['count_label']
    plot_kwargs = {'edgecolor': 'w', 'linewidths': 0.2, 'cmap': 'Purples'}
    ax_count = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=count_lang_col,
        save_path=save_path, show=False, log_scale=True, title=plot_title,
        cbar_label=cbar_label, xy_proj=xy_proj, **plot_kwargs)

    # Map of the proportion of speakers of the language.
    prop_lang_col = plot_dict['prop_col']
    prop_file_name = (
        f'users_prop_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    save_path = os.path.join(fig_dir, 'prop', prop_file_name)
    plot_title = f'Predominance of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['prop_label']
    # Avoid sequential colormaps starting or ending with white, as white is
    # reserved for an absence of data
    plot_kwargs = {'edgecolor': 'w', 'linewidths': 0.2, 'cmap': 'plasma'}
    ax_prop = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=prop_lang_col, save_path=save_path,
        title=plot_title, cbar_label=cbar_label, vmax=1, xy_proj=xy_proj,
        **plot_kwargs)

In [None]:
# Interactive version of the proportion maps, saved as an HTML file.
html_file_name = f'users_prop_cc={cc}_cell_size={cell_size}m.html'
save_path = os.path.join(fig_dir, html_file_name)

fig = grid_viz.plot_interactive(
    cell_plot_df, shape_df, plot_langs_dict, save_path=save_path,
    plotly_renderer='iframe_connected', show=True)