## Config

In [None]:
# Reload all src modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

In [None]:
import cProfile
import datetime
import json
import multiprocessing as mp
import os

import descartes
import geopandas as geopd
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
try:
    import cld3
except ModuleNotFoundError:
    pass
import pycld2
from dotenv import load_dotenv
from shapely.geometry import MultiPolygon
from shapely.geometry import Point
from shapely.geometry import Polygon

import src.data.access as data_access
import src.data.metrics as metrics
import src.data.shp_extract as shp_extract
import src.data.text_process as text_process
import src.data.user_agg as uagg
import src.data.user_filters as ufilters
import src.utils.geometry as geo
import src.utils.join_and_count as join_and_count
import src.utils.make_config as make_config
import src.utils.places_to_cells as places_to_cells
import src.visualization.grid_viz as grid_viz

load_dotenv()

pd.reset_option("display.max_rows")

In [None]:
from IPython.core.display import display, HTML
# Inject a <style> element into the notebook page. The active rule is empty,
# so it changes nothing; the commented-out variant below widens the
# `.container` element to 100% of the page.
display(HTML("<style>.container </style>"))
# display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Location of the raw tweets/places files and the credentials passed to the
# data access functions. These environment variables must be set.
data_dir_path = os.environ['DATA_DIR']
tweets_files_format = 'tweets_{}_{}_{}.json.gz'
places_files_format = 'places_{}_{}_{}.json.gz'
ssh_domain = os.environ['IFISC_DOMAIN']
ssh_username = os.environ['IFISC_USERNAME']
# Project data folders, relative to the notebook's working directory.
project_data_dir = os.path.join('..', 'data')
external_data_dir = os.path.join(project_data_dir, 'external')
interim_data_dir = os.path.join(project_data_dir, 'interim')
processed_data_dir = os.path.join(project_data_dir, 'processed')
# To be formatted with (level, country code, cell size in meters).
cell_data_path_format = os.path.join(processed_data_dir,
                                     '{}_cell_data_cc={}_cell_size={}m.geojson')
latlon_proj = 'epsg:4326'
# Maps the second element of each pycld2.LANGUAGES entry to the first one,
# capitalized.
LANGS_DICT = {lang[1]: lang[0].lower().capitalize()
              for lang in pycld2.LANGUAGES}

country_codes = ('BE', 'BO', 'CA', 'CH', 'EE', 'ES', 'FR', 'HK', 'ID', 'LT',
                 'LV', 'MY', 'PE', 'RO', 'SG', 'TN', 'UA')
with open(os.path.join(external_data_dir, 'countries.json')) as f:
    countries_study_data = json.load(f)
with open(os.path.join(external_data_dir, 'langs_agg.json')) as f:
    langs_agg_dict = json.load(f)

# Country-specific parameters
cc = 'BE'
region = None
# region = 'Cataluña'
if region:
    area_dict = countries_study_data[cc]['regions'][region]
else:
    area_dict = countries_study_data[cc]

fig_dir = os.path.join('..', 'reports', 'figures', cc)
# exist_ok=True: create whichever sub-folder is missing, even when fig_dir
# itself already exists (checking only fig_dir would skip them in that case).
for fig_sub_dir in ('counts', 'prop'):
    os.makedirs(os.path.join(fig_dir, fig_sub_dir), exist_ok=True)
xy_proj = area_dict['xy_proj']
cc_timezone = area_dict['timezone']
plot_langs_list = area_dict['local_langs']
# `or` makes the default apply both when the key is missing and when its
# value is falsy (None, 0).
min_poly_area = area_dict.get('min_poly_area') or 0.1
max_place_area = area_dict.get('max_place_area') or 1e9 # linked to cell size and places data
valid_uids_path = os.path.join(interim_data_dir, f'valid_uids_{cc}.csv')

Get rid of bots and company accounts (e.g. careerarc, tweetmyjobs). If companies can't be distinguished by their source, as is possible for careerarc, then how can they be identified?

## Getting the data

### Places, area and grid

In [None]:
# Read the shapefile of the area of study and keep only the shape(s) whose
# column `col` matches `val`, as specified by the shapefile config.
shapefile_dict = make_config.shapefile_dict(area_dict, cc, region=region)
shapefile_name = shapefile_dict['name']
# The shapefile sits in a folder bearing the same name.
shapefile_path = os.path.join(external_data_dir, shapefile_name, shapefile_name)
shape_df = geo.extract_shape(
    geopd.read_file(shapefile_path),
    shapefile_dict['col'], shapefile_dict['val'])
# A regional study is labelled with the region's name, a national one with
# the 'NAME_ENGL' value of the first extracted shape.
country_name = region if region else shape_df['NAME_ENGL'].iloc[0]
shape_df

Places can be a point too -> treat them like tweets with coords in this case

In [None]:
places_files_paths = [
    os.path.join(data_dir_path, places_files_format.format(2015, 2018, cc))]
#     os.path.join(data_dir_path, places_files_format.format(2019, 2019, cc))]
# Only these columns of the raw places data are kept.
places_cols = ['id', 'bounding_box', 'name', 'place_type']
all_raw_places_df = [
    data_access.return_json(
        places_file_path, ssh_domain=ssh_domain, ssh_username=ssh_username,
        compression='gzip')[places_cols]
    for places_file_path in places_files_paths]

# The same place can appear in several files: keep one row per place ID.
places_df = pd.concat(all_raw_places_df).drop_duplicates(subset='id')
places_geodf, places_in_xy = geo.make_places_geodf(
    places_df, shape_df, xy_proj=xy_proj)
places_geodf.head()

In [None]:
# Sanity check: display the row(s) of the place(s) named 'Brussels'.
places_geodf.loc[places_geodf['name'] == 'Brussels']

In [None]:
# Build the grid covering the area of study, then plot and save it as a
# visual check.
cell_size = 10000
cells_df, cells_in_area_df, Nx, Ny = geo.create_grid(
    shape_df, cell_size, intersect=True, xy_proj=xy_proj)
# A constant dummy metric: only the geometry of the grid matters in this plot.
grid_test_df = cells_in_area_df.assign(metric=1)
grid_fig_name = f'grid_cc={cc}_cell_size={cell_size}m.pdf'
save_path = os.path.join(fig_dir, grid_fig_name)
plot_kwargs = {
    'alpha': 0.7,
    'edgecolor': 'w',
    'linewidths': 0.5,
    'cmap': 'plasma',
}
ax = grid_viz.plot_grid(
    grid_test_df, shape_df, metric_col='metric', show=True,
    save_path=save_path, xy_proj=xy_proj, **plot_kwargs)

### Reading the data

In [None]:
# Paths of the gzipped tweets files to read for the selected country. The
# commented-out line adds the 2019 file to the list.
tweets_files_paths = [
    os.path.join(data_dir_path, tweets_files_format.format(2015, 2018, cc))]
#     os.path.join(data_dir_path, tweets_files_format.format(2019, 2019, cc))]

def read_data(tweets_file_path, chunk_start, chunk_size, places_geodf):
    """Read one chunk of a tweets file, keeping only tweets in known places.

    The chunk delimited by `chunk_start` and `chunk_size` is read from
    `tweets_file_path` with `data_access.read_json_wrapper`, using the
    module-level `ssh_domain` and `ssh_username`. Tweets whose 'place_id'
    is not in the index of `places_geodf` are discarded.

    Returns a DataFrame with the same columns as the chunk that was read.
    """
    chunk_df = data_access.read_json_wrapper(
        tweets_file_path, chunk_start, chunk_size,
        ssh_domain=ssh_domain, ssh_username=ssh_username)
    original_cols = chunk_df.columns.values
    # The inner join only serves as a filter on 'place_id': the columns it
    # brings in from places_geodf are dropped right after.
    in_places_df = chunk_df.join(places_geodf, on='place_id', how='inner')
    return in_places_df.loc[:, original_cols]

def profile_pre_process(tweets_file_path, chunk_start, chunk_size):
    """Run `read_data` on one chunk under cProfile.

    `places_geodf` is not a parameter: the profiled statement resolves it
    from the module's globals.
    """
    call_args = {
        'tweets_file_path': tweets_file_path,
        'chunk_start': chunk_start,
        'chunk_size': chunk_size,
    }
    cProfile.runctx(
        'read_data(tweets_file_path, chunk_start, chunk_size, places_geodf)',
        globals(), call_args)

# Read every chunk of every tweets file in a pool of 8 worker processes.
# tweets_access_res collects one AsyncResult per chunk, and is reused by the
# following cells through res.get().
with mp.Pool(8) as pool:
    tweets_access_res = []
    for file_path in tweets_files_paths:
        for chunk_start, chunk_size in data_access.chunkify(
                file_path, size=1e9, ssh_domain=ssh_domain, 
                ssh_username=ssh_username):
            tweets_access_res.append(pool.apply_async(
                read_data, (file_path, chunk_start, chunk_size, places_geodf)))
    
    # Block until every chunk has been read: leaving the `with` block
    # terminates the pool, so results still pending at that point would
    # never be computed.
    for res in tweets_access_res:
        res.get()

In [None]:
# Count, for every month, the total number of tweets and the number of
# "problematic" tweets, i.e. those with GPS coordinates which do not fall
# within the place they are attached to.
tweeted_months = None
tweets_pb_months = None
# Tweets created on or before this date are discarded.
first_day = datetime.datetime(year=2015, month=1, day=1)
for res in tweets_access_res:
    tweets_df = res.get().copy()
    tweets_df = tweets_df.loc[tweets_df['created_at'] > first_day]
    tweets_df['month'] = tweets_df['created_at'].dt.to_period('M')
    has_gps = tweets_df['coordinates'].notnull()
    # Build a Point from the 'coordinates' entry of each non-null value.
    geometry = tweets_df.loc[has_gps, 'coordinates'].apply(
        lambda x: Point(x['coordinates']))
    tweets_coords = geopd.GeoSeries(geometry, crs=latlon_proj, 
                                    index=tweets_df.loc[has_gps].index)
    # Bring in the columns of places_geodf (among which 'geometry') for the
    # place of each tweet.
    tweets_df = tweets_df.join(places_geodf, on='place_id', how='left')
    # Test whether each tweet's point lies within its place's geometry.
    coords_in_place = tweets_coords.within(
        geopd.GeoSeries(tweets_df.loc[has_gps, 'geometry']))
    
    tweeted_months = join_and_count.increment_counts(
        tweeted_months, tweets_df, ['month'])
    tweets_pb_months = join_and_count.increment_counts(tweets_pb_months, 
        tweets_df.loc[has_gps].loc[~coords_in_place], ['month'])
#     print(coords_in_place.value_counts())
#     print(coords_in_place.value_counts())

In [None]:
# Plot, for each month, the proportion of tweets whose GPS coordinates fall
# outside of the place they are attached to.
months_counts = tweeted_months.join(tweets_pb_months, rsuffix='_pb', how='left')
months_counts['prop'] = months_counts['count_pb'] / months_counts['count']
ax = months_counts['prop'].plot.bar()
tick_labels = ax.get_xticklabels()
# Keep one tick out of five. The range is bounded by the actual number of
# bars: a hard-coded upper bound raises an IndexError below as soon as the
# data covers fewer months than that bound.
ticks = np.arange(0, len(tick_labels), 5)
_ = ax.set_xticks(ticks)
_ = ax.set_xticklabels([tick_labels[i] for i in ticks])
_ = ax.set_ylabel('proportion')
_ = ax.set_title('Proportion of tweets with coords outside of place')

### Filtering out users

User-based filters imply a loop over all the raw_tweets_df, and must be applied before getting tweets_lang_df and even tweets_loc_df, because the tweets of filtered-out users don't interest us at all.

This is incremental, so it can't be parallelized. And it's rather fast, so parallelizing it wouldn't be worth the time anyway.

In [None]:
# Accumulate the monthly activity of every user over all the chunks, to then
# find the users considered as local and the ones considered as bots.
tweeted_months_users = None
for res in tweets_access_res:
    raw_tweets_df = res.get()
    nr_users = len(raw_tweets_df['uid'].unique())
    print(f'There are {nr_users} distinct users in this chunk.')
    tweeted_months_users = ufilters.inc_months_activity(
        tweeted_months_users, raw_tweets_df)

tweeted_months_users = tweeted_months_users['count']
total_nr_users = len(tweeted_months_users.index.levels[0])
print(f'In total, there are {total_nr_users} distinct users in the whole dataset.')
local_uids = ufilters.consec_months(tweeted_months_users)
bot_uids = ufilters.bot_activity(tweeted_months_users)
# We have local_uids: index of uids with a column full of True, and bot_uids:
# index of uids with a column full of False. When we multiply them, the uids
# in local_uids which are not in bot_uids are assigned NaN, and the ones which
# are in bot_uids are assigned False. When we convert to the boolean type,
# the NaNs turn to True.
# The product is aligned on the union of the two indexes, so the uids which
# are only in bot_uids also get a NaN. We therefore restrict the result to
# the index of local_uids before the conversion, otherwise bots which are not
# local would end up being flagged as valid.
valid_uids = ((local_uids * bot_uids).reindex(local_uids.index)
                                     .astype('bool')
                                     .rename('valid'))
valid_uids = valid_uids.loc[valid_uids]
print(f'This leaves us with {len(valid_uids)} valid users in the whole dataset.')

In [None]:
def speed_filter(raw_tweets_df, valid_uids, places_in_xy, max_distance):
    """Apply `ufilters.too_fast` to the tweets of the valid users of a chunk.

    The tweets are first restricted to the users in the index of
    `valid_uids` through an inner join on 'uid'. The result of
    `ufilters.too_fast` is returned as is.
    """
    valid_users_tweets = raw_tweets_df.join(valid_uids, on='uid', how='inner')
    return ufilters.too_fast(valid_users_tweets, places_in_xy, max_distance)

# Accumulator for the too fast users found in all the chunks.
too_fast_uids_series = pd.Series([])
# Bounds of the first geometry of the area, in the xy projection.
area_bounds = shape_df.to_crs(xy_proj).geometry.iloc[0].bounds
# Get an upper limit of the distance that can be travelled inside the area
# (the diagonal of its bounding box).
max_distance = np.sqrt((area_bounds[0]-area_bounds[2])**2 
                       + (area_bounds[1]-area_bounds[3])**2)

with mp.Pool(8) as pool:
    # Only the columns listed here are sent to the workers.
    cols = ['uid', 'created_at', 'place_id', 'coordinates']
    map_parameters = [
        (res.get().loc[:, cols], valid_uids, places_in_xy, max_distance) 
        for res in tweets_access_res]
    print('entering the loop')
    too_fast_uids_list = pool.starmap_async(speed_filter, map_parameters).get()
    for too_fast_uids in too_fast_uids_list:
        # The product is aligned on the union of the indexes: uids present on
        # one side only get a NaN, which is then filled with False.
        # NOTE(review): this relies on ufilters.too_fast returning uids
        # flagged with False -- confirm.
        too_fast_uids_series = (too_fast_uids_series * too_fast_uids).fillna(False)

print(f'In total, there are {len(too_fast_uids_series)} too fast users left to '
      'filter out in the whole dataset.')
# Same trick as for the bots: users absent from too_fast_uids_series get a
# NaN in the product, which turns to True in the boolean conversion.
valid_uids = (valid_uids * too_fast_uids_series).astype('bool').rename('valid')
valid_uids = valid_uids.loc[valid_uids]
print(f'This leaves us with {len(valid_uids)} valid users in the whole dataset.')
valid_uids.index = valid_uids.index.rename('uid')
# Saved so that the processing can be restarted from here, without having to
# run the filters again.
valid_uids.to_csv(valid_uids_path, header=True)

Idea: use "most of the user's tweets of the month were made in that country" as the criterion to assign a user as local.

### Processing

We don't filter out tweets with a useless place (one too large) here, because these tweets can still be useful for language detection. So this filter is only applied later on. Similarly, we keep tweets with insufficient text to make a reliable language detection, because they can still be useful for residence attribution.

In [None]:
# Reload the valid users from the file saved by the filtering step. This
# yields a DataFrame indexed by 'uid', instead of the Series we had above.
valid_uids = pd.read_csv(valid_uids_path, index_col='uid', header=0)

def process(raw_tweets_df, valid_uids, places_geodf, langs_agg_dict, 
            text_col='text', min_nr_words=4, cld='pycld2'):
    """Locate the tweets of valid users and detect their language.

    Only the tweets of users in the index of `valid_uids` are kept (inner
    join on 'uid'; the 'valid' column it brings in is dropped afterwards).
    Each tweet then gets a 'geometry' and an 'area':
    - with GPS coordinates: a Point built from them, and an area of 0,
    - without: the 'geometry' and 'area' of its place in `places_geodf`.
    The 'lang' column is renamed to 'twitter_lang', and the result is passed
    as a GeoDataFrame to `text_process.lang_detect`, along with `text_col`,
    `min_nr_words`, `cld` and `langs_agg_dict`.

    Returns the output of `text_process.lang_detect`.
    """
    kept_cols = ['text', 'id', 'lang', 'place_id', 'coordinates', 'uid', 
                 'created_at', 'source']
    located_df = raw_tweets_df.loc[:, kept_cols]
    print('- starting geo join')
    located_df = located_df.join(valid_uids, on='uid', how='inner')
    with_gps = located_df['coordinates'].notnull()
    without_gps = ~with_gps
    place_geo_df = located_df.loc[without_gps].join(
        places_geodf[['geometry', 'area']], on='place_id', how='left')
    # Tweets with GPS coordinates are located by the corresponding Point.
    gps_points = located_df.loc[with_gps, 'coordinates'].apply(
        lambda coords: Point(coords['coordinates']))
    located_df.loc[with_gps, 'geometry'] = gps_points
    # Points have a null area. This also initializes the whole column, which
    # is overwritten just below for the tweets without GPS coordinates.
    located_df['area'] = 0
    # The other tweets inherit the geometry and area of their place.
    located_df.loc[without_gps, 'geometry'] = place_geo_df['geometry']
    located_df.loc[without_gps, 'area'] = place_geo_df['area']
    located_df = located_df.rename(columns={'lang': 'twitter_lang'})
    located_df = located_df.drop(columns=['valid', 'coordinates'])
    located_df = geopd.GeoDataFrame(located_df, crs=latlon_proj)
    print('starting lang detect')
    tweets_lang_df = text_process.lang_detect(
        located_df, text_col=text_col, min_nr_words=min_nr_words, cld=cld,
        langs_agg_dict=langs_agg_dict)
    print('chunk done')
    return tweets_lang_df


def profile_process(raw_tweets_df, valid_uids, places_geodf):
    """Run `process` on one chunk under cProfile.

    `langs_agg_dict` is a required argument of `process` but not a parameter
    of this function: the profiled statement resolves it from the module's
    globals, the same way `process` itself is resolved.
    """
    cProfile.runctx(
        'process(raw_tweets_df, valid_uids, places_geodf, langs_agg_dict)',
        globals(), locals())


# Process all the chunks in a pool of 8 worker processes. tweets_process_res
# is the list of the values returned by `process`, one per chunk, in the
# order of tweets_access_res.
with mp.Pool(8) as pool:
    map_parameters = [(res.get(), valid_uids, places_geodf, langs_agg_dict) 
                      for res in tweets_access_res]
    print('entering the loop')
    tweets_process_res = pool.starmap_async(process, map_parameters).get()

## Study at the tweet level

### Make tweet counts data

In [None]:
# Label template of the tweet-level counts, formatted below with the readable
# name of each language.
tweet_level_label = 'tweets in {}'
plot_langs_dict = make_config.langs_dict(area_dict, tweet_level_label)

Why sjoin so slow? It tests on every cell, even though it's exclusive: if one cell matches no other will. Solution: loop over cells, ordered by the counts obtained from places, and stop at first match, will greatly reduce the number of 'within' operations -> update: doesn't seem possible, deleting from spatial index is extremely slow

In [None]:
def get_langs_counts(tweets_lang_df, max_place_area, cells_in_area_df):
    """Split the tweets of a chunk between those located in a cell and those
    located in a place.

    Tweets with an area equal to or above `max_place_area` are discarded.
    Tweets with a null area are joined spatially to the cells of
    `cells_in_area_df` containing them, and the number of those left out by
    this join is printed.

    Returns a tuple (tweets joined to their cell, remaining tweets with a
    non-null area).
    """
    tweets_df = tweets_lang_df.copy()
    tweets_df = tweets_df.loc[tweets_df['area'] < max_place_area]
    # A null area covers both the tweets with GPS coordinates and the tweets
    # within places which are a point.
    is_point = tweets_df['area'] == 0
    point_tweets_df = tweets_df.loc[is_point]
    # Because the join is an inner one, the tweets with coordinates outside
    # of the grid are dropped here.
    tweets_cells_df = geopd.sjoin(
        point_tweets_df, cells_in_area_df, op='within', rsuffix='cell',
        how='inner')
    nr_out_tweets = len(point_tweets_df) - len(tweets_cells_df)
    print(f'{nr_out_tweets} tweets have been found outside of the grid and'
         ' filtered out as a result.')
    return tweets_cells_df, tweets_df.loc[~is_point]
    
# Split every processed chunk between tweets in cells and tweets in places,
# in a pool of 8 worker processes.
with mp.Pool(8) as pool:
    map_parameters = [(res, max_place_area, cells_in_area_df) 
                      for res in tweets_process_res]
    print('entering the loop')
    tweets_pre_cell_res = (
        pool.starmap_async(get_langs_counts, map_parameters).get())

cells_langs_counts = None
places_langs_counts = None

# Accumulate over the chunks the counts of tweets by language and cell, and
# by language and place.
for res in tweets_pre_cell_res:
    tweets_cells_df = res[0]
    tweets_places_df = res[1]
    groupby_cols = ['cld_lang', 'cell_id']
    cells_langs_counts = join_and_count.increment_counts(
        cells_langs_counts, tweets_cells_df, groupby_cols)
    groupby_cols = ['cld_lang', 'place_id']
    places_langs_counts = join_and_count.increment_counts(
        places_langs_counts, tweets_places_df, groupby_cols)

# Total counts by place and by cell, all languages included. Both are
# converted to DataFrames with a single 'total_count' column.
places_langs_counts = places_langs_counts['count']
places_counts = (places_langs_counts.groupby('place_id')
                                   .sum()
                                   .rename('total_count')
                                   .to_frame())
cells_langs_counts = cells_langs_counts['count']
cells_counts = (cells_langs_counts.groupby('cell_id')
                                  .sum()
                                  .rename('total_count')
                                  .to_frame())

Places -> cells

In [None]:
# We count the number of tweets in a local language in each cell and place.
local_langs = list(plot_langs_dict)
places_local_counts = places_langs_counts.reset_index(level='cld_lang')
local_langs_mask = places_local_counts['cld_lang'].isin(local_langs)
places_local_counts = (places_local_counts.loc[local_langs_mask]
                                          .groupby('place_id')['count']
                                          .sum()
                                          .rename('local_count'))
places_counts = places_counts.join(places_local_counts, how='left')

cells_local_counts = cells_langs_counts.reset_index(level='cld_lang')
local_langs_mask = cells_local_counts['cld_lang'].isin(local_langs)
cells_local_counts = (cells_local_counts.loc[local_langs_mask]
                                        .groupby('cell_id')['count']
                                        .sum()
                                        .rename('local_count'))
# cells_counts is already a DataFrame here, like places_counts above, so it
# is joined directly (DataFrames have no `to_frame` method).
cells_counts = cells_counts.join(cells_local_counts, how='left')

# We distribute the counts of the places among the cells.
cell_plot_df = places_to_cells.get_counts(
    places_counts, places_langs_counts, places_geodf,
    cells_in_area_df, plot_langs_dict, xy_proj=xy_proj)

# We add the counts from the tweets with coordinates
cell_plot_df = join_and_count.increment_join(
    cell_plot_df, cells_counts['total_count'], count_col='total_count')
cell_plot_df = join_and_count.increment_join(
    cell_plot_df, cells_counts['local_count'], count_col='local_count')
cell_plot_df = cell_plot_df.loc[cell_plot_df['total_count'] > 0]

for plot_lang, lang_dict in plot_langs_dict.items():
    lang_count_col = lang_dict['count_col']
    # NOTE(review): `xs` raises a KeyError if no tweet located in a cell was
    # detected in `plot_lang` -- confirm this cannot happen for local
    # languages.
    cells_lang_counts = cells_langs_counts.xs(plot_lang).rename(lang_count_col)
    cell_plot_df = join_and_count.increment_join(
        cell_plot_df, cells_lang_counts, count_col=lang_count_col)

    level_lang_label = tweet_level_label.format(lang_dict['readable'])
    sum_lang = cell_plot_df[lang_count_col].sum()
    print(f'There are {sum_lang:.0f} {level_lang_label}.')

# The index is copied to a column, as the plotting cells read 'cell_id' back
# from the saved file.
cell_plot_df['cell_id'] = cell_plot_df.index
cell_data_path = cell_data_path_format.format('tweets', cc, cell_size)
cell_plot_df.to_file(cell_data_path, driver='GeoJSON')

### Plots

In [None]:
# Reload the tweet-level cell data from the saved file, so that the plots can
# be made without running the processing again.
cell_size = 10000
cell_data_path = cell_data_path_format.format('tweets', cc, cell_size)
cell_plot_df = geopd.read_file(cell_data_path)
cell_plot_df.index = cell_plot_df['cell_id']
# Both the cell data and the plot config are replaced by the values returned
# by metrics.calc_by_cell.
cell_plot_df, plot_langs_dict = metrics.calc_by_cell(cell_plot_df, plot_langs_dict)

In [None]:
# For each language to plot, save one map of the counts (log scale, not
# shown) and one map of the proportions (bounded between 0 and 1).
for plot_lang, plot_dict in plot_langs_dict.items():
    count_lang_col = plot_dict['count_col']
    readable_lang = plot_dict['readable']
    save_path = os.path.join(fig_dir, 'counts',
        f'tweet_counts_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    plot_title = f'Distribution of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['count_label']
    plot_kwargs = dict(edgecolor='w', linewidths=0.2, cmap='Purples')
    ax_count = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=count_lang_col, save_path=save_path, 
        show=False, log_scale=True, title=plot_title, cbar_label=cbar_label,
        xy_proj=xy_proj, **plot_kwargs)
    
    prop_lang_col = plot_dict['prop_col']
    save_path = os.path.join(fig_dir, 'prop',
        f'tweets_prop_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    plot_title = '{} predominance in {}'.format(readable_lang, country_name)
    cbar_label = plot_dict['prop_label']
    # Avoid sequential colormaps starting or ending with white, as white is  
    # reserved for an absence of data
    plot_kwargs = dict(edgecolor='w', linewidths=0.2, cmap='plasma')
    ax_prop = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=prop_lang_col, save_path=save_path, 
        title=plot_title, cbar_label=cbar_label, vmin=0, vmax=1, xy_proj=xy_proj, 
        **plot_kwargs)

In [None]:
# Interactive map of the proportions at the tweet level, saved as HTML.
save_path = os.path.join(fig_dir, 
            f'tweets_prop_cc={cc}_cell_size={cell_size}m.html')
# Description of the metric to plot, with its bounds.
prop_dict = {'name': 'prop', 'readable': 'proportion', 'vmin': 0, 'vmax': 1}
fig = grid_viz.plot_interactive(
    cell_plot_df, shape_df, plot_langs_dict, prop_dict,
    save_path=save_path, plotly_renderer='iframe_connected', show=True)

## Study at the user level

Users who have tagged their tweets with gps coordinates seem to do it regularly, as the median of the proportion of tweets they geo tag is at more than 75% on the first chunk -> it's worth it to try and get their cell of residence

In [None]:
# On the first chunk only: among the users with at least one tweet of null
# area, describe the distribution of the ratio of such tweets per user.
first_chunk_df = tweets_process_res[0].copy()
first_chunk_df['has_gps'] = first_chunk_df['area'] == 0
gps_uids = first_chunk_df.loc[first_chunk_df['has_gps'], 'uid'].unique()
is_gps_user = first_chunk_df['uid'].isin(gps_uids)
# Number of tweets of each user, with and without a null area.
gps_users_counts = (first_chunk_df.loc[is_gps_user]
                                  .groupby(['uid', 'has_gps'])
                                  .size()
                                  .rename('count')
                                  .to_frame())
user_totals = gps_users_counts.groupby('uid')['count'].sum().rename('sum')
gps_users_counts = gps_users_counts.join(user_totals)
gps_ratio_df = gps_users_counts.reset_index()
gps_ratio_df = gps_ratio_df.loc[gps_ratio_df['has_gps']]
gps_ratio_df['ratio'] = gps_ratio_df['count'] / gps_ratio_df['sum']
gps_ratio_df['ratio'].describe()

If there's one or more cells where a user tweeted in proportion more than relevant_th of the time, we take among these cells the one where they tweeted the most outside work hours. Otherwise, we take the relevant place where they tweeted the most outside work hours, or we default to the place where they tweeted the most.

In [None]:
# Label template of the user-level counts, formatted below with the readable
# name of each language.
user_level_label = '{}-speaking users'
# Threshold on proportions, applied below both to the languages of a user
# (share of their tweets) and to their cells (share of their tweets in cells).
relevant_th = 0.1
plot_langs_dict = make_config.langs_dict(area_dict, user_level_label)

### Language(s) attribution

 Here we get rid of users whose language we couldn't identify

In [None]:
# Residence attribution is the longest to run, and by a long shot, so we'll start
# with language to filter out uids in tweets_df before doing it
groupby_cols = ['uid', 'cld_lang']
user_langs_counts = None
for res in tweets_process_res:
    tweets_lang_df = res.copy()
    # Here we don't filter out based on max_place_area, because these tweets
    # are still useful for language attribution.
    tweets_lang_df = tweets_lang_df.loc[tweets_lang_df['cld_lang'].notnull()]
    user_langs_counts = join_and_count.increment_counts(
        user_langs_counts, tweets_lang_df, groupby_cols)

# A language is attributed to a user if it accounts for a proportion of their
# tweets above relevant_th.
total_per_user = user_langs_counts.groupby('uid')['count'].sum().rename('user_count')
user_langs_agg = user_langs_counts.join(total_per_user).assign(
    prop_lang=lambda df: df['count'] / df['user_count'])
user_langs_agg = user_langs_agg.loc[user_langs_agg['prop_lang'] > relevant_th]
# The levels of a MultiIndex are not pruned when rows are filtered out: read
# as is, levels[0] would still contain the uids which lost all their rows in
# the filter above.
uid_with_lang = user_langs_agg.index.remove_unused_levels().levels[0].values
print(f'We were able to attribute at least one language to {len(uid_with_lang)}'
      ' users')

In [None]:
# Number of users to which each language was attributed, in descending order.
area_langs_counts = (user_langs_agg.groupby('cld_lang')
                                   .size()
                                   .rename('count')
                                   .sort_values(ascending=False))
# Number of users with at least one attributed language. Unused levels are
# removed first, because the levels of a MultiIndex keep the uids whose rows
# were all filtered out earlier on.
total_count = len(user_langs_agg.index.remove_unused_levels().levels[0])
top_langs = area_langs_counts.index.values[:10]
top_counts = area_langs_counts.values[:10]

plt.bar(top_langs, top_counts)
plt.title(f'Ten languages with the most speakers in {country_name}')
plt.ylabel('number of speakers')
save_path = os.path.join(fig_dir, f'top_langs_speakers_count_cc={cc}.pdf')
plt.savefig(save_path)
plt.show()
plt.clf()

# Same plot, as a proportion of the users with an attributed language.
plt.bar(top_langs, top_counts/total_count)
plt.title(f'Ten languages with the most speakers in {country_name}')
save_path = os.path.join(fig_dir, f'top_langs_speakers_prop_cc={cc}.pdf')
plt.ylabel('proportion of the users speaking')
plt.savefig(save_path)
plt.show()
plt.clf()

Attribute users to a group: mono, bi, tri, ... lingual

Problem: need more tweets to detect multilingualism, eg users with only three tweets in the dataset are very unlikely to be detected as multilinguals

In [None]:
# Attribute to every user a group label made of 'ling_' followed by the
# concatenation of the local languages attributed to them.
local_langs = [lang for lang in plot_langs_dict]
users_ling_grp = user_langs_agg.reset_index(level='cld_lang')
# user_langs_agg is sorted by user and language, because of the groupby in
# increment counts. Thus, when we concatenate the languages with sum() here,
# they're already sorted so we won't get both 'frit' and 'itfr' for instance.
local_mask = users_ling_grp['cld_lang'].isin(local_langs)
# The final groupby moves 'ling_grp' to the index, next to 'uid', giving a
# DataFrame indexed by (uid, ling_grp).
users_ling_grp = (users_ling_grp.loc[local_mask, 'cld_lang']
                                .groupby('uid')
                                .apply(lambda langs: 'ling_'+langs.sum())
                                .rename('ling_grp')
                                .to_frame()
                                .groupby(['uid', 'ling_grp'])
                                .first())

In [None]:
# Number of users in each linguistic group, in descending order.
ling_counts = (users_ling_grp.reset_index()
                            .groupby('ling_grp')
                            .size()
                            .sort_values(ascending=False))
# multiling_grps is reused when generating the cell data.
multiling_grps = ling_counts.index.values
# Strip the 'ling_' prefix (5 characters) for the labels of the x axis.
x_plot = [grp[5:] for grp in multiling_grps]
plt.bar(x_plot, ling_counts)
plt.title(f'Local languages groups in {country_name}')
plt.ylabel('number of speakers')
save_path = os.path.join(fig_dir, f'multilinguals_count_cc={cc}.pdf')
plt.savefig(save_path)
plt.show()
plt.clf()

# Same plot as a proportion, using the total_count computed in the cell
# plotting the top languages.
plt.bar(x_plot, ling_counts/total_count)
plt.title(f'Local languages groups in {country_name}')
save_path = os.path.join(fig_dir, f'multilinguals_prop_cc={cc}.pdf')
plt.ylabel('proportion of the users speaking')
plt.savefig(save_path)
plt.show()
plt.clf()

### Pre-residence attribution

In [None]:
def prep_resid_attr(tweets_lang_df, cells_in_area_df, uid_with_lang, 
                    max_place_area, cc_timezone):
    """Prepare the tweets of a chunk for the residence attribution.

    Only the tweets of the users in `uid_with_lang` with an area below
    `max_place_area` are kept. They get two new columns: 'hour', the hour of
    the tweet in the timezone `cc_timezone`, and 'isin_workhour', whether it
    was made within work hours. The values of 'created_at' are taken to be
    naive UTC timestamps, as they are localized to UTC before the conversion.

    Returns a tuple (tweets with a null area joined to the cell of
    `cells_in_area_df` containing them, remaining tweets).
    """
    # We filter out users to which we couldn't attribute even one language
    uid_mask = tweets_lang_df['uid'].isin(uid_with_lang)
    relevant_area_mask = tweets_lang_df['area'] < max_place_area
    tweets_df = tweets_lang_df.loc[uid_mask & relevant_area_mask].copy()
    local_time = (tweets_df['created_at'].dt.tz_localize('UTC')
                                         .dt.tz_convert(cc_timezone))
    tweets_df['hour'] = local_time.dt.hour
    # Tweets are considered in work hours if they were made between 8 and 18
    # outside of the week-end (weekday goes from 0 (Monday) to 6 (Sunday)).
    # The weekday is read from the local time, like the hour: around midnight
    # the UTC weekday can differ from the local one.
    tweets_df['isin_workhour'] = (
        (tweets_df['hour'] > 7)
        & (tweets_df['hour'] < 18)
        & (local_time.dt.weekday < 5))

    has_gps = tweets_df['area'] == 0
    # geopd adds an underscore by itself to the suffix
    tweets_cells_df = geopd.sjoin(tweets_df.loc[has_gps], cells_in_area_df,
        op='within', rsuffix='cell', how='inner')
    tweets_places_df = tweets_df.loc[~has_gps]
    print('chunk done')
    return tweets_cells_df, tweets_places_df


# Prepare every processed chunk for the residence attribution, in a pool of 8
# worker processes.
with mp.Pool(8) as pool:
    map_parameters = [(res, cells_in_area_df, uid_with_lang, max_place_area, 
                       cc_timezone) 
                      for res in tweets_process_res]
    print('entering the loop')
    tweets_pre_resid_res = (
        pool.starmap_async(prep_resid_attr, map_parameters).get())
    
# Accumulators of the counts over all the chunks.
user_places_habits = None
user_cells_habits = None
for res in tweets_pre_resid_res:
    # We first count the number of times a user has tweeted in each place inside
    # and outside work hours.
    tweets_places_df = res[1]
    groupby_cols = ['uid', 'place_id', 'isin_workhour']
    user_places_habits = join_and_count.increment_counts(
        user_places_habits, tweets_places_df, groupby_cols)
    # Then we do the same thing except in each cell, using the tweets with
    # coordinates.
    tweets_cells_df = res[0]
    groupby_cols = ['uid', 'cell_id', 'isin_workhour']
    user_cells_habits = join_and_count.increment_counts(
        user_cells_habits, tweets_cells_df, groupby_cols)

Here we took number of speakers, whether they're multilingual or monolingual, if they speak a language, they count as one in that language's count

Other possibility: pass the places counts to cells counts here, and then do the whole residence attribution solely
on a cell basis. Problem: if a user tags themselves in the same city all the time, and this city overlaps multiple cells, we'll have more than one cell with approximately the same count, and one cell takes it all in the end. It's a typical 'winner takes all' problem.

### Residence attribution

In [None]:
# We calculate for each user the total number of their tweets counted in
# cells, summed over all the cells and over work and non-work hours.
user_counts_in_cells = user_cells_habits.groupby('uid')['count'].sum().rename('user_count')
user_home_cell = user_cells_habits.join(user_counts_in_cells, how='inner')
# Share of this total represented by each row of user_cells_habits.
# NOTE(review): if there is one row per (uid, cell_id, isin_workhour), as the
# groupby columns used to build user_cells_habits suggest, this proportion is
# per cell AND work-hour flag, not per cell -- confirm this is intended.
user_home_cell['prop_in_cell'] = user_home_cell['count'] / user_home_cell['user_count']
# Among the rows above relevant_th and outside work hours, we keep for each
# user the cell with the highest count: after the ascending sort on 'count',
# last() returns it.
user_home_cell = (user_home_cell.loc[user_home_cell['prop_in_cell'] > relevant_th]
                                .xs(False, level='isin_workhour')
                                .reset_index()
                                .sort_values(by=['uid', 'count'])
                                .groupby('uid')['cell_id']
                                .last())

users_with_cell = user_home_cell.index.values
user_home_place = uagg.get_residence(user_places_habits, place_id_col='place_id')
# The place of residence is only kept for the users to which no cell of
# residence could be attributed.
user_only_place = user_home_place.reset_index()
user_only_place = (
    user_only_place.loc[~user_only_place['uid'].isin(users_with_cell)]
                   .set_index('uid')
                   .loc[:, 'place_id'])

### Generate cell data

TODO: remake plot_langs_dict to have labels for mono, bi, tri linguals, total and local and col names

In [None]:
# We get all the places with residents and the associated count
places_counts = (user_only_place.to_frame()
                                .groupby('place_id')
                                .size()
                                .rename('total_count')
                                .to_frame())
# Same for the cells with residents.
cells_counts = (user_home_cell.to_frame()
                              .groupby('cell_id')
                              .size()
                              .rename('total_count')
                              .to_frame())
# We count the number of users speaking a local language in each cell and place 
# of residence.
local_lang_users = user_langs_agg.reset_index(level='cld_lang')
local_langs = [lang for lang in plot_langs_dict]
local_langs_mask = local_lang_users['cld_lang'].isin(local_langs)
# first() keeps a single row per user, so that a user speaking several local
# languages is only counted once.
local_lang_users = (local_lang_users.loc[local_langs_mask]
                                    .groupby('uid')
                                    .first())
places_local_counts = uagg.to_count_by_area(local_lang_users, user_only_place,
                                            output_col='local_count')
cells_local_counts = uagg.to_count_by_area(local_lang_users, user_home_cell, 
                                           output_col='local_count')
# Then we get the counts of speakers by language and cell
places_langs_counts = uagg.to_count_by_area(user_langs_agg, user_only_place)
cells_langs_counts = uagg.to_count_by_area(user_langs_agg, user_home_cell)
# Then the counts of groups (mono-, bi-, tri-linguals):
places_ling_counts = uagg.to_count_by_area(users_ling_grp, user_only_place)
cells_ling_counts = uagg.to_count_by_area(users_ling_grp, user_home_cell)

# We always left join on places counts, because total_count == 0 implies
# that every other count is 0.
places_counts = places_counts.join(places_local_counts, how='left')
cells_counts = cells_counts.join(cells_local_counts, how='left')
# TODO: regroup in one for loop
# NOTE(review): in the two loops below, `xs` raises a KeyError when a group
# or language is absent from the cells counts or from the places counts --
# confirm that every key is always present in both.
for ling in multiling_grps:
    ling_count_col= f'count_{ling}'
    cells_in_that_grp_count = (cells_ling_counts.xs(ling, level='ling_grp')
                                                .rename(ling_count_col))
    places_in_that_grp_count = (places_ling_counts.xs(ling, level='ling_grp')
                                                  .rename(ling_count_col))
    cells_counts = cells_counts.join(cells_in_that_grp_count, how='left')
    places_counts = places_counts.join(places_in_that_grp_count, how='left')
    
for plot_lang, lang_dict in plot_langs_dict.items():
    lang_count_col = lang_dict['count_col']
    places_lang_counts = (places_langs_counts.xs(plot_lang, level='cld_lang')
                                             .rename(lang_count_col))
    cells_lang_counts = (cells_langs_counts.xs(plot_lang, level='cld_lang')
                                           .rename(lang_count_col))
    places_counts = places_counts.join(places_lang_counts, how='left')
    cells_counts = cells_counts.join(cells_lang_counts, how='left')

# We distribute the counts of the places among the cells they intersect.
cell_plot_df = cells_in_area_df.copy()
cells_in_places = places_to_cells.get_intersect(cell_plot_df, places_geodf, 
                                                places_counts, xy_proj=xy_proj)

# Then we add the total counts for the users with a cell of residence.
count_cols = places_counts.columns
cell_plot_df = places_to_cells.intersect_to_cells(
    cells_in_places, cell_plot_df, count_cols)
for col in count_cols:
    cell_plot_df = join_and_count.increment_join(
        cell_plot_df, cells_counts[col], count_col=col)
# 0 or 1?
cell_plot_df = cell_plot_df.loc[cell_plot_df['total_count'] > 0]

# Print the total number of speakers of each language as a check.
for plot_lang, lang_dict in plot_langs_dict.items():
    lang_count_col = lang_dict['count_col']
    level_lang_label = user_level_label.format(lang_dict['readable'])
    sum_lang = cell_plot_df[lang_count_col].sum()
    print(f'There are {sum_lang:.0f} {level_lang_label}.')

# NOTE(review): unlike at the tweet level, the index is not copied to a
# 'cell_id' column before saving, while the plotting cell reads
# cell_plot_df['cell_id'] back from this file -- confirm the column exists.
cell_data_path = cell_data_path_format.format('users', cc, cell_size)
cell_plot_df.to_file(cell_data_path, driver='GeoJSON')

TODO: save for every cc and cell_size with correct cell_id

### Plots

In [None]:
# Reload the user-level cell data from the saved file. Uncomment the line
# below to set cell_size when starting directly from this cell.
# cell_size = 10000
cell_data_path = cell_data_path_format.format('users', cc, cell_size)
cell_plot_df = geopd.read_file(cell_data_path)
cell_plot_df.index = cell_plot_df['cell_id']
# Both the cell data and the plot config are replaced by the values returned
# by metrics.calc_by_cell.
cell_plot_df, plot_langs_dict = metrics.calc_by_cell(cell_plot_df, 
                                                     plot_langs_dict)

In [None]:
# For each language to plot, save one map of the counts of speakers (log
# scale, not shown) and one map of their proportions (bounded between 0
# and 1).
for plot_lang, plot_dict in plot_langs_dict.items():
    count_lang_col = plot_dict['count_col']
    readable_lang = plot_dict['readable']
    save_path = os.path.join(fig_dir, 'counts',
        f'users_counts_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    plot_title = f'Distribution of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['count_label']
    plot_kwargs = dict(edgecolor='w', linewidths=0.2, cmap='Purples')
    ax_count = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=count_lang_col, save_path=save_path, 
        show=False, log_scale=True, title=plot_title, cbar_label=cbar_label,
        xy_proj=xy_proj, **plot_kwargs)
    
    prop_lang_col = plot_dict['prop_col']
    save_path = os.path.join(fig_dir, 'prop',
        f'users_prop_cc={cc}_lang={plot_lang}_cell_size={cell_size}m.pdf')
    plot_title = f'Predominance of {readable_lang} speakers in {country_name}'
    cbar_label = plot_dict['prop_label']
    # Avoid sequential colormaps starting or ending with white, as white is  
    # reserved for an absence of data
    plot_kwargs = dict(edgecolor='w', linewidths=0.2, cmap='plasma')
    ax_prop = grid_viz.plot_grid(
        cell_plot_df, shape_df, metric_col=prop_lang_col, save_path=save_path, 
        title=plot_title, cbar_label=cbar_label, vmin=0, vmax=1, xy_proj=xy_proj, 
        **plot_kwargs)

In [None]:
# Interactive map of the proportions at the user level, saved as HTML.
save_path = os.path.join(fig_dir, 
            f'users_prop_cc={cc}_cell_size={cell_size}m.html')
# Description of the metric to plot, with its bounds.
prop_dict = {'name': 'prop', 'readable': 'proportion', 'vmin': 0, 'vmax': 1}
fig = grid_viz.plot_interactive(
    cell_plot_df, shape_df, plot_langs_dict, prop_dict,
    save_path=save_path, plotly_renderer='iframe_connected', show=True)