In [None]:
# Reload all src modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

In [None]:
import os
import cProfile
import pandas as pd
import geopandas as geopd
import numpy as np
import multiprocessing as mp
import re
import gzip
try:
    import cld3
except ModuleNotFoundError:
    pass
import pycld2
from pyproj import Transformer
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
import folium
import src.utils.geometry as geo
import src.data.shp_extract as shp_extract
import src.data.tweets_cells_counts as tweets_counts
import src.data.text_process as text_process
import src.data.access as data_access
import src.visualization.grid_viz as grid_viz
import src.data.user_filters as ufilters
from dotenv import load_dotenv
# Load the variables of a local .env file (if any) into the process environment;
# the configuration cell below reads its paths and SSH settings from os.environ.
load_dotenv()

# Go back to pandas' default limit on displayed rows
pd.reset_option("display.max_rows")

Too small 'places' data: BO, TN

Limited 'places' data: LT: 69 and EE: 252 (only large cities), HK: 21 (only districts).

mixed distribution?

In [None]:
# Data location and SSH settings come from the environment
# (a KeyError is raised here if one of the variables is not set)
data_dir_path = os.environ['DATA_DIR']
# File name templates, formatted below with a country code
tweets_files_format = 'tweets_2015_2018_{}.json.gz'
places_files_format = 'places_2015_2018_{}.json.gz'
ssh_domain = os.environ['IFISC_DOMAIN']
ssh_username = os.environ['IFISC_USERNAME']
# Country codes considered in the project (presumably ISO 3166-1 alpha-2 -- confirm)
country_codes = ('BO', 'CA', 'CH', 'EE', 'ES', 'FR', 'HK','ID', 'LT', 'LV',
                'MY', 'PE', 'RO', 'SG', 'TN', 'UA')
# CRS identifiers: geographic lat/lon (EPSG:4326) and projected x/y (EPSG:3857)
latlon_proj = 'epsg:4326'
xy_proj = 'epsg:3857'
external_data_dir = '../data/external/'
fig_dir = '../reports/figures'
# Country code used by the cells of this notebook
cc = 'CH'

## Getting data

In [None]:
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
chunk_size = 100000
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=chunk_size, compression='gzip')
# Only the first chunk is needed for this exploration. The generator is closed
# in a `finally` so that it is released even if reading the chunk fails.
try:
    raw_tweets_df = next(raw_tweets_df_generator)
finally:
    raw_tweets_df_generator.close()

# Share of rows with a non-null 'coordinates' value, computed over the rows
# actually read: the chunk can hold fewer than `chunk_size` tweets.
ratio_coords = raw_tweets_df['coordinates'].notnull().mean()
print('{:.1%} of tweets have exact coordinates data'.format(ratio_coords))
nr_users = len(raw_tweets_df['uid'].unique())
print('There are {} distinct users in the dataset'.format(nr_users))
raw_tweets_df.head()

In [None]:
# Country outline: keep only the row of the shapefile whose 'FID' equals `cc`
shapefile_name = 'CNTR_RG_01M_2016_4326.shp'
shapefile_path = os.path.join(external_data_dir, shapefile_name, shapefile_name)
shape_df = geopd.read_file(shapefile_path)
shape_df = shape_df.loc[lambda shapes: shapes['FID'] == cc]

# Places of the same country, read in one go from the gzipped JSON file
places_file_path = os.path.join(data_dir_path, places_files_format.format(cc))
raw_places_df = data_access.return_json(
    places_file_path,
    ssh_domain=ssh_domain,
    ssh_username=ssh_username,
    compression='gzip',
)
raw_places_df.head()

Get the most frequent, small enough place: if it is the most frequent -> select it; if it lies within a more frequent, bigger place -> select it.

If there is no small enough place, discard the user.

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only appended a stray "None" to the output.
raw_tweets_df.info()

The "I'm at \<place\>" tweets from Foursquare are also there, and they all have 'source' = `<a href="http://foursquare.com" rel="nofollow">Foursquare</a>`. Tweetbot is an app for regular users; it is not related to bot users.

In [None]:
# Value found in 'in_reply_to_status_id' when the tweet is not a reply
# (presumably the hash of an empty id -- TODO confirm): blank the reply columns there.
null_reply_id = 'e39d05b72f25767869d44391919434896bb055772d7969f74472032b03bc18418911f3b0e6dd47ff8f3b2323728225286c3cb36914d28dc7db40bdd786159c0a'
raw_tweets_df.loc[
    raw_tweets_df['in_reply_to_status_id'] == null_reply_id,
    ['in_reply_to_status_id', 'in_reply_to_screen_name', 'in_reply_to_user_id'],
] = None

# Working frame: the columns of interest, Twitter's language tag renamed, plus
# the application name extracted from the HTML anchor stored in 'source'.
tweets_df = (
    raw_tweets_df[['text', 'id', 'lang', 'place_id', 'coordinates', 'uid', 'created_at']]
    .rename(columns={'lang': 'twitter_lang'})
    .assign(source=raw_tweets_df['source'].str.extract(r'>(.+)</a>', expand=False))
)
tweets_df['source'].value_counts().head(20)

In [None]:
# Tweets whose 'source' mentions 'tweetmyjobs'. na=False counts a missing
# 'source' as a non-match, so the boolean mask never contains NaN (a mask
# with NaN cannot be used for indexing).
a = raw_tweets_df[raw_tweets_df['source'].str.contains('tweetmyjobs', na=False)]
# Drop the columns that are not needed for the inspection and order by user and time
a = (a.drop(columns=['in_reply_to_status_id', 'id', 'source',  
                'in_reply_to_screen_name', 'in_reply_to_user_id', 'quoted_status_id'])
    .sort_values(by=['uid', 'created_at']))
# Show every row of the selected user's tweets
pd.set_option("display.max_rows", None)
a[a['uid'] == '066669353196d994d624138aa1ef4aafd892ed8e1e6e65532a39ecc7e6129b829bdbf8ea2b53b11f93a74cb7d1a3e1aa537d0c060be02778b37550d70a77a80d']


## First tests on single df

In [None]:
# NOTE(review): ref_year and nr_consec_months are defined but not passed to any
# call in this cell -- confirm whether ufilters.consec_months should receive them.
ref_year = 2015
nr_consec_months = 3
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=1000000, compression='gzip')
agg_tweeted_months_users = pd.DataFrame([], columns=['uid', 'month', 'count'])
tweets_df_list = []
# Accumulate the per-user monthly activity chunk by chunk. The generator is
# closed in a `finally` so that it is released even if a chunk fails to process.
try:
    for raw_tweets_df in raw_tweets_df_generator:
        tweets_df_list.append(raw_tweets_df)
        agg_tweeted_months_users = ufilters.inc_months_activity(
                agg_tweeted_months_users, raw_tweets_df)
finally:
    raw_tweets_df_generator.close()
local_uid_series = ufilters.consec_months(agg_tweeted_months_users)

In [None]:
ref_year = 2015
nr_consec_months = 3
# Same aggregation as over the chunks, applied to the single frame tweets_df,
# starting from an empty (uid, month, count) table.
tweeted_months_users = ufilters.inc_months_activity(
    pd.DataFrame([], columns=['uid', 'month', 'count']),
    tweets_df,
)
local_uid_series = ufilters.consec_months(tweeted_months_users)

In [None]:
# Ten most frequent values of the 'lang' column of raw_tweets_df
raw_tweets_df['lang'].value_counts().head(10)

In [None]:
# Same ranking after an inner join with local_uid_series on 'uid', i.e. keeping
# only the rows whose 'uid' is found in the index of local_uid_series
raw_tweets_df.join(local_uid_series, on='uid', how='inner')['lang'].value_counts().head(10)

In [None]:
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=1000000, compression='gzip')
# Close the generator once done (or on error), as the other loading cells do.
try:
    for raw_tweets_df in raw_tweets_df_generator:
        # TODO: the chunk is not used yet -- the same frame is rebuilt from
        # local_uid_series at every iteration; the actual filtering is missing.
        filtered_tweets_df = pd.DataFrame(local_uid_series)
finally:
    raw_tweets_df_generator.close()

## Language detection

### Detected languages

- Languages possibly detected by CLD:

In [None]:
# Lookup built from pycld2.LANGUAGES, then paired with every entry of
# pycld2.DETECTED_LANGUAGES so each language is printed along with its code.
lang_with_code = dict(pycld2.LANGUAGES)
detected_lang_with_code = [
    (lang_name, lang_with_code[lang_name])
    for lang_name in pycld2.DETECTED_LANGUAGES
]
print(detected_lang_with_code)

- Languages possibly detected by Twitter (see 'lang' in https://support.gnip.com/apis/powertrack2.0/rules.html#Operators):

Amharic - am
Arabic - ar
Armenian - hy
Bengali - bn
Bulgarian - bg
Burmese - my
Chinese - zh
Czech - cs
Danish - da
Dutch - nl
English - en
Estonian - et
Finnish - fi
French - fr
Georgian - ka
German - de
Greek - el
Gujarati - gu
Haitian - ht
Hebrew - iw
Hindi - hi
Hungarian - hu
Icelandic - is
Indonesian - in
Italian - it
Japanese - ja
Kannada - kn
Khmer - km
Korean - ko
Lao - lo
Latvian - lv
Lithuanian - lt
Malayalam - ml
Maldivian - dv
Marathi - mr
Nepali - ne
Norwegian - no
Oriya - or
Panjabi - pa
Pashto - ps
Persian - fa
Polish - pl
Portuguese - pt
Romanian - ro
Russian - ru
Serbian - sr
Sindhi - sd
Sinhala - si
Slovak - sk
Slovenian - sl
Sorani Kurdish - ckb
Spanish - es
Swedish - sv
Tagalog - tl
Tamil - ta
Telugu - te
Thai - th
Tibetan - bo
Turkish - tr
Ukrainian - uk
Urdu - ur
Uyghur - ug
Vietnamese - vi
Welsh - cy

In [None]:
# Run the project's language detection on the 'text' column using pycld2.
# min_nr_words=4 is forwarded to text_process.lang_detect (presumably the minimum
# number of words required to attempt a detection -- confirm in that module).
tweets_lang_df = text_process.lang_detect(tweets_df, text_col='text', min_nr_words=4, cld='pycld2')
tweets_lang_df.head()

In [None]:
# Sorted distinct language codes reported by each detector
cld_langs = np.sort(tweets_lang_df['cld_lang'].unique())
print(f'Languages detected by cld: {cld_langs}')
twitter_langs = np.sort(tweets_lang_df['twitter_lang'].unique())
print(f'Languages detected by twitter: {twitter_langs}')

In [None]:
# Ten most frequent language tags according to Twitter
tweets_lang_df['twitter_lang'].value_counts().head(10)

In [None]:
# Ten most frequent language tags according to CLD
tweets_lang_df['cld_lang'].value_counts().head(10)

French case: Corsican is unreliably detected by CLD for French tweets; however, the detection seems pretty accurate when twitter_lang='it'.

### Multilingual users

In [None]:
# Number of tweets per (user, Twitter language), ignoring tweets tagged 'und'
groupby_user_lang = (
    tweets_lang_df[tweets_lang_df['twitter_lang'].ne('und')]
    .groupby(['uid', 'twitter_lang'])
)
count_tweets_by_user_lang = groupby_user_lang.size()
# Number of distinct languages of each user, repeated on every row of that user
# so that it can serve as a mask on count_tweets_by_user_lang
count_langs_by_user_df = (
    count_tweets_by_user_lang.groupby(level='uid').transform('size')
)
# Keep the users who tweeted in at least two languages
multiling_users_df = count_tweets_by_user_lang[count_langs_by_user_df > 1]
pd.DataFrame(multiling_users_df)

In [None]:
pd.set_option("display.max_rows", 100)
# First level of the (uid, twitter_lang) index: the ids of the multilingual users
multiling_users_list = [index_key[0] for index_key in multiling_users_df.index]
# Their tweets, reduced to the columns needed to compare the two detections
(
    tweets_lang_df
    .loc[tweets_lang_df['uid'].isin(multiling_users_list),
         ['uid', 'filtered_text', 'cld_lang', 'twitter_lang', 'created_at']]
    .sort_values(by=['uid', 'cld_lang'])
)

## Places into geodf and join on tweets

Calculate the area in order to discard bounding boxes that are too large? Problem: the geometries need to be projected first, which is expensive.

In [None]:
# Tweets without exact coordinates have to be located through their place
tweets_to_loc_df = tweets_lang_df.loc[tweets_lang_df['coordinates'].isnull()]
# Pass the CRS as a plain authority string, like `xy_proj` below: the
# {'init': ...} dict form is deprecated by pyproj >= 2 and emits a warning.
crs = latlon_proj
places_df = raw_places_df[['id', 'bounding_box', 'name', 'place_type']]
# Polygon built from the first coordinate ring of every bounding box
geometry = places_df['bounding_box'].apply(lambda x: Polygon(x['coordinates'][0]))
places_geodf = geopd.GeoDataFrame(places_df, crs=crs, geometry=geometry)
places_geodf = places_geodf.set_index('id')
places_geodf = places_geodf.drop(columns=['bounding_box'])
# Area computed after projecting to `xy_proj`, in the units of that projection
places_geodf['area'] = places_geodf.geometry.to_crs(xy_proj).area
# Attach the place attributes to each tweet; tweets with an unknown place are kept
tweets_final_df = tweets_to_loc_df.join(places_geodf, on='place_id', how='left')
tweets_final_df.head(10)

### Corsican?

In [None]:
# Tweets that CLD labels 'co' while Twitter labels 'it'
tweets_final_df.loc[(tweets_final_df['cld_lang'] =='co') & (tweets_final_df['twitter_lang'] =='it')]

CLD is sensitive to letter repetitions used for emphasis: we could set a threshold so that a run of more than 3 identical consecutive letters is reduced to 2; this seems to improve the prediction on the example.

Usually twitter's prediction seems better...

In [None]:
# Tweets on which the CLD and Twitter language tags differ (the 'id' column is dropped from the display)
tweets_final_df[tweets_final_df['cld_lang'] != tweets_final_df['twitter_lang']].drop(columns=['id'])

### Swiss German?

In [None]:
# Index label of the first place named 'Zurich' (IndexError if there is none)
zurich_id = places_geodf.index[places_geodf['name'] == 'Zurich'][0]
# Places whose geometry lies within the geometry of that place
places_in_zurich = places_geodf[
    places_geodf.within(places_geodf.loc[zurich_id, 'geometry'])
]
places_in_zurich

In [None]:
# Inner join: keep only the tweets whose place is one of the Zurich places.
# DataFrame.join defaults to how='left', which kept every tweet of
# tweets_final_df and made the counts below country-wide instead of Zurich-only.
tweets_in_zurich = tweets_final_df.join(
    places_in_zurich, on='place_id', how='inner', rsuffix='_place')
print(tweets_in_zurich['cld_lang'].value_counts().head())
print(tweets_in_zurich['twitter_lang'].value_counts().head())

In [None]:
# Filtered text of the rows of tweets_in_zurich that Twitter tags 'de' while CLD
# returns 'un' (presumably CLD's code for an unknown language -- confirm)
tweets_in_zurich.loc[(tweets_in_zurich['cld_lang']=='un') & (tweets_in_zurich['twitter_lang']=='de'), 
                     'filtered_text']

It seems these are mostly mixed-language tweets, which Twitter does not detect:

In [None]:
# Filtered text of the rows of tweets_in_zurich that Twitter tags 'und'
tweets_in_zurich.loc[tweets_in_zurich['twitter_lang']=='und', 
                     'filtered_text']

## groupbys and stuff

In [None]:
def get_mean_time(df, dt_col):
    """Return the mean time of day of the rows of `df` as a Timedelta.

    The time of day is expressed in seconds since midnight, averaged over the
    rows, truncated to whole seconds and returned as a `pandas.Timedelta`.

    Parameters
    ----------
    df : pandas.DataFrame
        Either holds precomputed 'hour', 'minute' and 'second' columns (used
        in priority, as extracting them once beforehand is faster when this
        function is applied group by group), or a datetime column `dt_col`.
    dt_col : str
        Name of the datetime column the time components are read from when
        the precomputed columns are not all present.

    Returns
    -------
    pandas.Timedelta
        Mean time of day, or `pandas.NaT` when there is nothing to average.
    """
    time_cols = ('hour', 'minute', 'second')
    if all(col in df.columns for col in time_cols):
        hours, minutes, seconds = (df[col] for col in time_cols)
    else:
        # Fall back on the datetime column instead of failing with a KeyError
        dt_accessor = df[dt_col].dt
        hours, minutes, seconds = (
            dt_accessor.hour, dt_accessor.minute, dt_accessor.second)
    t_series_in_sec_of_day = hours*3600 + minutes*60 + seconds
    mean_sec_of_day = t_series_in_sec_of_day.mean()
    # The mean of an empty (or all-null) series is NaN, which int() rejects
    if pd.isna(mean_sec_of_day):
        return pd.NaT
    return pd.to_timedelta(int(mean_sec_of_day), unit='s')

In [None]:
# Extracting the hour, minute and second once on the whole frame is faster
# than doing it inside every group
tweets_df = raw_tweets_df.assign(
    hour=lambda tweets: tweets['created_at'].dt.hour,
    minute=lambda tweets: tweets['created_at'].dt.minute,
    second=lambda tweets: tweets['created_at'].dt.second,
)
groupby_user_place = tweets_df.groupby(['uid', 'place_id'])
# Number of tweets and mean time of day for every (user, place) pair
count_tweets_by_user_place = groupby_user_place.size().rename('count')
mean_time_by_user_place = (
    groupby_user_place
    .apply(lambda group: get_mean_time(group, 'created_at'))
    .rename('avg time')
)
# `transform` keeps one value per (user, place) row, which gives a Series
# aligned with the aggregated data, usable as a boolean mask, for instance to
# select the users with more than one place
count_places_by_user_df = (
    count_tweets_by_user_place.groupby('uid').transform('size')
)
agg_data_df = pd.concat(
    [count_tweets_by_user_place, mean_time_by_user_place], axis=1)
# Add the attributes of the places
count_tweets_by_user_place_geodf = agg_data_df.join(places_geodf, on='place_id')
count_tweets_by_user_place_geodf.head()

In [None]:
# Profile the per-group mean time computation. cProfile.run executes the statement
# string in the __main__ namespace, so groupby_user_place and get_mean_time must
# already be defined by the cells above.
cProfile.run("groupby_user_place.apply(lambda df: get_mean_time(df, 'created_at'))")

In [None]:
# Rows of the users for whom more than one (uid, place_id) pair was counted
count_tweets_by_user_place_geodf.loc[count_places_by_user_df > 1]

Add new chunk to cumulative data:

In [None]:
# The frame's own 'count' column stands in for the counts of a new chunk here.
# The outer join keeps the (uid, place_id) pairs present on either side.
count_tweets_by_user_place_geodf = count_tweets_by_user_place_geodf.join(
    count_tweets_by_user_place_geodf['count'], 
    on=['uid', 'place_id'], how='outer', rsuffix='_new')
# A pair found on one side only has a NaN count on the other side: treat it as
# 0 so that the cumulative count is not turned into NaN by the addition.
count_tweets_by_user_place_geodf['count'] = (
    count_tweets_by_user_place_geodf['count'].add(
        count_tweets_by_user_place_geodf['count_new'], fill_value=0))
count_tweets_by_user_place_geodf = count_tweets_by_user_place_geodf.drop(
    columns=['count_new'])
count_tweets_by_user_place_geodf