In [1]:
# Reload all src modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

In [2]:
import os
import cProfile
import pandas as pd
import geopandas as geopd
import numpy as np
import multiprocessing as mp
import re
import gzip
try:
    import cld3
except ModuleNotFoundError:
    pass
import pycld2
from pyproj import Transformer
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
import folium
import src.utils.geometry as geo
import src.data.shp_extract as shp_extract
import src.data.tweets_cells_counts as tweets_counts
import src.data.text_process as text_process
import src.data.access as data_access
import src.visualization.grid_viz as grid_viz
import src.data.user_filters as ufilters
from dotenv import load_dotenv
load_dotenv()

pd.reset_option("display.max_rows")

Too small 'places' data: BO, TN

Limited 'places' data: LT: 69 and EE: 252 (only large cities), HK: 21 (only districts), 

In [3]:
data_dir_path = os.environ['DATA_DIR']
tweets_files_format = 'tweets_2015_2018_{}.json.gz'
places_files_format = 'places_2015_2018_{}.json.gz'
ssh_domain = os.environ['IFISC_DOMAIN']
ssh_username = os.environ['IFISC_USERNAME']
country_codes = ('BO', 'CA', 'CH', 'EE', 'ES', 'FR', 'HK','ID', 'LT', 'LV',
                'MY', 'PE', 'RO', 'SG', 'TN', 'UA')
latlon_proj = 'epsg:4326'
xy_proj = 'epsg:3857'
external_data_dir = '../data/external/'
fig_dir = '../reports/figures'
cc = 'CH'

## Getting data

In [4]:
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
chunk_size = 100000
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=chunk_size, compression='gzip')
for i,raw_tweets_df in enumerate(raw_tweets_df_generator):
    break

raw_tweets_df_generator.close()
ratio_coords = len(raw_tweets_df.loc[raw_tweets_df['coordinates'].notnull()]) / chunk_size
print('{:.1%} of tweets have exact coordinates data'.format(ratio_coords))
nr_users = len(raw_tweets_df['uid'].unique())
print('There are {} distinct users in the dataset'.format(nr_users))
raw_tweets_df.head()

18.1% of tweets have exact coordinates data
There are 11566 distinct users in the dataset


Unnamed: 0,text,in_reply_to_status_id,id,source,coordinates,timestamp_ms,in_reply_to_screen_name,in_reply_to_user_id,lang,created_at,uid,place_id,quoted_status_id
0,"Ah bueno, aprecio tanto como me ignorassssssss",e39d05b72f25767869d44391919434896bb055772d7969...,5d36fe19b2ab1178560fafb6e2270197fe8e006f620861...,"<a href=""http://twitter.com/download/android"" ...","{'type': 'Point', 'coordinates': [-68.8684979,...",2015-10-05 19:43:29.657,e39d05b72f25767869d44391919434896bb055772d7969...,e39d05b72f25767869d44391919434896bb055772d7969...,es,2015-10-05 19:43:29,17745d10f17d7e3fc0da15997e88676c804f627eeadca4...,4e7c21fd2af027c6,
1,i miss them both #ALDUBMissingYou,e39d05b72f25767869d44391919434896bb055772d7969...,412e85cce06bb37c3b0752467baac07c51dd7d93f6c7db...,"<a href=""http://twitter.com/download/android"" ...","{'type': 'Point', 'coordinates': [121.0115236,...",2015-10-11 23:31:42.311,e39d05b72f25767869d44391919434896bb055772d7969...,e39d05b72f25767869d44391919434896bb055772d7969...,en,2015-10-11 23:31:42,28c6e28fc7cc605cac4298aea7a8592ac4f8a5155093af...,5868d0e0749c9c47,
2,https://t.co/Cmc6DLwx5A,e39d05b72f25767869d44391919434896bb055772d7969...,1bcc9cceac0bc74b6c0b6ae63c3873321e0cffbab82dc6...,"<a href=""http://twitter.com/download/android"" ...","{'type': 'Point', 'coordinates': [-73.9303333,...",2015-12-25 19:24:21.852,e39d05b72f25767869d44391919434896bb055772d7969...,e39d05b72f25767869d44391919434896bb055772d7969...,und,2015-12-25 19:24:21,6e7e3ad5746ecfee12d788cb30eb99484e92ae4558a4a7...,4e7c21fd2af027c6,
3,Follow me:\nSnapchat: @c9fb9d1daebb152f5316733...,e39d05b72f25767869d44391919434896bb055772d7969...,f11720130f52da86725d04d96525a2365cc1a367d68ff7...,"<a href=""http://twitter.com/download/android"" ...","{'type': 'Point', 'coordinates': [120.5687683,...",2015-12-30 14:27:31.663,e39d05b72f25767869d44391919434896bb055772d7969...,e39d05b72f25767869d44391919434896bb055772d7969...,en,2015-12-30 14:27:31,c25018eacaf19cb4679ffbe7b6cdd26d6010c2e0e2cc2f...,99bc40068c24ecd8,
4,My morning mantra: I can survive through Justi...,e39d05b72f25767869d44391919434896bb055772d7969...,15745701d2d0bd8229378a8bc07b5b2576d63d72c0779e...,"<a href=""http://twitter.com/download/iphone"" r...",,2015-08-31 09:39:44.029,e39d05b72f25767869d44391919434896bb055772d7969...,e39d05b72f25767869d44391919434896bb055772d7969...,en,2015-08-31 09:39:44,7dd8675d6ea158894d34ea2b98b2fe0bbc66637e1f6f53...,ddcca24ee29ddff2,


In [5]:
places_file_path = os.path.join(data_dir_path, places_files_format.format(cc))
shapefile_name = 'CNTR_RG_01M_2016_4326.shp'
shapefile_path = os.path.join(external_data_dir, shapefile_name, shapefile_name)
shape_df = geopd.read_file(shapefile_path)
shape_df = shape_df.loc[shape_df['FID'] == cc]
raw_places_df = data_access.return_json(places_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, compression='gzip')
raw_places_df.head()

Unnamed: 0,attributes,bounding_box,country,country_code,full_name,id,name,place_type,url
0,{},"{'coordinates': [[[5.955894, 45.817792], [5.95...",Schweiz,CH,Suiza,4e7c21fd2af027c6,Suiza,country,https://api.twitter.com/1.1/geo/id/4e7c21fd2af...
1,{},"{'coordinates': [[[6.861601, 46.326441], [6.86...",Schweiz,CH,"Berne, Switzerland",5868d0e0749c9c47,Bern,admin,https://api.twitter.com/1.1/geo/id/5868d0e0749...
2,{},"{'coordinates': [[[9.400411, 47.496178], [9.40...",Schweiz,CH,"Arbon, Thurgau",99bc40068c24ecd8,Arbon,city,https://api.twitter.com/1.1/geo/id/99bc40068c2...
3,{},"{'coordinates': [[[6.127588, 46.168188], [6.12...",Schweiz,CH,"Carouge (GE), Genève",ddcca24ee29ddff2,Carouge (GE),city,https://api.twitter.com/1.1/geo/id/ddcca24ee29...
4,{},"{'coordinates': [[[9.491093, 46.750148], [9.49...",Schweiz,CH,"Churwalden, Graubünden",1a5f7565464884a4,Churwalden,city,https://api.twitter.com/1.1/geo/id/1a5f7565464...


Get most frequent, small enough place: if most frequent -> select it, if within more frequent bigger place -> select it, 

If not small enough place, discard the user

In [7]:
print(raw_tweets_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 13 columns):
text                       1000000 non-null object
in_reply_to_status_id      1000000 non-null object
id                         1000000 non-null object
source                     1000000 non-null object
coordinates                291348 non-null object
timestamp_ms               1000000 non-null datetime64[ns]
in_reply_to_screen_name    1000000 non-null object
in_reply_to_user_id        1000000 non-null object
lang                       1000000 non-null object
created_at                 1000000 non-null datetime64[ns]
uid                        1000000 non-null object
place_id                   1000000 non-null object
quoted_status_id           33451 non-null object
dtypes: datetime64[ns](2), object(11)
memory usage: 99.2+ MB
None
There are 61472 distinct users in the dataset


The "I'm at \<place\>" from Foursquare are also there, and they all have 'source' = <a href="http://foursquare.com" rel="nofollow">Foursquare</a>. Tweetbot is an app for regular users, it's not related to bot users.

In [8]:
tweets_df = raw_tweets_df[['text', 'id', 'lang', 'place_id', 'coordinates', 'uid', 'created_at']]
tweets_df = tweets_df.rename(columns={'lang': 'twitter_lang'})
null_reply_id = 'e39d05b72f25767869d44391919434896bb055772d7969f74472032b03bc18418911f3b0e6dd47ff8f3b2323728225286c3cb36914d28dc7db40bdd786159c0a'
raw_tweets_df.loc[raw_tweets_df['in_reply_to_status_id'] == null_reply_id, 
    ['in_reply_to_status_id', 'in_reply_to_screen_name', 'in_reply_to_user_id']] = None
tweets_df['source'] = raw_tweets_df['source'].str.extract(r'>(.+)</a>', expand=False)
tweets_df['source'].value_counts().head(20)

Twitter for iPhone             44359
Twitter Web Client             19632
Twitter for Android            14912
Instagram                       8268
Twitter for iPad                2351
Tweetbot for iΟS                1996
Foursquare                      1951
TweetMyJOBS                      802
Sandaysoft Cumulus               710
dlvr.it                          567
Busted App                       380
TTYtter                          351
Twitter for Windows Phone        343
www.kartenquiz.de                326
Tweetbot for Mac                 319
twitterfeed                      313
iOS                              313
MapGame                          297
World Cities                     254
Twitter for Android Tablets      217
Name: source, dtype: int64

In [None]:
a = raw_tweets_df[raw_tweets_df['source'].str.contains('tweetmyjobs')]
a = (a.drop(columns=['in_reply_to_status_id', 'id', 'source',  
                'in_reply_to_screen_name', 'in_reply_to_user_id', 'quoted_status_id'])
    .sort_values(by=['uid', 'created_at']))
pd.set_option("display.max_rows", None)
a[a['uid'] == '066669353196d994d624138aa1ef4aafd892ed8e1e6e65532a39ecc7e6129b829bdbf8ea2b53b11f93a74cb7d1a3e1aa537d0c060be02778b37550d70a77a80d']


## First tests on single df

In [17]:
ref_year = 2015
nr_consec_months = 3
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=1000000, compression='gzip')
agg_tweeted_months_users = pd.DataFrame([], columns=['uid', 'month'])
tweets_df_list = []
for raw_tweets_df in raw_tweets_df_generator:
    tweets_df_list.append(raw_tweets_df)
    tweeted_months_users = ufilters.get_months_activity(raw_tweets_df)
    agg_tweeted_months_users = pd.concat([agg_tweeted_months_users, tweeted_months_users])
raw_tweets_df_generator.close()
local_uid_series = ufilters.consec_months(agg_tweeted_months_users)

KeyboardInterrupt: 

In [18]:
ref_year = 2015
nr_consec_months = 3
tweeted_months_users = ufilters.get_months_activity(tweets_df)
local_uid_series = ufilters.consec_months(tweeted_months_users)

There are 61472 distinct users in the dataset.
There are 10786 distinct users left in the dataset.
There are 3906 distinct users left in the dataset.


In [36]:
raw_tweets_df['lang'].value_counts().head(10)

en     128643
de      67800
fr      42027
und     39489
es      19339
it      14614
pt      13953
tr       9493
ar       6517
nl       4170
Name: lang, dtype: int64

In [37]:
raw_tweets_df.join(local_uid_series, on='uid', how='inner')['lang'].value_counts().head(10)

en     64087
de     45258
fr     25252
und    22067
es     11251
it      9493
pt      5676
tr      4977
ar      3259
nl      2742
Name: lang, dtype: int64

In [None]:
tweets_file_path = os.path.join(data_dir_path, tweets_files_format.format(cc))
raw_tweets_df_generator = data_access.yield_json(tweets_file_path, 
    ssh_domain=ssh_domain, ssh_username=ssh_username, chunk_size=1000000, compression='gzip')
for raw_tweets_df in raw_tweets_df_generator:
    filtered_tweets_df = pd.DataFrame(local_uid_series)

## Language detection

### Detected languages

- Languages possibly detected by CLD:

In [6]:
lang_with_code = dict(pycld2.LANGUAGES)
detected_lang_with_code = [(lang, lang_with_code[lang]) for lang in pycld2.DETECTED_LANGUAGES]
print(detected_lang_with_code)

[('ABKHAZIAN', 'ab'), ('AFAR', 'aa'), ('AFRIKAANS', 'af'), ('AKAN', 'ak'), ('ALBANIAN', 'sq'), ('AMHARIC', 'am'), ('ARABIC', 'ar'), ('ARMENIAN', 'hy'), ('ASSAMESE', 'as'), ('AYMARA', 'ay'), ('AZERBAIJANI', 'az'), ('BASHKIR', 'ba'), ('BASQUE', 'eu'), ('BELARUSIAN', 'be'), ('BENGALI', 'bn'), ('BIHARI', 'bh'), ('BISLAMA', 'bi'), ('BOSNIAN', 'bs'), ('BRETON', 'br'), ('BULGARIAN', 'bg'), ('BURMESE', 'my'), ('CATALAN', 'ca'), ('CEBUANO', 'ceb'), ('CHEROKEE', 'chr'), ('CORSICAN', 'co'), ('CROATIAN', 'hr'), ('CZECH', 'cs'), ('Chinese', 'zh'), ('ChineseT', 'zh-Hant'), ('DANISH', 'da'), ('DHIVEHI', 'dv'), ('DUTCH', 'nl'), ('DZONGKHA', 'dz'), ('ENGLISH', 'en'), ('ESPERANTO', 'eo'), ('ESTONIAN', 'et'), ('FAROESE', 'fo'), ('FIJIAN', 'fj'), ('FINNISH', 'fi'), ('FRENCH', 'fr'), ('FRISIAN', 'fy'), ('GALICIAN', 'gl'), ('GANDA', 'lg'), ('GEORGIAN', 'ka'), ('GERMAN', 'de'), ('GREEK', 'el'), ('GREENLANDIC', 'kl'), ('GUARANI', 'gn'), ('GUJARATI', 'gu'), ('HAITIAN_CREOLE', 'ht'), ('HAUSA', 'ha'), ('HAWAIIAN

- Languages possibly detected by Twitter (see 'lang' in https://support.gnip.com/apis/powertrack2.0/rules.html#Operators):

Amharic - am
Arabic - ar
Armenian - hy
Bengali - bn
Bulgarian - bg
Burmese - my
Chinese - zh
Czech - cs
Danish - da
Dutch - nl
English - en
Estonian - et
Finnish - fi
French - fr
Georgian - ka
German - de
Greek - el
Gujarati - gu
Haitian - ht
Hebrew - iw
Hindi - hi
Hungarian - hu
Icelandic - is
Indonesian - in
Italian - it
Japanese - ja
Kannada - kn
Khmer - km
Korean - ko
Lao - lo
Latvian - lv
Lithuanian - lt
Malayalam - ml
Maldivian - dv
Marathi - mr
Nepali - ne
Norwegian - no
Oriya - or
Panjabi - pa
Pashto - ps
Persian - fa
Polish - pl
Portuguese - pt
Romanian - ro
Russian - ru
Serbian - sr
Sindhi - sd
Sinhala - si
Slovak - sk
Slovenian - sl
Sorani Kurdish - ckb
Spanish - es
Swedish - sv
Tagalog - tl
Tamil - ta
Telugu - te
Thai - th
Tibetan - bo
Turkish - tr
Ukrainian - uk
Urdu - ur
Uyghur - ug
Vietnamese - vi
Welsh - cy

In [9]:
tweets_lang_df = text_process.lang_detect(tweets_df, text_col='text', min_nr_words=4, cld='pycld2')
tweets_lang_df.head()

Unnamed: 0,text,id,twitter_lang,place_id,coordinates,uid,created_at,source,filtered_text,cld_lang,proba
0,"Ah bueno, aprecio tanto como me ignorassssssss",5d36fe19b2ab1178560fafb6e2270197fe8e006f620861...,es,4e7c21fd2af027c6,"{'type': 'Point', 'coordinates': [-68.8684979,...",17745d10f17d7e3fc0da15997e88676c804f627eeadca4...,2015-10-05 19:43:29,Twitter for Android,"Ah bueno, aprecio tanto como me ignorassssssss",un,0
1,i miss them both #ALDUBMissingYou,412e85cce06bb37c3b0752467baac07c51dd7d93f6c7db...,en,5868d0e0749c9c47,"{'type': 'Point', 'coordinates': [121.0115236,...",28c6e28fc7cc605cac4298aea7a8592ac4f8a5155093af...,2015-10-11 23:31:42,Twitter for Android,i miss them both,en,94
3,Follow me:\nSnapchat: @c9fb9d1daebb152f5316733...,f11720130f52da86725d04d96525a2365cc1a367d68ff7...,en,99bc40068c24ecd8,"{'type': 'Point', 'coordinates': [120.5687683,...",c25018eacaf19cb4679ffbe7b6cdd26d6010c2e0e2cc2f...,2015-12-30 14:27:31,Twitter for Android,Follow me:\nSnapchat: \nCrafty Amino: Tiffany_...,en,99
4,My morning mantra: I can survive through Justi...,15745701d2d0bd8229378a8bc07b5b2576d63d72c0779e...,en,ddcca24ee29ddff2,,7dd8675d6ea158894d34ea2b98b2fe0bbc66637e1f6f53...,2015-08-31 09:39:44,Twitter for iPhone,My morning mantra: I can survive through Justi...,en,98
6,Bruno mars vai ser sempre dos meus cantores fa...,ee6e2b5b3a2e07776599ebf16f56c20daf878bf8d253cf...,pt,2808e6955aae0035,,77b119cd8928169fb7b288ab980524a9dbc30ba6b4f9ac...,2015-08-31 09:40:41,Twitter for Android,Bruno mars vai ser sempre dos meus cantores fa...,pt,98


In [57]:
cld_langs = tweets_lang_df['cld_lang'].unique()
cld_langs.sort()
print('Languages detected by cld: {}'.format(cld_langs))
twitter_langs = tweets_lang_df['twitter_lang'].unique()
twitter_langs.sort()
print('Languages detected by twitter: {}'.format(twitter_langs))

Languages detected by cld: ['af' 'ak' 'ar' 'az' 'bg' 'bi' 'br' 'bs' 'ca' 'ceb' 'co' 'crs' 'cs' 'cy'
 'da' 'de' 'el' 'en' 'eo' 'es' 'et' 'eu' 'fa' 'fi' 'fo' 'fr' 'fy' 'ga'
 'gl' 'gn' 'gv' 'ha' 'haw' 'hi' 'hmn' 'hr' 'ht' 'hu' 'ia' 'id' 'ie' 'is'
 'it' 'iw' 'ja' 'jw' 'kha' 'ko' 'la' 'lb' 'lg' 'ln' 'lv' 'mfe' 'mi' 'mk'
 'mn' 'ms' 'mt' 'na' 'ne' 'nl' 'nn' 'no' 'ny' 'oc' 'pl' 'ps' 'pt' 'rm'
 'ro' 'ru' 'rw' 'sa' 'sco' 'sd' 'si' 'sk' 'sl' 'sm' 'so' 'sq' 'sr' 'ss'
 'st' 'su' 'sv' 'sw' 'ta' 'th' 'tk' 'tl' 'tlh' 'tn' 'to' 'tr' 'tt' 'uk'
 'un' 'ur' 'uz' 'vi' 'vo' 'war' 'xh' 'xx-Qaai' 'yo' 'zh' 'zh-Hant' 'zu']
Languages detected by twitter: ['ar' 'bg' 'bs' 'cy' 'da' 'de' 'el' 'en' 'es' 'et' 'fa' 'fi' 'fr' 'hi'
 'hr' 'ht' 'hu' 'in' 'is' 'it' 'iw' 'ja' 'ko' 'lt' 'lv' 'ne' 'nl' 'no'
 'pl' 'pt' 'ro' 'ru' 'si' 'sk' 'sl' 'sr' 'sv' 'ta' 'th' 'tl' 'tr' 'uk'
 'und' 'ur' 'vi' 'zh']


In [15]:
tweets_lang_df['twitter_lang'].value_counts().head(10)

en    21494
fr    16965
de    10347
pt     4511
ar     2725
es     2332
it     2188
tr     2011
tl      572
ru      542
Name: twitter_lang, dtype: int64

In [16]:
tweets_lang_df['cld_lang'].value_counts().head(10)

en    24035
fr    13818
de     8568
un     5105
pt     4094
ar     2643
tr     1844
es     1785
it     1768
ru      496
Name: cld_lang, dtype: int64

French case, corsican is unreliably detected by CLD for French tweets, however seems pretty accurate when twitter_lang='it'

### Multilingual users

In [61]:
groupby_user_lang = tweets_lang_df.loc[tweets_lang_df['twitter_lang'] != 'und'].groupby(['uid', 'twitter_lang'])
count_tweets_by_user_lang = groupby_user_lang.size()
count_langs_by_user_df = count_tweets_by_user_lang.groupby('uid').transform('size')
multiling_users_df = count_tweets_by_user_lang.loc[count_langs_by_user_df > 1]
pd.DataFrame(multiling_users_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,0
uid,twitter_lang,Unnamed: 2_level_1
0008d1ec39afb845de445f93b30521eb882a80bafe6f7f350777d00d50010e99976c99f91f8889163bd73785d5303b62f7192c8b10ee266d2b1b62e0f1d7436d,es,4
0008d1ec39afb845de445f93b30521eb882a80bafe6f7f350777d00d50010e99976c99f91f8889163bd73785d5303b62f7192c8b10ee266d2b1b62e0f1d7436d,in,1
002c2a22f282d95caed8b751786d21a90bf8289df8e5c409055ad61d834f2784962ee260ad003558722904d89fe79f03e27035ab1b7a63e6946c197b3012f94f,es,1
002c2a22f282d95caed8b751786d21a90bf8289df8e5c409055ad61d834f2784962ee260ad003558722904d89fe79f03e27035ab1b7a63e6946c197b3012f94f,fr,17
006531d6400b3b801335a3489d8ff1d48607d045125a23698559b1308e3384ffeabf174e3c35ecd9b29cafe2de5896786683d588f3987e8d91cd3589d53b345f,de,2
...,...,...
ff6a2f8ad9bba3ae5d8712726947d25d9d3a072ff7dadc6c9774c10832fb42d9fffb459c9e3a966bd8e76700d5a3c71cb606ca60f26fcebcf1b5db2cd8ff902a,fr,77
ffd3ef8cab8414353532efb849c032dae8ffa99e99df13ac0c6691c890b2c13a96a5b55439f009a90e59cd0791a912c2c4ac41fd8b346e12db1a5d2680b16c08,en,1
ffd3ef8cab8414353532efb849c032dae8ffa99e99df13ac0c6691c890b2c13a96a5b55439f009a90e59cd0791a912c2c4ac41fd8b346e12db1a5d2680b16c08,es,1
ffe323890c48312da490ce5ec178c6f228535822940276b69ff2adb75260565e5f8692f6fa699fb9668f5c4eb2aedc8c7a966d8b7680e7dc1567dd5d91c90972,ar,10


In [62]:
pd.set_option("display.max_rows", 100)
multiling_users_list = [x[0] for x in multiling_users_df.index.values]
tweets_lang_df[tweets_lang_df['uid'].isin(multiling_users_list)].sort_values(by=['uid', 'cld_lang'])[
    ['uid', 'filtered_text', 'cld_lang', 'twitter_lang', 'created_at']]

Unnamed: 0,uid,filtered_text,cld_lang,twitter_lang,created_at
42031,0008d1ec39afb845de445f93b30521eb882a80bafe6f7f...,El tratamiento me parece rocambolesco ! Q verg...,es,es,2015-09-11 08:51:34
58838,0008d1ec39afb845de445f93b30521eb882a80bafe6f7f...,cuándo vuelve el programa??? Lo echamos d men...,es,es,2015-09-16 21:38:29
66545,0008d1ec39afb845de445f93b30521eb882a80bafe6f7f...,Salvador se merece un buen escarmiento! Ade+ e...,es,es,2015-09-17 17:31:37
66790,0008d1ec39afb845de445f93b30521eb882a80bafe6f7f...,"Me gusta la serie... No sabía quién era Rubén,...",es,es,2015-09-16 21:36:27
37117,0008d1ec39afb845de445f93b30521eb882a80bafe6f7f...,stop terapia Celia. Es de mal gusto... stop ya!,un,in,2015-09-09 22:48:31
...,...,...,...,...,...
89464,ffe323890c48312da490ce5ec178c6f228535822940276...,منظر ولقطة لقمة تتلس الثلجية بسويسرا من قمة S...,ar,ar,2015-09-21 21:24:16
89921,ffe323890c48312da490ce5ec178c6f228535822940276...,ياصباح البرررررد Iseltwald اليوم 9الحجة\n,ar,ar,2015-09-22 08:01:37
92425,ffe323890c48312da490ce5ec178c6f228535822940276...,أمطار مستمرة وبارد قارس اليوم 23 ستمبر 9 الحجة...,ar,ar,2015-09-23 11:13:22
92850,ffe323890c48312da490ce5ec178c6f228535822940276...,اليوم التاسع من الحجة 22 ستمبر 2015\nعلى السري...,ar,ar,2015-09-22 12:04:36


## Places into geodf and join on tweets

Calculate the area to discard bbox which are too large? Problem: need to project first, which is expensive

In [19]:
tweets_to_loc_df = tweets_lang_df.loc[tweets_lang_df['coordinates'].isnull()]
crs = {'init': latlon_proj}
places_df = raw_places_df[['id', 'bounding_box', 'name', 'place_type']]
geometry = places_df['bounding_box'].apply(lambda x: Polygon(x['coordinates'][0]))
places_geodf = geopd.GeoDataFrame(places_df, crs=crs, geometry=geometry)
places_geodf = places_geodf.set_index('id')
places_geodf = places_geodf.drop(columns=['bounding_box'])
places_geodf['area'] = places_geodf.geometry.to_crs(xy_proj).area
tweets_final_df = tweets_to_loc_df.join(places_geodf, on='place_id', how='left')
tweets_final_df.head(10)

Unnamed: 0,text,id,twitter_lang,place_id,coordinates,uid,created_at,source,filtered_text,cld_lang,proba,name,place_type,geometry,area
4,My morning mantra: I can survive through Justi...,15745701d2d0bd8229378a8bc07b5b2576d63d72c0779e...,en,ddcca24ee29ddff2,,7dd8675d6ea158894d34ea2b98b2fe0bbc66637e1f6f53...,2015-08-31 09:39:44,Twitter for iPhone,My morning mantra: I can survive through Justi...,en,98,Carouge (GE),city,"POLYGON ((6.12759 46.16819, 6.12759 46.19075, ...",10875630.0
6,Bruno mars vai ser sempre dos meus cantores fa...,ee6e2b5b3a2e07776599ebf16f56c20daf878bf8d253cf...,pt,2808e6955aae0035,,77b119cd8928169fb7b288ab980524a9dbc30ba6b4f9ac...,2015-08-31 09:40:41,Twitter for Android,Bruno mars vai ser sempre dos meus cantores fa...,pt,98,Biberist,city,"POLYGON ((7.51041 47.16026, 7.51041 47.20208, ...",52019160.0
10,Will the @4a4943c37848a0acd8d104b2d03007f81007...,88b2f02a833bfd273f6e2e186b50f2836de93007afcd67...,en,57d9b5e0a53e48e5,,15bc753d7d0887b255220f01689b891137cf0735c4a60d...,2015-08-31 09:40:30,Twitter for iPhone,Will the use branding on power in 2016? wo...,en,98,Kloten,city,"POLYGON ((8.54496 47.43600, 8.54496 47.48048, ...",71451260.0
11,Bence hem klip hemde şarkı müthiş olmuş 😍 #Wh...,7d68733775db5dda8f8549d7f25221090e5382c017050c...,tr,3acb748d0f1e9265,,dc7417e7114844470ba45a7526d97ab553f070e77ac3aa...,2015-08-31 09:41:13,Twitter for iPhone,Bence hem klip hemde şarkı müthiş olmuş 😍,tr,97,Zurich,city,"POLYGON ((8.44808 47.32018, 8.44808 47.43472, ...",371753900.0
12,Try our gentle yoga for seniors video series. ...,0b305e6d611fa438930dfd449fa70b381fbc2d8843f4f7...,en,3acb748d0f1e9265,,4668312884185bbe92ac73f211aa3955c98b6e6ebdf3e4...,2015-08-31 09:41:25,Twitter Web Client,Try our gentle yoga for seniors video series.,en,97,Zurich,city,"POLYGON ((8.44808 47.32018, 8.44808 47.43472, ...",371753900.0
13,"O meu irmão está sempre a dizer ""ela parte-me ...",cbd30451251830090cc489f4351907dac0450f03791fa5...,pt,2808e6955aae0035,,77b119cd8928169fb7b288ab980524a9dbc30ba6b4f9ac...,2015-08-31 09:41:42,Twitter for Android,"O meu irmão está sempre a dizer ""ela parte-me ...",pt,98,Biberist,city,"POLYGON ((7.51041 47.16026, 7.51041 47.20208, ...",52019160.0
14,Von konstruktiven SVP-Wadenbeissenden eine Lös...,4db6bac4642763e0c5f8597fd9f8a1362f33311e69021c...,de,83886ecd7407b3a1,,250df03a8d8df2be028fac3db1fffdce429c448bd8fddd...,2015-08-31 09:41:36,Twitter for iPhone,Von konstruktiven SVP-Wadenbeissenden eine Lös...,de,99,Wauwil,city,"POLYGON ((8.01290 47.17292, 8.01290 47.19561, ...",9822657.0
15,Alex il m'a prévenu qu'il venait pas cet aprem,867752243582b0eb9def702d47140335749b0e72458f51...,fr,c3a6437e1b1a726d,,4831dfbaf26fde99e19cde56729fc46fbec83c5dbe44ce...,2015-08-31 09:42:27,Twitter for iPhone,Alex il m'a prévenu qu'il venait pas cet aprem,fr,97,Geneva,city,"POLYGON ((6.11051 46.17763, 6.11051 46.23188, ...",63375260.0
18,"não sei o quê que ele quer, ligou me 3 vezes",1ca75dcf0505f975ddf8a67ef296b1f0c34386ab15124c...,pt,c2bf4772ec58dc04,,65f12da5d7d633b084114aee5371d9781b3b6a872e9d03...,2015-08-31 09:42:24,Twitter for iPhone,"não sei o quê que ele quer, ligou me 3 vezes",pt,97,La Chaux-de-Fonds,city,"POLYGON ((6.76410 47.06163, 6.76410 47.16555, ...",235982600.0
21,Jvais me retrouver 1 h toute seule,f6efec9861954be35d3e0afedd1760bd59506451f7d8b6...,fr,c3a6437e1b1a726d,,4831dfbaf26fde99e19cde56729fc46fbec83c5dbe44ce...,2015-08-31 09:42:39,Twitter for iPhone,Jvais me retrouver 1 h toute seule,fr,97,Geneva,city,"POLYGON ((6.11051 46.17763, 6.11051 46.23188, ...",63375260.0


### Corsican?

In [100]:
tweets_final_df.loc[(tweets_final_df['cld_lang'] =='co') & (tweets_final_df['twitter_lang'] =='it')]

Unnamed: 0,text,id,twitter_lang,place_id,coordinates,uid,created_at,filtered_text,lang,proba,name,place_type,geometry,area
13856,@a24bc8c3afca1a8f3ba6ff7c04540ebab404c87204284...,2ff89dc3204c58cd6a5e636bf4f3ab938d65b21f9867c6...,it,f9c4cad0af2337fa,,b142ccaa01aa65d50ffb2be88e044cc479267055319509...,2015-08-31 16:35:26,Io no. Sto a Parigi :),co,95,Île-de-France,admin,"POLYGON ((1.44652 48.12045, 1.44652 49.23392, ...",44142260000.0
23392,Sto facendo troppe figure di merda parlando fr...,df9ec98e3a7e4b1c2489ff361e125de0593d7db50a7c07...,it,23f8a07383ac617e,,a881fbd7749263a99d69de827c98d28e1d26af2f53e942...,2015-08-31 20:06:35,Sto facendo troppe figure di merda parlando fr...,co,98,Nice,city,"POLYGON ((7.18209 43.64529, 7.18209 43.76075, ...",280766700.0
58808,@3f0dfce67e864403c374b8127022c6d0b8db2d4fa459d...,aa8432747e4b716906532ba5eb98b8da3358462fedee0c...,it,3079bb84261d240f,,795cb88bf3910f3d2ab3843b3d189a787e17c2c8f6f5f3...,2015-09-01 19:43:20,"Senza scurdà ..Danielle Casanova , Fred Scam...",co,98,Florensac,city,"POLYGON ((3.41996 43.34095, 3.41996 43.42297, ...",116853300.0
66624,@3f0dfce67e864403c374b8127022c6d0b8db2d4fa459d...,6d07e5433dd84f8c4df1fce56757b1f045f58951015afc...,it,3079bb84261d240f,,795cb88bf3910f3d2ab3843b3d189a787e17c2c8f6f5f3...,2015-09-01 19:34:01,umaggiu per i risistenti corsi incuntru u fas...,co,98,Florensac,city,"POLYGON ((3.41996 43.34095, 3.41996 43.42297, ...",116853300.0
74485,è di l'avvene ùn ai paura,05a36074f12d76eac74c752020d549b1272da7e871ef2a...,it,0e8fa96911a7d663,,13bbe7ecf4af52aaa364ac37a9fb6285cde95f4a21fcc4...,2015-09-02 01:52:04,è di l'avvene ùn ai paura,co,96,Bastia,city,"POLYGON ((9.39019 42.66128, 9.39019 42.71073, ...",56000670.0
77918,@3422d7fcfaed3bb3a0db2904c0c2fec7c9da02872f5e6...,079a18d748023f7ac0491072aef7ff5e761aff6251200d...,it,2dfa071ded160a23,,b696abe7847e5e4491660ce3710912f9b4758601879b33...,2015-09-02 06:00:29,mi fà piace per tè amicu Felice.,co,97,Haute-Corse,admin,"POLYGON ((8.57318 41.83216, 8.57318 43.01156, ...",19535190000.0
82191,D'altronde la storia la scrivono gli audaci !!...,817c5035f409f92cae05883df430c243bfa62a2f2f0643...,it,0d06987b4f09169c,,a666099447ed48dbf450286c6f318888f300d32649fe07...,2015-09-02 14:27:26,D'altronde la storia la scrivono gli audaci !!...,co,98,Novalaise,city,"POLYGON ((5.72756 45.55811, 5.72756 45.61493, ...",79777040.0
84260,"Dimmi induve si, chì possu fà quì, senza tè.. ...",88d0ef405398214bba470e9141f489578c8428b1449120...,it,031d458d918a5407,,082f14b26c4b122b6c7ac75a85ad52b1172ce24975c0ac...,2015-09-02 14:16:15,"Dimmi induve si, chì possu fà quì, senza tè.. ...",co,98,Tallone,city,"POLYGON ((9.38383 42.13020, 9.38383 42.24919, ...",343172000.0
84855,I studienti chi so partuti incù a borsa #impro...,c9a48cf88677f082e9d482958f1d31129d29908c5b4b09...,it,0e603367cb002e00,,c1825ad12336127e71ee20e1166ec326f98eb7d0b912d6...,2015-09-02 14:04:32,I studienti chi so partuti incù a borsa di a ...,co,98,Corte,city,"POLYGON ((8.92659 42.20318, 8.92659 42.34494, ...",677520900.0
88360,@cc2a6523cec58ca621c892a0c7a9aa932b7b114ca4975...,c964810025858bb54370cbf1527aed27a15c268163c5a5...,it,20c4c5a3ce69f1a0,,260f8f5b558d54a2b353debf3bb527b0fd2847ac99e955...,2015-09-01 06:48:03,\nSì vero son tanti,co,95,Antibes,city,"POLYGON ((7.06447 43.54188, 7.06447 43.62279, ...",111578900.0


CLD sensitive to letter repetitions made to insist: can put threshold if more than 3 consecutive same letter, bring it down to 2, it seems to improve prediction on example

Usually twitter's prediction seems better...

In [98]:
tweets_final_df[tweets_final_df['cld_lang'] != tweets_final_df['twitter_lang']].drop(columns=['id'])

Unnamed: 0,text,twitter_lang,place_id,coordinates,filtered_text,lang,proba,name,place_type,geometry,area
103,Pfaa comme ça me gave la,fr,0701e94ee168b555,,Pfaa comme ça me gave la,sw,0.459360,Strasbourg,city,"POLYGON ((7.68816 48.49242, 7.68816 48.64619, ...",4.264696e+08
115,- serviette et tout ca blc,en,747f1deba49c4162,,- serviette et tout ca blc,fr,0.999411,Thionville,city,"POLYGON ((6.05525 49.32725, 6.05525 49.41566, ...",2.994746e+08
134,Steve Carell remplace Bruce Willis chez Woody ...,en,0701e94ee168b555,,Steve Carell remplace Bruce Willis chez Woody ...,fr,0.470754,Strasbourg,city,"POLYGON ((7.68816 48.49242, 7.68816 48.64619, ...",4.264696e+08
139,Mdr elle bisque la fille,fr,7a71b84ee2763115,,Mdr elle bisque la fille,da,0.995121,Solaro,city,"POLYGON ((9.22735 41.83216, 9.22735 41.92556, ...",2.826802e+08
140,@85ebdd6fed998d7e6f8d5f6716a8a9bf96e8385ad2309...,fr,22f8f77a10d6a008,,"ouai :(, je me renseigne...",haw,0.420369,Bonviller,city,"POLYGON ((6.47317 48.61755, 6.47317 48.64298, ...",2.469253e+07
...,...,...,...,...,...,...,...,...,...,...,...
957,Super je reprend demain 14h 17h nique.,fr,6f9ad77b2f379c86,,Super je reprend demain 14h 17h nique.,it,0.764051,Tart-le-Haut,city,"POLYGON ((5.17299 47.16792, 5.17299 47.21831, ...",5.732771e+07
965,Bonne anniv ma petite tartelette en sucre @a94...,ht,3c3d85c8cd212149,,Bonne anniv ma petite tartelette en sucre,fr,0.526238,Morez,city,"POLYGON ((5.99318 46.50860, 5.99318 46.54516, ...",5.682778e+07
971,En plus je reprend demain 💥,fr,3bee11b579567e98,,En plus je reprend demain 💥,es,0.486612,Cessey-sur-Tille,city,"POLYGON ((5.20424 47.25877, 5.20424 47.30360, ...",4.204246e+07
982,jm bien Jaebum moi,ht,716aaae1152bc3e2,,jm bien Jaebum moi,lb,0.959223,Dung,city,"POLYGON ((6.72242 47.49517, 6.72242 47.51484, ...",1.572062e+07


### Swiss German?

In [39]:
zurich_id = places_geodf.loc[places_geodf['name']=='Zurich', 'geometry'].index[0]
# places_in_zurich = places_geodf
places_in_zurich = places_geodf.loc[places_geodf.within(places_geodf.loc[zurich_id, 'geometry'])]
places_in_zurich

Unnamed: 0_level_0,name,place_type,geometry,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3acb748d0f1e9265,Zurich,city,"POLYGON ((8.44808 47.32018, 8.44808 47.43472, ...",371753900.0
5a6fe9b57e416499,Oberengstringen,city,"POLYGON ((8.45181 47.40284, 8.45181 47.42094, ...",8895118.0
7b5bcdfb5c50c957,Zurich,admin,"POLYGON ((8.44808 47.32018, 8.44808 47.43472, ...",371753900.0
936a6f77f5233bdb,Wettswil am Albis,city,"POLYGON ((8.45510 47.32337, 8.45510 47.34991, ...",14817350.0


In [48]:
tweets_in_zurich = tweets_final_df.join(places_in_zurich, on='place_id', rsuffix='_place')
print(tweets_in_zurich['cld_lang'].value_counts().head())
print(tweets_in_zurich['twitter_lang'].value_counts().head())

en    17868
fr    13266
de     7980
un     4908
pt     4193
Name: cld_lang, dtype: int64
en    17095
fr    15988
de     8607
pt     4744
ar     2534
Name: twitter_lang, dtype: int64


In [53]:
tweets_in_zurich.loc[(tweets_in_zurich['cld_lang']=='un') & (tweets_in_zurich['twitter_lang']=='de'), 
                     'filtered_text']

342                                   ich sage es dir ! 😂👌
526      Dä wo dr  Deal hett gmacht bi  hett eue im  es...
640                                   Es ist sooo heiss! 😭
683                                   ja nei ish lässig ^^
785                  Alter Onkel mit Argumenteverstopfung?
                               ...                        
99500               Andererfeiff find da ja Nüffe drin 😁😁😁
99619    Schöne Jurasüdfuss Höhentour Staffelegg (621 m...
99687      DIE=Prof.Dr.Peter Mani, Mathematikprof., ehe...
99779    Trouvaille aus der Bubenzeit: so was Kontiki-A...
99962    mach.robot.gen.aggcorperzdkb.was solldas.nojob...
Name: filtered_text, Length: 427, dtype: object

Mostly mixed languages not detected by twitter it seems:

In [55]:
tweets_in_zurich.loc[tweets_in_zurich['twitter_lang']=='und', 
                     'filtered_text']

10198        قرية في سويسرا ..😍\nLauterbrunnen , bern\n\n 
14207                           வாழ்த்துக்கள்...සුබ පැතුම්
14277    так. 32я глава манги AJIN вышла\nи у Shingeki ...
20803    Bellissime nuvole mi piacciono  molto ท้องฟ้าท...
22944         \nقريةringgenberg قُرب انترلاكن😻🇨🇭\nالاطل...
24101    SRI LANKA: எங்கள் வாழ்த்துகள் சுவிச்சர்லாந்து ...
24315    سفارة المملكة العربية السعودية في   \n Kirchen...
24641    السفارة الأردنية في  \nعنوان :\nThorackerstras...
26353                                    الحق كلو ع vienna
33888                                       вы про MGS V ?
34530                                 buy kal\nже на пикче
37923                                Вышла GM И 9.1 beta 1
42484    view from tonight balcony\nยอดเขาโลโก้ช็อกโกแล...
43055    view from tonight balcony\nยอดเขาโลโก้ช็อกโกแล...
46436        \n\n( ͡° ͜ʖ ( ͡° ͜ʖ ( ͡° ͜ʖ ( ͡° ͜ʖ ͡°) ͜ʖ...
47592     @jasmeenmanzoor @shahidmasoodd@Kashifabbasiar...
47726           ใครถามถึง rösti นะ\nแถม Matterhorn ให้ด้

## groupbys and stuff

In [233]:
def get_mean_time(df, dt_col):
    t_series_in_sec_of_day = df['hour']*3600 + df['minute']*60 + df['second']
    return pd.to_timedelta(int(t_series_in_sec_of_day.mean()), unit='s')

In [234]:
tweets_df = raw_tweets_df.copy()
# Speeds up the process to extract the hour, min and sec first
tweets_df['hour'] = tweets_df['created_at'].dt.hour
tweets_df['minute'] = tweets_df['created_at'].dt.minute
tweets_df['second'] = tweets_df['created_at'].dt.second
groupby_user_place = tweets_df.groupby(['uid', 'place_id'])
count_tweets_by_user_place = groupby_user_place.size()
count_tweets_by_user_place.rename('count', inplace=True)
mean_time_by_user_place = groupby_user_place.apply(lambda df: get_mean_time(df, 'created_at'))
mean_time_by_user_place.rename('avg time', inplace=True)
# transform to keep same size, so as to be able to have a matching boolean Series of same size as 
# original df to select users with more than one place for example:
count_places_by_user_df = count_tweets_by_user_place.groupby('uid').transform('size')
agg_data_df = pd.concat([count_tweets_by_user_place, mean_time_by_user_place], axis=1)
count_tweets_by_user_place_geodf = agg_data_df.join(places_geodf, on='place_id')
count_tweets_by_user_place_geodf.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,avg time,name,place_type,geometry,area
uid,place_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0002a7f25f7b2bb275cc8d0ea3cb5203894eb423ab2652f4654fd3de2a405c191d4e87d06326b02051096c4596d5d72b16e3783c4407a53be8ee94f441ceeaf3,3c4826d2dd591817,13,16:46:41,Plombières-lès-Dijon,city,"POLYGON ((4.90021 47.31624, 4.90021 47.35745, ...",70906520.0
0006cbf8e27bf7139818c439e9b45e2c516321897b01e3879f228bada509a1053dbb2d7e155325d6a7cd6a6308ef1687dadc76f76439385000f7bfc4a389983f,66fabed9d649aa12,5,16:10:59,Nancy,city,"POLYGON ((6.13423 48.66686, 6.13423 48.70924, ...",62404090.0
00078708d7ae2b0605b557d01f52f7ad1dbe07acb6aed5ff126c2ee59f51a056adf59850ece5d0ccbeeebf33a779394fbb19e085176655eae9fded0b563e58b4,7c3bafe4c4783291,2,19:45:41,Dijon,city,"POLYGON ((4.96247 47.28624, 4.96247 47.37759, ...",233224900.0
0009c4c1073cf34696e5618a828cc10ab824f3784d7e9124a6d83303d71de8b3d0ad379c230778d940df5f2fd1e713ee96a9c5e9b697c89195e55744e507855b,0e89c95ad54bec56,2,14:58:19,Montigny-lès-Metz,city,"POLYGON ((6.12740 49.07912, 6.12740 49.11434, ...",36413360.0
000c9887d94bd3d73e352b15df00bf2df9a137f27a8754c8493f5c1c478489b1e990667aed464690bb21707a93ff37804d2ddd082abdaf35cc99137fd3439c61,179b8df9e368044d,1,02:39:40,Lyon,city,"POLYGON ((4.77183 45.70736, 4.77183 45.80828, ...",226809500.0


In [230]:
cProfile.run("groupby_user_place.apply(lambda df: get_mean_time(df, 'created_at'))")

         67331893 function calls (65611254 primitive calls) in 35.426 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        2    0.000    0.000    0.000    0.000 <__array_function__ internals>:2(array_equal)
    17380    0.019    0.000    0.304    0.000 <__array_function__ internals>:2(prod)
   764736    0.267    0.000    0.813    0.000 <frozen importlib._bootstrap>:1009(_handle_fromlist)
   104284    0.077    0.000    0.115    0.000 <frozen importlib._bootstrap>:416(parent)
    17380    0.308    0.000   35.134    0.002 <ipython-input-227-38b608dc83f9>:1(get_mean_time)
    17380    0.032    0.000   35.166    0.002 <string>:1(<lambda>)
        1    0.000    0.000   35.426   35.426 <string>:1(<module>)
    34760    0.039    0.000    0.082    0.000 __init__.py:104(_maybe_match_name)
    69520    0.070    0.000    0.089    0.000 __init__.py:139(maybe_upcast_for_op)
    69520    0.051    0.000    0.204    0.000 __init__.py:81(

In [229]:
count_tweets_by_user_place_geodf.loc[count_places_by_user_df > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,avg time,name,place_type,geometry,area
uid,place_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00157e92cd3607845e7d81c334751af51b77a13840ad83463a32a2c34e850dfe98b3fb34984750b048fbe68e20494f156b4344ba499d9e086e5e84626448e53b,6bf52674166cad91,1,11:23:13,Biot,city,"POLYGON ((7.04648 43.60751, 7.04648 43.64688, ...",4.805592e+07
00157e92cd3607845e7d81c334751af51b77a13840ad83463a32a2c34e850dfe98b3fb34984750b048fbe68e20494f156b4344ba499d9e086e5e84626448e53b,7f6f3f7db2fab2ea,3,20:54:32,Pégomas,city,"POLYGON ((6.89762 43.56171, 6.89762 43.61552, ...",4.314310e+07
00176cdca9ecbc851a6d067a9abec9233c55864b68c8aa7634cc83bc0b64b20f9196b4470bcc5e0e8bd22454d79a9b79ff04d1070ba6ebef886331dbd7ae22d3,2dec07b880be5ba9,6,18:13:05,Villers-lès-Nancy,city,"POLYGON ((6.09053 48.64024, 6.09053 48.68053, ...",5.598284e+07
00176cdca9ecbc851a6d067a9abec9233c55864b68c8aa7634cc83bc0b64b20f9196b4470bcc5e0e8bd22454d79a9b79ff04d1070ba6ebef886331dbd7ae22d3,456b14ef60b8f9aa,8,11:35:05,Aydoilles,city,"POLYGON ((6.54694 48.18779, 6.54694 48.23691, ...",4.883710e+07
001ca5ba17e95203de0b2e64da07fa9c08008eda67069363e1ff9e64def948211a25ac6faa3ac6450c5890b23c29933a2214289e9d0ee793d43670470b33a0f8,3cbbddef3af0dad1,1,19:19:12,Couchey,city,"POLYGON ((4.90013 47.23995, 4.90013 47.27977, ...",9.130208e+07
...,...,...,...,...,...,...,...
fffb7b25e5e54942fa067d8e51e1b4d7a64ee3741b79e256c3fed5b0472232dbec0892af784aae4e3d9bb435dac2289c7340aa07e6729042b8ea5c8de970b376,60c390624fa83c29,1,19:54:45,Castelnau-le-Lez,city,"POLYGON ((3.88862 43.61844, 3.88862 43.65599, ...",3.101804e+07
fffb7b25e5e54942fa067d8e51e1b4d7a64ee3741b79e256c3fed5b0472232dbec0892af784aae4e3d9bb435dac2289c7340aa07e6729042b8ea5c8de970b376,69f0c48f684f8af6,5,18:45:41,Mende,city,"POLYGON ((3.42827 44.49386, 3.42827 44.57609, ...",1.788440e+08
fffb7b25e5e54942fa067d8e51e1b4d7a64ee3741b79e256c3fed5b0472232dbec0892af784aae4e3d9bb435dac2289c7340aa07e6729042b8ea5c8de970b376,75cf2af37b1c13b8,1,19:09:44,Nîmes,city,"POLYGON ((4.23560 43.74152, 4.23560 43.92488, ...",6.743195e+08
fffb7b25e5e54942fa067d8e51e1b4d7a64ee3741b79e256c3fed5b0472232dbec0892af784aae4e3d9bb435dac2289c7340aa07e6729042b8ea5c8de970b376,7b9c9a44a03d7413,2,18:39:40,Lentilly,city,"POLYGON ((4.63177 45.79012, 4.63177 45.84327, ...",7.075871e+07


Add new chunk to cumulative data:

In [65]:
count_tweets_by_user_place_geodf = count_tweets_by_user_place_geodf.join(
    count_tweets_by_user_place_geodf['count'], 
    on=['uid', 'place_id'], how='outer', rsuffix='_new')
count_tweets_by_user_place_geodf['count'] += count_tweets_by_user_place_geodf['count_new']
count_tweets_by_user_place_geodf.drop(columns=['count_new'], inplace=True)
count_tweets_by_user_place_geodf

Unnamed: 0_level_0,Unnamed: 1_level_0,count,name,place_type,geometry,area
uid,place_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00176cdca9ecbc851a6d067a9abec9233c55864b68c8aa7634cc83bc0b64b20f9196b4470bcc5e0e8bd22454d79a9b79ff04d1070ba6ebef886331dbd7ae22d3,456b14ef60b8f9aa,2,Aydoilles,city,"POLYGON ((6.54694 48.18779, 6.54694 48.23691, ...",48837100.0
001fa20901b392b42b165adef3adaf931c58f3174022d87d92b76a7be0b22807e2d63bbe91e1d1bec9636640606d7edf1c2676d53ae5f5bb10f4200e1d797663,0701e94ee168b555,2,Strasbourg,city,"POLYGON ((7.68816 48.49242, 7.68816 48.64619, ...",426469600.0
005a6d0c89ebbefb5435e590edd83a1a33486ca7f20c7e97526b013159c01d4ee90960e11df4bab93ea3d04cf9d6a3b129158bf753cabd2b740c53ef365a207b,746ece6688f4c9c0,2,Toulon,city,"POLYGON ((5.87956 43.10181, 5.87956 43.17147, ...",127556500.0
00e69cf87b62775a0aa5310ae1313742cf9e5a179800b0b8cb74dca6a10ea87f54e9e3c35c632d08ea00fba99b36f0c11b0de5e4d608e3ef2d3d6b7841eb6210,09f6a7707f18e0b1,2,Paris,city,"POLYGON ((2.22410 48.81552, 2.22410 48.90215, ...",401054800.0
01626b69f176678b72e776006b41cf32b3dfc93aa6c7ceae7361ac30c6f9b0bef2324528e78b4389cb1db04f3b39d56c972b0e5714a60389f6be11f581242688,0fe7b69e4bd109d8,10,Gérardmer,city,"POLYGON ((6.77617 48.02433, 6.77617 48.10761, ...",238112200.0
019ce967e7c4689f8ffee1b24e937d7a998f6a94331e04f0cfe26d32474ba8cc069602f06db3005fc4a812f94cf0a8788cdf246e5d6de28836e4c6c58535152d,39139128c33653ec,2,Hettange-Grande,city,"POLYGON ((6.12628 49.39027, 6.12628 49.44919, ...",63056670.0
02296f0a1b233bc02f4cf724e772ba861885d08a073056db355f4eaa2477a716b7eef79fae940ea87cde592e610948f48c74f209a6edda57b5bbaaa5e2e9e2ea,0244a25808ed968f,6,Metz,city,"POLYGON ((6.13569 49.06083, 6.13569 49.14879, ...",201002200.0
02430b04e42b068be330f8723309ab206233bb7d9f47f2a1330b7a5898f1c6304483fef2a3c9775d56c854af6379dd2432217c4bfe1fa9153ba2707fb1b8b3cd,0b7725facd03b922,2,Pralognan-la-Vanoise,city,"POLYGON ((6.63379 45.26847, 6.63379 45.41740, ...",511737300.0
02bc403f74fc321279a94bedacf470cd3232a08a1cf8c798115c85adb64eea2684144b28c3f950b72a1bb1d2afcf048e42c078577b0b3d16345932f57ba8559b,02037ab2746de8a3,4,Saint-Jean-Saverne,city,"POLYGON ((7.32588 48.76351, 7.32588 48.79844, ...",42375780.0
02e65f00e1b56fd4330df1c1864463fe3c237d59a0b4748af8405e01faf97359a38a9afee9a9a062172087054174a5fb7911ff49c6c448b13781df6f13e48ca1,23f8a07383ac617e,2,Nice,city,"POLYGON ((7.18209 43.64529, 7.18209 43.76075, ...",280766700.0
