In [None]:
# Load IPython's autoreload extension in mode 2, so that imported modules (such as
# the project's `src` package) are reloaded before each cell is executed and edits
# to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import cld3
import pycld2

In [None]:
# Smoke test: run both language detectors on the same short, informal phrase and
# print what each one returns (pycld2 first, then cld3).
test_str = "ola k pasa tio"
for detect_language in (pycld2.detect, cld3.get_language):
    print(detect_language(test_str))

From cld2: `pycld2.detect` returns `(isReliable, textBytesFound, details)`, where `details` is a tuple of `(language, language code, proba, score)` tuples, one per detected language.

From cld3: `is_reliable` is True if the probability is > 0.5 for Bosnian and Croatian, and > 0.7 otherwise (see https://github.com/bsolomon1124/pycld3/blob/master/src/nnet_language_identifier.cc#L100). `proportion` is the proportion of bytes which are assigned to the language.

In [None]:
import paramiko
import os
import json
import cProfile
import pandas as pd
import geopandas as geopd
import numpy as np
from pyproj import Transformer
from shapely.geometry import Polygon
from shapely.geometry import Point
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
import src.utils.geometry as geo
import src.data.shp_extract as shp_extract
import src.data.tweets_cells_counts as tweets_counts
import src.visualization.grid_viz as grid_viz
from dotenv import load_dotenv
# Load variables from a local .env file into the environment (python-dotenv), so
# that later os.environ lookups can find them.
load_dotenv()

# Coordinate reference systems, as EPSG codes: 4326 is WGS 84 latitude/longitude,
# 3857 is the Web Mercator projection (planar x/y coordinates).
latlon_proj = 'epsg:4326'
xy_proj = 'epsg:3857'


## Read remote json file and put chunks as df

In [None]:
# The SSH user name and host are taken from environment variables, so they are not
# written in the notebook. A missing variable raises KeyError.
ssh_username, ssh_domain = (
    os.environ[var_name] for var_name in ('IFISC_USERNAME', 'IFISC_DOMAIN'))

In [None]:
# No password is passed to connect(): authentication relies on an SSH key already
# set up on this machine (the author's public key was in /home/.ssh/id_rsa.pub).
path = '/data/social/twitter/europe/201701_with_place/tweets_europe_201701_place.json'
with paramiko.client.SSHClient() as ssh_client:
    ssh_client.load_system_host_keys()
    ssh_client.connect(ssh_domain, username=ssh_username)
    # Open the SFTP session as a context manager so that it is closed explicitly,
    # even if reading the remote file fails.
    with ssh_client.open_sftp() as sftp_client:
        with sftp_client.file(path, mode='r') as f:
            # lines=True: the file holds one JSON object per line. chunksize makes
            # read_json return an iterator of DataFrames of at most 100 rows each.
            chunks = pd.read_json(f, lines=True, chunksize=100)
            # chunks is an iterator, not an indexed container: its elements can only
            # be obtained with next() or in a for loop. Only the first chunk is read
            # here, while the remote file is still open.
            df = next(chunks)
df.head()

# try cld on these

## Retrieve relevant cities' shapes

### United States

In [None]:
# Settings for the US areas; most of them are passed to
# shp_extract.get_cities_geometry in the next cell.

# Coordinate reference systems, as EPSG codes.
latlon_proj, xy_proj = 'epsg:4326', 'epsg:3857'

# Paths of the population table (CSV) and of the shapefile.
external_data_dir = '../data/external/'
shapefile_name = 'cb_2016_us_cbsa_500k'
american_areas_data_file = os.path.join(external_data_dir, 'cbsa-est2017-alldata.csv')
shapefile_path = os.path.join(
    external_data_dir, shapefile_name, '{}.shp'.format(shapefile_name))

# Column names: those of the table, then the id column of the table and the id
# column of the shapefile.
init_cols = ['CBSA', 'LSAD', 'CENSUS2010POP', 'NAME']
data_id_col, shp_id_col = 'CBSA', 'CBSAFP'

# Filters, each mapping a data frame to a boolean mask: 2010 census population
# above min_pop, and LSAD equal to one of the statistical area types in msa.
min_pop = 1400000
msa = ("Micropolitan Statistical Area", "Metropolitan Statistical Area")
filters = [
    lambda df: df['CENSUS2010POP'] > min_pop,
    lambda df: df['LSAD'].isin(msa),
]
# 2062  (NOTE(review): unexplained number left by the author -- confirm its meaning)

In [None]:
# Call shp_extract.get_cities_geometry with the US settings defined above and show
# the first rows. Judging by its name it returns the selected areas with their
# geometry -- confirm in src/data/shp_extract.py.
final_area_df = shp_extract.get_cities_geometry(
    american_areas_data_file,
    shapefile_path,
    filters,
    init_cols,
    data_id_col,
    shp_id_col,
    final_cols=['CBSA', 'NAME'],
    csv_engine='python',
)
final_area_df.head()

### Spain

In [None]:
# Settings for the Spanish municipalities; most of them are passed to
# shp_extract.get_cities_geometry in the next cell.

# Coordinate reference systems, as EPSG codes.
latlon_proj, xy_proj = 'epsg:4326', 'epsg:3857'

# Paths of the population table (CSV) and of the shapefile.
external_data_dir = '../data/external'
shapefile_name = 'Municipios_IGN'
spanish_areas_data_file = os.path.join(
    external_data_dir, 'Poblacion_total_por_municipios._Padron_2015.csv')
shapefile_path = os.path.join(
    external_data_dir, shapefile_name, '{}.shp'.format(shapefile_name))

# Column names: those of the table, then the id column of the table and the id
# column of the shapefile.
init_cols = ['Codigo', 'Texto', 'Poblacion']
data_id_col, shp_id_col = 'Codigo', 'CODIGOINE'

# Single filter, mapping a data frame to a boolean mask: population above min_pop.
min_pop = 500000
filters = [
    lambda df: df['Poblacion'] > min_pop,
]

In [None]:
# Call shp_extract.get_cities_geometry with the Spanish settings defined above and
# show the first rows. Judging by its name it returns the selected municipalities
# with their geometry -- confirm in src/data/shp_extract.py.
spain_area_df = shp_extract.get_cities_geometry(
    spanish_areas_data_file,
    shapefile_path,
    filters,
    init_cols,
    data_id_col,
    shp_id_col,
)
spain_area_df.head()

## Create the grid

In [None]:
# Keep only the row(s) of the Spanish areas table whose 'Texto' is 'Barcelona'.
barcelona_mask = spain_area_df['Texto'] == 'Barcelona'
barcelona_shape_df = spain_area_df.loc[barcelona_mask]

# Cell size handed to geo.create_grid (presumably in the units of xy_proj, i.e.
# metres -- TODO confirm in src/utils/geometry.py).
cell_size = 1000
cells_df, cells_in_bcn_df = geo.create_grid(
    barcelona_shape_df, cell_size, latlon_proj, xy_proj, intersect=True)

# info() prints its summary itself and returns None, so it is called directly:
# wrapping it in print() would add a spurious "None" line to the output.
cells_df.info()
cells_in_bcn_df.info()

In [None]:
from shapely.geometry import MultiPolygon

# Gather the first column of every row of cells_df (presumably the cell polygon
# -- TODO confirm) into a single MultiPolygon.
a = MultiPolygon([cell[0] for cell in cells_df.values])
# List the parts through `.geoms`: iterating a multi-part geometry directly was
# deprecated in Shapely 1.8 and removed in Shapely 2.0, whereas `.geoms` works
# in both.
list(a.geoms)

## Deal with tweets

### Small example with the first few tweets

In [None]:
raw_data_dir = '../data/raw/'
first_tweets_path = os.path.join(raw_data_dir, 'geo_unique_Barcelona_2014_2017.json')
with open(first_tweets_path) as f:
    # Each line of the file is a separate JSON object (lines=True). Because the file
    # is large it is read in chunks of 100 lines (chunksize), and only the first
    # chunk is kept for this small example.
    chunks = pd.read_json(f, lines=True, chunksize=100)
    tweets_df = next(chunks)

# Build a shapely Point from each entry of the 'coordinates' column and use these
# points as the geometry of a GeoDataFrame in the lat/lon reference system.
geometry = tweets_df['coordinates'].apply(Point)
# NOTE(review): the {'init': ...} form is deprecated in recent pyproj/geopandas;
# kept unchanged here -- confirm which CRS form the cells frame uses before changing.
crs = dict(init=latlon_proj)
tweets_gdf = geopd.GeoDataFrame(tweets_df, crs=crs, geometry=geometry)
tweets_gdf.head()

In [None]:
# Spatial join keeping the tweets that lie within a cell; 'cell' is the suffix given
# to the columns coming from the cells frame, hence the 'index_cell' column.
tweets_within_cells = geopd.sjoin(
    tweets_gdf, cells_in_bcn_df, op='within', rsuffix='cell')
# Number of joined rows for each cell index, as a Series named 'count'.
cell_tweet_counts = (
    tweets_within_cells
    .groupby(['index_cell'])
    .size()
    .rename('count')
)
cell_tweet_counts.head()

In [None]:
# Draw the grid with grid_viz.plot_grid, passing it the cells inside Barcelona, the
# per-cell counts computed from the first chunk of tweets, and the Barcelona shape.
grid_viz.plot_grid(cells_in_bcn_df, cell_tweet_counts, barcelona_shape_df)

### Loop over all tweets

In [None]:
# Location of the Barcelona tweets file.
raw_data_dir = '../data/raw/'
tweet_data_file = 'geo_unique_Barcelona_2014_2017.json'
tweet_file_path = os.path.join(raw_data_dir, tweet_data_file)

# Names of the tweet fields used in the counting functions below.
tweet_cols = ['id', 'uid', 'created_at', 'coordinates']
# dtype of each of those fields, passed to tweets_counts.get_counts.
dtype_dict = {'id': 'int', 'uid':'int', 'created_at': 'datetime64[s]', 'coordinates':'object'}
# Both settings describe the same fields, so they must stay in sync.
assert set(dtype_dict) == set(tweet_cols)

In [None]:
# Run tweets_counts.get_counts on the tweets file with the cells inside Barcelona;
# the result (presumably the number of tweets per cell -- confirm in
# src/data/tweets_cells_counts.py) replaces the counts from the small example.
cell_tweet_counts = tweets_counts.get_counts(tweet_file_path, cells_in_bcn_df, dtype_dict=dtype_dict)

In [None]:
# 243 sec
# Profile get_counts. cProfile.run executes the statement in the __main__
# namespace, which is where the notebook's variables live, so they are referenced
# by name instead of being formatted into the string. The statement must have
# balanced parentheses, otherwise it fails with a SyntaxError before any profiling.
cProfile.run(
    "tweets_counts.get_counts(tweet_file_path, cells_in_bcn_df, dtype_dict=dtype_dict)")

In [None]:
# 375 sec
# Profile get_counts_primitive: the statement is built first (the file path is
# inserted as a string literal, the other two arguments as variable names) and then
# handed to cProfile.run.
profiled_statement = "tweets_counts.get_counts_primitive('{}', {}, {})".format(
    tweet_file_path, 'cells_in_bcn_df', 'tweet_cols')
cProfile.run(profiled_statement)

In [None]:
# Draw the grid again with grid_viz.plot_grid, this time with the counts returned
# by tweets_counts.get_counts for the tweets file.
grid_viz.plot_grid(cells_in_bcn_df, cell_tweet_counts, barcelona_shape_df)