In [83]:
import pandas as pd
import json

### Extracting Data

In [84]:
import csv

In [85]:
# Location of the tab-separated tweet sample, relative to the notebook's working directory
data_path = '../../twitter-swisscom/sample.tsv'

In [86]:
# Labels for the columns of the header-less data file; they are assigned
# positionally to the loaded columns, so the order matters.
columns_names = [
    'id',
    'userId',
    'createdAt',
    'text',
    'longitude',
    'latitude',
    'placeId',
    'inReplyTo',
    'source',
    'truncated',
    'placeLatitude',
    'placeLongitude',
    'sourceName',
    'sourceUrl',
    'userName',
    'screenName',
    'followersCount',
    'friendsCount',
    'statusesCount',
    'userLocation',
]

# Subset of the labels above that the density map actually needs
columns_to_keep = [
    'id',
    'createdAt',
    'placeLatitude',
    'placeLongitude',
]

In [87]:
# Maximum number of rows to read from the data file (forwarded to `nrows` of pd.read_csv);
# set to None to get all the records
num_rows = None

In [88]:
# Parsing options for the raw file: tab-delimited, no header row, backslash as
# escape character with quoting disabled, and 'N' read as a missing value.
read_options = {
    'sep': '\t',
    'encoding': 'utf-8',
    'escapechar': '\\',
    'quoting': csv.QUOTE_NONE,
    'header': None,
    'na_values': 'N',
    'nrows': num_rows,
}
df_data = pd.read_csv(data_path, **read_options)

# the file carries no header, so label the columns ourselves
df_data.columns = columns_names

In [89]:
# Keep only the columns needed for the density map. The explicit copy yields an
# independent frame, so the column assignments made further down do not act on a
# selection of the full table (which pandas flags with SettingWithCopyWarning).
df_data = df_data[columns_to_keep].copy()

In [90]:
# make sure all tweets are geolocated: discard rows lacking either place coordinate
# (rebinding the name instead of using inplace=True avoids mutating the frame in place)
df_data = df_data.dropna(subset=['placeLatitude', 'placeLongitude'])

### Parsing Date

In [91]:
import numpy as np
from datetime import datetime

In [92]:
# Parse the creation timestamps once and reuse the result for both fields
created_at_index = pd.DatetimeIndex(df_data['createdAt'])
df_data['year'] = created_at_index.year
df_data['month'] = created_at_index.month

In [93]:
def convert_to_unix_time(record):
    """Return the Unix time, in milliseconds, of the first day of the record's month.

    The day of month and the time of day are deliberately discarded, so every
    record from the same calendar month maps to the same value.

    Parameters
    ----------
    record : mapping (e.g. a DataFrame row)
        Must provide integer-like 'year' and 'month' entries.

    Returns
    -------
    numpy.int64
        Milliseconds elapsed between 1970-01-01 00:00 and midnight on the first
        day of that month (the naive datetime is treated as UTC).
    """
    month_start = datetime(int(record['year']), int(record['month']), 1)
    # Midnight on the 1st never has a sub-day part, so counting whole days since
    # the epoch is exact. This avoids casting a DatetimeIndex to integers, whose
    # unit depends on the index resolution and is not guaranteed to be
    # nanoseconds.
    days_since_epoch = (month_start - datetime(1970, 1, 1)).days
    return np.int64(days_since_epoch * 24 * 60 * 60 * 1000)

In [94]:
# Row by row, derive the month-start timestamp from the record's 'year' and 'month' columns
df_data['unix_time'] = df_data.apply(convert_to_unix_time, axis=1)

In [95]:
# The month-level timestamp replaces the raw date fields, which are no longer needed
# (rebinding the name instead of using inplace=True avoids mutating the frame in place)
df_data = df_data.drop(['createdAt', 'year', 'month'], axis=1)

### Getting Geolocation

In [96]:
from shapely.geometry import Point, shape

In [97]:
# get the ID of cantons or municipalities, depending on the geoJSON file passed as an argument
def get_id_from_geoJSON(record, filename):
    """Return the ID of the first geoJSON feature whose area contains the record's location.

    Parameters
    ----------
    record : mapping (e.g. a DataFrame row)
        Must provide 'placeLongitude' and 'placeLatitude'.
    filename : dict
        Despite its name, the already-parsed geoJSON document (not a path): a
        mapping with a 'features' list, each feature having a 'geometry' and a
        'properties' entry holding an 'id'.

    Returns
    -------
    The 'id' property of the first matching feature, or None when no Polygon or
    MultiPolygon feature contains the point.
    """
    # shapely points are (x, y), i.e. (longitude, latitude)
    point = Point(record['placeLongitude'], record['placeLatitude'])

    for feature in filename['features']:
        geometry_type = feature['geometry']['type']
        if geometry_type not in ('Polygon', 'MultiPolygon'):
            # only areal geometries are tested; every other type is skipped
            continue

        geometry = shape(feature['geometry'])
        # Multi-part geometries are not directly iterable in Shapely 2.x; their
        # members are exposed through `.geoms`, which Shapely 1.x provides too.
        if geometry_type == 'MultiPolygon':
            polygons = geometry.geoms
        else:
            polygons = [geometry]

        if any(polygon.contains(point) for polygon in polygons):
            return feature['properties']['id']

    return None

In [98]:
# read geoJSON file with canton names and IDs
path_cantons = '../res/topo/ch-cantons-geo.json'

# geoJSON is UTF-8 encoded; state it explicitly rather than relying on the platform default
with open(path_cantons, encoding='utf-8') as file:
    cantons_json = json.load(file)

In [99]:
# for each record, get the ID corresponding to the canton
# (records whose point lies in no canton feature get a missing value here)
df_data['canton_id'] = df_data.apply(get_id_from_geoJSON, args=(cantons_json,), axis=1)

In [100]:
# Count the records for which no canton was found
outside_ch_count = df_data['canton_id'].isnull().sum()
print('number of tweets which location is not in CH:', outside_ch_count)

number of tweets which location is not in CH: 2737


In [101]:
# drop records where the location didn't correspond to a place in CH
# (rebinding the name instead of using inplace=True avoids mutating the frame in place)
df_data = df_data.dropna(subset=['canton_id'])

In [102]:
# read geoJSON file with municipalities names and IDs
path_towns = '../res/topo/ch-municipalities-geo.json'

# geoJSON is UTF-8 encoded; state it explicitly rather than relying on the platform default
with open(path_towns, encoding='utf-8') as file:
    towns_json = json.load(file)

In [103]:
# for each record, get the ID corresponding to the municipality
# (a record whose point lies in no municipality feature gets a missing value here)
df_data['town_id'] = df_data.apply(get_id_from_geoJSON, args=(towns_json,), axis=1)

### Grouping Tweets by Month and Canton/Municipality

In [104]:
# number of tweets for each (unix_time, canton_id) combination present in the data;
# note that 'unix_time' identifies a month (first day of the month), not a whole year
grouped_year_canton = df_data.groupby(['unix_time', 'canton_id']).size()

In [105]:
# number of tweets for each (unix_time, town_id) combination present in the data;
# note that 'unix_time' identifies a month (first day of the month), not a whole year
grouped_year_town = df_data.groupby(['unix_time', 'town_id']).size()

### Creating JSON Files

In [106]:
def create_json_file(canton_municipality, grouped_dataframe, output_filename):
    """Write per-date, per-region tweet counts to a JSON file.

    Parameters
    ----------
    canton_municipality : str
        'c' stores the result under the key 'cantons'; any other value stores
        it under 'municipalities'.
    grouped_dataframe : pandas.Series
        Counts indexed by a two-level MultiIndex (date, region ID), such as the
        result of ``groupby([...]).size()``.
    output_filename : str
        Path of the JSON file to create or overwrite.

    The file has the structure
    ``{<key>: [{'date': int, 'data': [{'id': int, 'nbr': int}, ...]}, ...]}``
    with one 'data' entry per region ID for every date.
    """
    dates_list = list(grouped_dataframe.index.levels[0])
    ids_list = list(grouped_dataframe.index.levels[1])

    main_object = 'cantons' if canton_municipality == 'c' else 'municipalities'

    # The grouped counts only hold the (date, ID) pairs that actually occur.
    # Indexing the series with every pair of the full date x ID grid would raise
    # a KeyError for the absent ones, so those are reported with a count of 0.
    counts = grouped_dataframe.to_dict()

    json_file = {
        main_object: [
            {
                'date': int(date),
                'data': [
                    {'id': int(region_id), 'nbr': int(counts.get((date, region_id), 0))}
                    for region_id in ids_list
                ],
            }
            for date in dates_list
        ]
    }

    with open(output_filename, 'w') as file:
        json.dump(json_file, file)

In [107]:
# write the per-canton counts (stored under the key 'cantons')
create_json_file('c', grouped_year_canton, '../res/density/canton_density.json')

In [108]:
# write the per-municipality counts (stored under the key 'municipalities')
create_json_file('m', grouped_year_town, '../res/density/municipality_density.json')