In [1]:
import pandas as pd
import json

### Extracting Data

In [2]:
import csv

In [3]:
# relative path to the tab-separated tweet sample that is loaded below
data_path = '../../twitter-swisscom/sample.tsv'

In [4]:
# labels for the columns of the raw file, in file order (the file has no header row)
columns_names = [
    'id',
    'userId',
    'createdAt',
    'text',
    'longitude',
    'latitude',
    'placeId',
    'inReplyTo',
    'source',
    'truncated',
    'placeLatitude',
    'placeLongitude',
    'sourceName',
    'sourceUrl',
    'userName',
    'screenName',
    'followersCount',
    'friendsCount',
    'statusesCount',
    'userLocation',
]

# subset of columns needed to build the density map
columns_to_keep = [
    'id',
    'createdAt',
    'placeLatitude',
    'placeLongitude',
]

In [5]:
# maximum number of rows to read from the file (passed to read_csv as `nrows`);
# set to None to get all the records
num_rows = None

In [6]:
# parsing options for the raw file: tab-separated, no header row, no quoting,
# backslash as the escape character, and the token 'N' read as a missing value
read_options = dict(
    sep='\t',
    encoding='utf-8',
    escapechar='\\',
    quoting=csv.QUOTE_NONE,
    header=None,
    na_values='N',
    nrows=num_rows,
)
df_data = pd.read_csv(data_path, **read_options)

# label the columns positionally, since the file itself carries no header
df_data.columns = columns_names

In [7]:
# keep only the columns needed for the density map; take an explicit copy so that
# adding columns to the subset later does not raise pandas' SettingWithCopyWarning
df_data = df_data[columns_to_keep].copy()

In [8]:
# count the missing values in each remaining column
df_data.isna().sum()

id                0
createdAt         0
placeLatitude     0
placeLongitude    0
dtype: int64

### Parsing Date

In [9]:
import numpy as np
from datetime import datetime

In [10]:
# parse the timestamps once and derive both calendar fields from the same index
# (parsing the whole column separately for each field doubles the work)
created_at = pd.DatetimeIndex(df_data['createdAt'])
df_data['year'] = created_at.year
df_data['month'] = created_at.month

In [11]:
def convert_to_unix_time(record):
    """Return the Unix time, in milliseconds, of the first day of the record's month.

    Parameters
    ----------
    record : mapping (e.g. a DataFrame row)
        Must provide integer-valued 'year' and 'month' entries.

    Returns
    -------
    int
        Milliseconds between 1970-01-01 00:00 and midnight on the first day of
        the given month (the date is timezone-naive).
    """
    # int() keeps this working when the row values arrive as floats, which
    # happens when DataFrame.apply upcasts an all-numeric row
    month_start = pd.Timestamp(year=int(record['year']), month=int(record['month']), day=1)

    # Timestamp.value is expressed in nanoseconds whatever resolution pandas uses
    # internally, so this division always yields milliseconds
    return month_start.value // 10**6

In [12]:
# first day of each tweet's month, assembled from the year/month columns in one
# vectorized call instead of one Python function call per row
month_start = pd.to_datetime(df_data[['year', 'month']].assign(day=1))

# milliseconds since the Unix epoch; dividing the time difference by a 1 ms
# Timedelta does not depend on the datetime resolution pandas uses internally
df_data['unix_time'] = (month_start - pd.Timestamp('1970-01-01')) // pd.Timedelta(milliseconds=1)

In [13]:
# the raw timestamp and its helper fields are not needed once unix_time exists
df_data = df_data.drop(columns=['createdAt', 'year', 'month'])

### Getting Geolocation

In [14]:
from shapely.geometry import Point, shape

In [25]:
# read the GeoJSON file holding the canton geometries and IDs
path_cantons = '../res/topo/ch-cantons-geo.json'

# JSON text is UTF-8: state it explicitly instead of relying on the platform's
# default encoding
with open(path_cantons, encoding='utf-8') as file:
    cantons_json = json.load(file)

In [16]:
def get_canton_id(record):
    """Return the ID of the canton whose geometry contains the record's coordinates.

    Parameters
    ----------
    record : mapping (e.g. a DataFrame row)
        Must provide 'placeLongitude' and 'placeLatitude'.

    Returns
    -------
    The 'id' property of the first feature of the module-level ``cantons_json``
    whose Polygon or MultiPolygon geometry contains the point, or None when no
    feature does. Features with any other geometry type are ignored.
    """
    # shapely points are built as (x, y), i.e. (longitude, latitude)
    point = Point(record['placeLongitude'], record['placeLatitude'])

    for feature in cantons_json['features']:
        geometry_type = feature['geometry']['type']
        if geometry_type not in ('Polygon', 'MultiPolygon'):
            continue

        geometry = shape(feature['geometry'])
        # a MultiPolygon is not directly iterable in Shapely 2.0; its member
        # polygons are reached through .geoms
        polygons = geometry.geoms if geometry_type == 'MultiPolygon' else [geometry]
        if any(polygon.contains(point) for polygon in polygons):
            return feature['properties']['id']

    return None

In [17]:
# look up the canton ID of every record (missing when no canton contains it)
canton_ids = df_data.apply(get_canton_id, axis=1)
df_data['canton_id'] = canton_ids

In [18]:
# records for which no canton was found
outside_ch_count = df_data['canton_id'].isna().sum()
print('number of tweets which location is not in CH:', outside_ch_count)

number of tweets which location is not in CH: 2737


In [19]:
# keep only the records that were matched to a canton
df_data = df_data.dropna(subset=['canton_id'])

### Grouping Tweets by Month and Canton/Town

In [20]:
# number of tweets for every (unix_time, canton_id) pair; unix_time is the
# timestamp of the first day of the tweet's month
grouped_year_canton = df_data.groupby(['unix_time', 'canton_id']).size()

In [21]:
# display the tweet counts per (unix_time, canton_id)
grouped_year_canton

unix_time      canton_id
1472688000000  1.0          1265
               2.0           525
               3.0           372
               4.0             5
               5.0            31
               6.0           116
               7.0            12
               8.0             2
               9.0            47
               10.0          139
               11.0           57
               12.0          218
               13.0           57
               14.0           12
               15.0            2
               16.0           17
               17.0          153
               18.0           76
               19.0          238
               20.0           70
               21.0          269
               22.0          722
               23.0          236
               24.0          170
               25.0         1233
               26.0            9
dtype: int64

In [22]:
# TODO: also group the tweets by town (needs a 'town' column, which this notebook does not create so far)
# grouped_year_town = df_data.groupby(['unix_time', 'town']).size()

### Creating JSON Files

In [23]:
def create_json_file(grouped_dataframe, output_filename):
    """Write the tweet counts per date and canton to a JSON file and return them.

    Parameters
    ----------
    grouped_dataframe : pandas.Series
        Counts indexed by a two-level MultiIndex: date (level 0) and canton ID
        (level 1).
    output_filename : str
        Path of the JSON file to write.

    Returns
    -------
    dict
        ``{'cantons': [{'date': int, 'data': [{'id': int, 'nbr': int}, ...]}, ...]}``
        with one entry per date and, inside each, one entry per canton. A
        (date, canton) pair that is absent from the input gets ``'nbr': 0``.
    """
    dates_list = list(grouped_dataframe.index.levels[0])
    cantons_list = list(grouped_dataframe.index.levels[1])

    # take the counts from the argument rather than from a notebook global, so
    # the function works for whatever grouping is passed in
    counts = grouped_dataframe.to_dict()

    cantons = {
        'cantons': [
            {
                'date': int(date),
                'data': [
                    # not every canton has tweets for every date: default to 0
                    # instead of failing with a KeyError
                    {'id': int(canton), 'nbr': int(counts.get((date, canton), 0))}
                    for canton in cantons_list
                ],
            }
            for date in dates_list
        ]
    }

    with open(output_filename, 'w') as file:
        json.dump(cantons, file)

    return cantons

In [24]:
# store the result under its own name: rebinding `cantons_json` would overwrite
# the GeoJSON features that get_canton_id reads and break any re-run of that cell
canton_density = create_json_file(grouped_year_canton, '../res/density/canton_density.json')