In [None]:
import pandas as pd

### Extracting Data

In [None]:
import csv

In [None]:
# Location of the tab-separated input file, relative to the working directory
data_path = '../twitter-swisscom/sample.tsv'

In [None]:
# Labels assigned, in order, to the columns of the header-less input file
columns_names = [
    'id', 'userId', 'createdAt', 'text',
    'longitude', 'latitude', 'placeId', 'inReplyTo',
    'source', 'truncated', 'placeLatitude', 'placeLongitude',
    'sourceName', 'sourceUrl', 'userName', 'screenName',
    'followersCount', 'friendsCount', 'statusesCount', 'userLocation',
]

# the columns that interest us for the density map
columns_to_keep = [
    'id',
    'createdAt',
    'placeLatitude',
    'placeLongitude',
]

In [None]:
# Maximum number of records read from the input file;
# set to None to get all the records
num_rows = 500

In [None]:
# Load the raw file: tab-delimited, no header row, backslash as escape
# character and no quote handling; the string 'N' is read as a missing value.
df_data = pd.read_csv(
    data_path,
    header=None,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
    encoding='utf-8',
    na_values='N',
    nrows=num_rows,
)

# give labels to the columns
df_data.columns = columns_names

In [None]:
# Keep only the columns needed for the density map. The explicit copy makes
# df_data an independent frame, so the columns added to it in later cells do
# not trigger pandas' SettingWithCopyWarning.
df_data = df_data[columns_to_keep].copy()

In [None]:
# check if we have NaNs in any column: number of missing values per column
df_data.isnull().sum()

### Parsing Date

In [None]:
import numpy as np
from datetime import datetime

In [None]:
# Parse the creation timestamps and keep their year and month components
created_at_index = pd.DatetimeIndex(df_data['createdAt'])
df_data['year'] = created_at_index.year
df_data['month'] = created_at_index.month

In [None]:
def convert_to_unix_time(record):
    """Return the Unix time, in milliseconds, of the first day of a record's month.

    Parameters
    ----------
    record : mapping-like (for instance a DataFrame row)
        Must provide integer-valued 'year' and 'month' entries.

    Returns
    -------
    numpy.int64
        Milliseconds between 1970-01-01 00:00:00 and midnight on the first
        day of the given month (the timestamp is timezone-naive).
    """
    month_start = pd.Timestamp(year=int(record['year']), month=int(record['month']), day=1)

    # Timestamp.value is always expressed in nanoseconds, whereas the integer
    # view of a DatetimeIndex follows the index's own resolution, which is not
    # guaranteed to be nanoseconds in every pandas version. Dividing the
    # nanosecond value keeps the result in milliseconds in all cases.
    return np.int64(month_start.value // 10**6)

In [None]:
# Row by row, turn each record's year and month into a single timestamp
df_data['unix_time'] = df_data.apply(convert_to_unix_time, axis='columns')

In [None]:
# The raw date and its two helper columns are not needed past this point
df_data.drop(labels=['createdAt', 'year', 'month'], axis='columns', inplace=True)

### Getting Geolocation

In [None]:
import re
from geopy.geocoders import Nominatim

In [None]:
def get_locations(dataframe, geolocator=None):
    """Reverse-geocode every record and label the Swiss ones with canton and town.

    For each row, the ('placeLatitude', 'placeLongitude') pair is passed to
    ``geolocator.reverse``. When the returned address has country code "ch",
    the row's 'canton' is set to the address 'state' and its 'town' to the
    first of 'city', 'town' or 'village' present in the address. Rows that are
    outside CH, or for which no usable address is returned, keep a missing
    value in both columns. An address field that is absent yields a missing
    value rather than an error.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'placeLatitude' and 'placeLongitude' columns. It is
        modified in place: 'canton' and 'town' columns are added if absent.
    geolocator : object, optional
        Any object with a ``reverse((lat, lon))`` method returning either None
        or an object whose ``raw`` attribute is a dict. Defaults to a geopy
        Nominatim geocoder.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in.
    """
    if geolocator is None:
        # Recent geopy releases refuse to build a Nominatim geocoder without
        # an explicit user agent.
        geolocator = Nominatim(user_agent='twitter-swisscom-density-map')

    # Create the output columns up front so that they exist even when no
    # record at all is located in CH.
    for column in ('canton', 'town'):
        if column not in dataframe.columns:
            dataframe[column] = None

    num_non_ch = 0

    for index, row in dataframe.iterrows():
        location = geolocator.reverse((row['placeLatitude'], row['placeLongitude']))

        # The geocoder may return nothing, or a payload without an address.
        address = {}
        if location is not None:
            address = location.raw.get('address') or {}

        if address.get('country_code') != "ch":
            num_non_ch += 1
            continue

        dataframe.at[index, 'canton'] = address.get('state')

        # Same precedence as before: city, then town, then village.
        town = None
        for key in ('city', 'town', 'village'):
            if key in address:
                town = address[key]
                break
        dataframe.at[index, 'town'] = town

    print('number of tweets which location is not in CH:', num_non_ch)

    return dataframe

In [None]:
# Reverse-geocode every record (one geolocator request per row)
df_data = get_locations(df_data)

In [None]:
# drop the records without a canton, i.e. those where the location
# didn't correspond to a place in CH
df_data.dropna(subset=['canton'], inplace=True)

In [None]:
# read mapping between canton names and IDs into a Series
# (get_canton_id reads the names from its index and the IDs from its values)
cantons_ids = pd.read_json(path_or_buf='cantons_mapping.json', orient='records', typ='series')

In [None]:
def get_canton_id(record):
    """Look up the ID of the canton named in ``record['canton']``.

    The canton label is split on ' - ' and on single spaces. Each resulting
    part is compared, in order, against every name in the index of the
    module-level ``cantons_ids`` mapping; the ID of the first name that
    contains the part as a substring is returned.

    Returns None when no part matches any name.
    """
    name_parts = re.split(' - | ', record['canton'])

    # Lazily enumerate matches in (part, known name) order; only the first
    # one is ever evaluated.
    matching_ids = (
        cantons_ids[known_name]
        for part in name_parts
        for known_name in cantons_ids.index
        if part in known_name
    )
    return next(matching_ids, None)

In [None]:
# for each record, get the ID corresponding to the canton
# (the lookup is applied row by row)
df_data['canton_id'] = df_data.apply(get_canton_id, axis='columns')

### Grouping Tweets by Month and Canton/Town

In [None]:
# Number of records for every (unix_time, canton_id) pair present in the data
grouped_year_canton = df_data.groupby(['unix_time', 'canton_id']).size()

In [None]:
# Display the per-canton counts
grouped_year_canton

In [None]:
# Number of records for every (unix_time, town) pair present in the data
grouped_year_town = df_data.groupby(['unix_time', 'town']).size()

### Creating JSON Files

In [None]:
import json

In [None]:
def create_json_file(grouped_dataframe, output_filename):
    """Serialise grouped counts to a JSON file and return the written structure.

    Parameters
    ----------
    grouped_dataframe : pandas.Series
        Counts indexed by a two-level MultiIndex: level 0 holds the dates and
        level 1 the canton IDs. Both levels must be convertible with ``int``.
    output_filename : str
        Path of the JSON file to write (overwritten if it exists).

    Returns
    -------
    dict
        ``{'cantons': [{'date': <int>, 'data': [{'id': <int>, 'nbr': <int>},
        ...]}, ...]}`` with one entry per date and, inside each, one entry
        per canton ID. A (date, canton) pair that is absent from
        ``grouped_dataframe`` is reported with ``'nbr': 0``.
    """
    dates_list = list(grouped_dataframe.index.levels[0])
    cantons_list = list(grouped_dataframe.index.levels[1])

    # Plain dict keyed by (date, canton) tuples: the index levels list every
    # date and every canton, but the counts only exist for observed pairs.
    counts = grouped_dataframe.to_dict()

    cantons = {'cantons': []}

    for date in dates_list:
        data = [
            {'id': int(canton), 'nbr': int(counts.get((date, canton), 0))}
            for canton in cantons_list
        ]
        cantons['cantons'].append({'date': int(date), 'data': data})

    with open(output_filename, 'w') as file:
        json.dump(cantons, file)

    return cantons

In [None]:
# Write the per-canton counts to 'canton_density.json' and keep the resulting dict
cantons_json = create_json_file(grouped_year_canton, 'canton_density.json')