In [337]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode,iplot
init_notebook_mode(connected=True)
import numpy as np
import pandas as pd
import math
import colorcet as cc

def matplotlib_to_plotly(cmap, pl_entries):
    """Sample a matplotlib-style colormap into a Plotly colorscale.

    Parameters
    ----------
    cmap : callable
        Called as ``cmap(x)`` with ``x`` between 0 and 1; the first three items
        of the returned sequence are taken as red, green and blue and are
        multiplied by 255.
    pl_entries : int
        Number of evenly spaced samples to take; must be at least 2.

    Returns
    -------
    list
        ``[[position, 'rgb(r, g, b)'], ...]`` with ``pl_entries`` items whose
        positions run from 0.0 to 1.0.

    Raises
    ------
    ValueError
        If ``pl_entries`` is smaller than 2 (the step ``1/(pl_entries-1)``
        would be undefined or negative).
    """
    if pl_entries < 2:
        raise ValueError('pl_entries must be at least 2, got %r' % (pl_entries,))

    h = 1.0/(pl_entries-1)
    pl_colorscale = []

    for k in range(pl_entries):
        # Cast the whole RGB triple at once instead of indexing the result of
        # map(): on Python 3 map() returns an iterator that cannot be indexed.
        channels = (np.array(cmap(k*h)[:3])*255).astype(np.uint8)
        # Convert to plain ints so the text is 'rgb(12, 34, 56)' regardless of
        # how the installed NumPy prints its scalar types.
        red, green, blue = [int(channel) for channel in channels]
        pl_colorscale.append([k*h, 'rgb(%d, %d, %d)' % (red, green, blue)])

    return pl_colorscale

# Colorscale used by the heatmap traces: 256 samples of colorcet's
# `m_diverging_rainbow_bgymr_45_85_c67` colormap.
heatmap_colorscale    = matplotlib_to_plotly(cc.m_diverging_rainbow_bgymr_45_85_c67, 256)
# Colorscale used by the choropleth traces: 256 samples of colorcet's `m_bgy_r`
# colormap. The string 'Reds' is kept below as a commented-out alternative.
# choropleth_colorscale = 'Reds'
choropleth_colorscale = matplotlib_to_plotly(cc.m_bgy_r, 256)

In [431]:
def read_dict(filename, columns):
    """Load a tab-separated file and map one of its columns onto another.

    `columns` lists the key column followed by the value column; each row of
    the file contributes one ``key -> value`` entry to the returned dict.
    """
    table = pd.read_csv(filename, sep='\t')
    pairs = table[columns].values
    return {key: value for key, value in pairs}

def draw_choropleth_by_dataframe(df, title, filename=None):
    """Render a choropleth map from a frame with 'country' and 'value' columns.

    Countries are matched by name ('country names' location mode) and coloured
    by `df['value']` using the module-level `choropleth_colorscale`. When
    `filename` is truthy, `iplot` is additionally given `image='png'` and that
    filename; otherwise the figure is only displayed.
    """
    transparent = 'rgba(0,0,0,0)'

    colorbar = {
        'title': None,
        'thickness': 10,
        'ticklen': 0,
        'tickfont': {'size': 16},
        'len': 0.8,
    }
    trace = {
        'type': 'choropleth',
        'locations': df['country'], 'locationmode': 'country names',
        'z': df['value'],
        'text': df['country'],
        'colorscale': choropleth_colorscale,
        'reversescale': False,
        'colorbar': colorbar,
    }

    # NOTE(review): newer Plotly releases document the projection type in
    # lowercase ('mercator') - confirm 'Mercator' is accepted by the installed version.
    geo = {
        'showframe': False,
        'projection': {'type':'Mercator'},
        'showcountries': True,
        'bgcolor': transparent,
    }
    page = {
        'title': title,
        'titlefont': {'size': 24},
        'margin': {'l': 0, 't': 50, 'b': 0, 'r': 0, 'pad': 0,},
        'geo': geo,
        'paper_bgcolor': transparent,
        'plot_bgcolor': transparent,
    }

    figure = go.Figure(data=[trace], layout=page)

    # The PNG export arguments are only passed when a filename was requested.
    export_options = {'image': 'png', 'filename': filename} if filename else {}
    iplot(figure, validate=False, image_width=1280, **export_options)

def choropleth_df(value_method, relevant_countries):
    """Build a two-column frame ('country', 'value') for the given countries.

    `value_method` is called once per country, in iteration order, and its
    result becomes that country's 'value'.
    """
    rows = []
    for name in relevant_countries:
        rows.append((name, value_method(name)))
    return pd.DataFrame(rows, columns=['country', 'value'])

def draw_choropleth(value_method, relevant_countries, title, filename=None):
    """Compute one value per country with `value_method` and draw the map.

    Thin convenience wrapper: builds the frame with `choropleth_df` and hands
    it to `draw_choropleth_by_dataframe` together with `title` and `filename`.
    """
    frame = choropleth_df(value_method, relevant_countries)
    draw_choropleth_by_dataframe(frame, title, filename)


In [434]:
# Country -> population, and the countries with more than 1,000,000 inhabitants.
population_by_country = read_dict('population.tsv', ['country', 'population'])
big_countries = [country for (country,population) in population_by_country.items() if population > 1000000]

# Country -> download count: overall, IEEE only, Elsevier only.
all_counts  = read_dict('country_heatmap.tsv', columns=['country', 'count'])
ieee_counts = read_dict('country_heatmap_ieee.tsv', columns=['country', 'count'])
elsevier_counts = read_dict('country_heatmap_elsevier.tsv', columns=['country', 'count'])

# Countries with more than 100,000 downloads overall.
scihub_active_countries = [country for (country,count) in all_counts.items() if count > 100000]
# Ratio maps are restricted to countries present in all three count tables
# that also pass both the population and the activity thresholds above.
relevant_countries = set(all_counts.keys()) & \
                     set(elsevier_counts.keys()) & \
                     set(ieee_counts.keys()) & \
                     set(big_countries) & \
                     set(scihub_active_countries)

# Absolute download counts for every country in `all_counts`, raw and log10.
# NOTE(review): math.log10 raises ValueError for a count of 0 - assumes every
# listed country has at least one download; confirm against the data.
draw_choropleth(lambda country: all_counts[country], all_counts.keys(),
                title='Number of downloads', filename='choropleth_num_downloads')
draw_choropleth(lambda country: math.log10(all_counts[country]), all_counts.keys(),
                title='log10(number of downloads)', filename='choropleth_log10_num_downloads')

# Downloads per capita (only the log10 variant is drawn; the linear one is disabled).
# draw_choropleth(lambda country: all_counts[country] * 1.0 / population_by_country[country], relevant_countries,
#                 title='Number of downloads per capita', filename='choropleth_num_downloads_per_capita')
draw_choropleth(lambda country: math.log10(all_counts[country] * 1.0 / population_by_country[country]), relevant_countries,
                title='log10(number of downloads per capita)', filename='choropleth_log10_num_downloads_per_capita')

# Publisher share of each country's downloads; `* 1.0` forces float division.
# draw_choropleth(lambda country: ieee_counts.get(country,0), all_counts.keys(),
#                 title='Number of IEEE downloads', filename='choropleth_num_ieee_downloads')
draw_choropleth(lambda country: ieee_counts.get(country,0) * 1.0 / all_counts[country], relevant_countries,
                title='Share of IEEE downloads', filename='choropleth_share_ieee_downloads')
draw_choropleth(lambda country: elsevier_counts.get(country,0) * 1.0 / all_counts[country], relevant_countries,
                title='Share of Elsevier downloads', filename='choropleth_share_elsevier_downloads')


# Which countries have the highest share of IEEE / Elsevier downloads.
# The "average" is the total publisher count divided by the total count over
# all countries in the tables (not only `relevant_countries`); the tables
# below list the five relevant countries with the largest share.
print('Average share of IEEE downloads: ' + str(sum(ieee_counts.values())*1.0 / sum(all_counts.values())))
share_ieee_df = choropleth_df(lambda country: ieee_counts.get(country,0) * 1.0 / all_counts[country], relevant_countries)
print(share_ieee_df.sort_values('value',ascending=False).head())

print('Average share of Elsevier downloads: ' + str(sum(elsevier_counts.values())*1.0 / sum(all_counts.values())))
share_elsevier_df = choropleth_df(lambda country: elsevier_counts.get(country,0) * 1.0 / all_counts[country], relevant_countries)
print(share_elsevier_df.sort_values('value',ascending=False).head())

Average share of IEEE downloads: 0.0659162004891
        country     value
72        India  0.207174
83  South Korea  0.201090
4    Bangladesh  0.184497
40    Singapore  0.148476
10     Ethiopia  0.141017
Average share of Elsevier downloads: 0.304348180396
    country     value
50  Morocco  0.517910
66  Algeria  0.508469
14  Bolivia  0.500272
69  Tunisia  0.493255
12     Peru  0.458112


In [4]:
# Germany was not subscribed to Elsevier for more than 40 days of the year:
# https://www.nature.com/news/german-scientists-regain-access-to-elsevier-journals-1.21482
# The same share analysis is therefore repeated on the 01 Jan - 09 Feb counts.
all_counts_january  = read_dict('country_heatmap_january_upto_09feb.tsv', columns=['country', 'count'])
elsevier_counts_january = read_dict('country_heatmap_elsevier_january_upto_09feb.tsv', columns=['country', 'count'])
# The yearly activity threshold (100,000 downloads) is scaled down to the 40-day window.
scihub_active_countries_january = [country for (country,count) in all_counts_january.items() if count > 100000 * (40.0 / 365)]
# Reuses `big_countries` from the full-year cell; IEEE counts are not required here.
relevant_countries_january = set(all_counts_january.keys()) & \
                     set(elsevier_counts_january.keys()) & \
                     set(big_countries) & \
                     set(scihub_active_countries_january)
# The filename argument is commented out, so this map is drawn without the PNG export arguments.
draw_choropleth(lambda country: elsevier_counts_january.get(country,0) * 1.0 / all_counts_january[country], relevant_countries_january,
                title='Share of Elsevier downloads (01 Jan - 09 Feb)')#, filename='choropleth_share_elsevier_downloads_january')

# Which countries have the highest share of Elsevier downloads in the first 40 days of the year.

print('Average share of Elsevier downloads: ' + str(sum(elsevier_counts_january.values())*1.0 / sum(all_counts_january.values())))
share_elsevier_df_january = choropleth_df(lambda country: elsevier_counts_january.get(country,0) * 1.0 / all_counts_january[country], relevant_countries_january)
print(share_elsevier_df_january.sort_values('value',ascending=False).head())


Average share of Elsevier downloads: 0.317110695461
    country     value
23  Algeria  0.514224
47  Morocco  0.502503
71  Tunisia  0.497833
17  Ecuador  0.468010
11     Peru  0.467372


In [5]:
import csv

def read_time_counts(counts_filename, mode, init=None):
    """Load per-place time series from a tab-separated file.

    The first row is a header. Every following row starts with the place
    columns, then a total column (ignored), then one numeric value per time
    slot:

    * ``mode == 'country'``: ``country, total, v1, v2, ...`` -> key ``(country,)``
    * ``mode == 'city'``: ``country, city, total, v1, v2, ...`` -> key ``(country, city)``

    Parameters
    ----------
    counts_filename : str
        Path of the TSV file to read.
    mode : str
        ``'country'`` or ``'city'`` (see above).
    init : dict, optional
        Existing mapping to add the rows to. It is updated in place and
        returned, even when it is empty. A new dict is created when omitted.

    Returns
    -------
    tuple
        ``(counts_by_place, timerange_header)`` where ``counts_by_place`` maps
        the place tuple to a list of floats and ``timerange_header`` holds the
        header labels of the value columns (``[]`` for an empty file).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values, or a value cannot be
        parsed as a float.
    """
    # Number of leading columns that identify the place in each mode.
    if mode == 'country':
        key_width = 1
    elif mode == 'city':
        key_width = 2
    else:
        raise ValueError('Unknown mode: %r' % (mode,))
    first_value_column = key_width + 1  # skip the place columns and the total

    # `is not None` (rather than truthiness) so an empty dict supplied by the
    # caller is still the one that gets filled.
    counts_by_place = init if init is not None else {}
    timerange_header = []
    with open(counts_filename) as f:
        reader = csv.reader(f, delimiter='\t')
        for idx, row in enumerate(reader):
            if idx == 0:
                # Header: keep only the labels that line up with the values.
                timerange_header = row[first_value_column:]
                continue
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue

            place = tuple(row[:key_width])
            counts_by_place[place] = [float(x) for x in row[first_value_column:]]
    return (counts_by_place, timerange_header)

# For each resolution, load the country-level series first, then add the
# city-level series to the same dict via `init` (the return value of the
# second call is discarded because the dict is updated through `init`).
rates_weektime10_by_place, weektime10_header = read_time_counts('rates/country/weekhour_10.tsv', mode='country')
read_time_counts('rates/city/weekhour_10.tsv', mode='city', init=rates_weektime10_by_place)

rates_weektime60_by_place, weektime60_header = read_time_counts('rates/country/weekhour.tsv', mode='country')
read_time_counts('rates/city/weekhour.tsv', mode='city', init=rates_weektime60_by_place)

rates_daytime10_by_place, daytime10_header = read_time_counts('rates/country/daytime_10.tsv', mode='country')
read_time_counts('rates/city/daytime_10.tsv', mode='city', init=rates_daytime10_by_place)

# Append one closing label to each header; the plotting functions append the
# first value of every series again so the curve ends where it started.
weektime10_header += ['Sun, 24:00']
weektime60_header += ['Sun, 24:00']
daytime10_header += ['24:00']  # 24:00 is the same as 00:00
print('Loaded')

Loaded


In [437]:
def plot_daytime(places, filename=None, rescale=True, pull_down=False):
    """Plot the time-of-day series of each place as one line per place.

    Parameters
    ----------
    places : iterable of tuple
        Keys of the module-level ``rates_daytime10_by_place`` mapping, i.e.
        ``(country,)`` or ``(country, city)``; the legend name is the tuple
        joined with ``', '``.
    filename : str, optional
        When truthy, ``iplot`` is additionally given ``image='png'`` and this
        filename.
    rescale : bool
        Divide every series by its maximum so that it peaks at 1. A series
        whose maximum is 0 is left as it is instead of dividing by zero.
    pull_down : bool
        Subtract the minimum of every series first, so that it touches 0.

    Raises
    ------
    KeyError
        If a place is not present in ``rates_daytime10_by_place``.
    """
    transparent = 'rgba(0,0,0,0)'
    layout = {
        'margin': {'b':80, 't': 0, 'l': 45},
        'legend': {'y' : 0.5, 'font': {'size': 20},},
        'font': {'size': 20},
        'yaxis': {
            'range': [0,1.05],
            'rangemode': 'tozero',
            'showline': True,
            'gridwidth': 1,
            'gridcolor': '#bdbdbd',
        },
        'xaxis': {
            'dtick': 24, # every 4 hours (presumably 24 ten-minute slots)
            'showline': True,
            'gridwidth': 1,
            'gridcolor': '#bdbdbd',
            'tickangle': 270,
        },
        'paper_bgcolor': transparent,
        'plot_bgcolor': transparent,
    }

    data = []
    for place in places:
        vals = rates_daytime10_by_place[place]

        if pull_down:
            minval = min(vals)
            vals = [x - minval for x in vals]
        if rescale:
            maxval = max(vals)
            # A flat series becomes all zeros after pull_down; dividing by its
            # maximum would raise ZeroDivisionError, so it is left unscaled.
            if maxval:
                vals = [x*1.0 / maxval for x in vals]

        data.append(go.Scatter(
            x = daytime10_header,
            y = vals + [vals[0]], # 24:00 is the same as 00:00
            mode = 'lines',
            line = {'width': 5},
            name = ', '.join(place)
        ))

    fig = go.Figure(data=data, layout=layout)
    # The PNG export arguments are only passed when a filename was requested.
    export_options = {'image': 'png', 'filename': filename} if filename else {}
    iplot(fig, image_width=1280, image_height=1024, **export_options)

def plot_weektime(places, filename=None, mode='weektime60', rescale=True, pull_down=False):
    """Plot the time-of-week series of each place as one line per place.

    A thick coloured segment per weekday is drawn along y = 0 underneath the
    curves so the days can be told apart.

    Parameters
    ----------
    places : iterable of tuple
        Keys of the module-level rates mapping for the chosen mode, i.e.
        ``(country,)`` or ``(country, city)``; the legend name is the tuple
        joined with ``', '``.
    filename : str, optional
        When truthy, ``iplot`` is additionally given ``image='png'`` and this
        filename.
    mode : str
        ``'weektime10'`` (uses ``rates_weektime10_by_place`` and
        ``weektime10_header``, 24*6 slots per day) or ``'weektime60'`` (uses
        ``rates_weektime60_by_place`` and ``weektime60_header``, 24 slots per day).
    rescale : bool
        Divide every series by its maximum so that it peaks at 1. A series
        whose maximum is 0 is left as it is instead of dividing by zero.
    pull_down : bool
        Subtract the minimum of every series first, so that it touches 0.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the two supported values.
    KeyError
        If a place is not present in the rates mapping.
    """
    # Validate before touching any data; raising a bare string is itself a
    # TypeError, so a real exception type is used.
    if mode not in ['weektime10', 'weektime60']:
        raise ValueError('Unknown mode')

    if mode == 'weektime10':
        dtick = 24 # every 4 hours
        slots_per_day = 24*6
        x_header = weektime10_header
        rates_by_place = rates_weektime10_by_place
    else:
        dtick = 4 # every 4 hours
        slots_per_day = 24
        x_header = weektime60_header
        rates_by_place = rates_weektime60_by_place

    transparent = 'rgba(0,0,0,0)'
    layout = {
        'margin': {'b':120, 't': 0, 'l': 40},
        'legend': {'y' : 0.5, 'font': {'size': 20},},
        'font': {'size': 18},
        'yaxis': {
            'range': [0,1.05],
            'rangemode': 'tozero',
            'showline': True,
            'gridwidth': 1,
            'gridcolor': '#bdbdbd',
            'showticklabels': True,
        },
        'xaxis': {
            'dtick': dtick, # every 4 hours
            'showline': False,
            'gridwidth': 1,
            'gridcolor': '#bdbdbd',
            'tickangle': 270,
        },
        'paper_bgcolor': transparent,
        'plot_bgcolor': transparent,
    }

    data = []
    for place in places:
        vals = rates_by_place[place]

        if pull_down:
            minval = min(vals)
            vals = [x - minval for x in vals]
        if rescale:
            maxval = max(vals)
            # A flat series becomes all zeros after pull_down; dividing by its
            # maximum would raise ZeroDivisionError, so it is left unscaled.
            if maxval:
                vals = [x*1.0 / maxval for x in vals]

        data.append(go.Scatter(
            x = x_header,
            y = vals + [vals[0]], # Sun 24:00 is the same as Mon 00:00
            mode = 'lines',
            line = {'width': 5},
            name = ', '.join(place)
        ))

    # Draw borderline of different colors for different weekdays
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday','Friday', 'Saturday', 'Sunday']
    rainbow_colors = ['Red', 'Orange', 'Yellow', 'Green', 'Cyan', 'Blue', 'Violet']
    for wday in range(7):
        # One extra slot so each segment reaches the first label of the next
        # day (the header carries a closing 'Sun, 24:00' entry for Sunday).
        xs = [x_header[wday*slots_per_day + slot] for slot in range(slots_per_day + 1)]
        ys = [0]*(slots_per_day + 1)

        data.append({'mode':'lines', 'name': weekday_names[wday], 'showlegend': False,
                     'line': {'width': 10, 'color': rainbow_colors[wday]},
                     'x': xs, 'y': ys,
                    })

    fig = go.Figure(data=data, layout=layout)
    # The PNG export arguments are only passed when a filename was requested.
    export_options = {'image': 'png', 'filename': filename} if filename else {}
    iplot(fig, image_width=1280, image_height=1024, **export_options)

In [438]:
# Weekly / daily profiles for selected places. Places are (country,) or
# (country, city) tuples; every active call passes a filename for the PNG export.

# Moscow alone, then against Novosibirsk (also with the minimum subtracted).
plot_weektime([('Russia', 'Moskva'), ], filename = 'weekplot_Moscow')
plot_weektime([('Russia', 'Moskva'), ('Russia', 'Novosibirsk'), ], filename = 'weekplot_Moscow-Novosib')
plot_weektime([('Russia', 'Moskva'), ('Russia', 'Novosibirsk'), ], pull_down = True, filename = 'weekplot_Moscow-Novosib-pulldown')
plot_daytime([('Russia',), ('France',)], filename = 'dayplot_Russia-France')

# Country-level weekly comparisons.
plot_weektime([('Russia',), ('Colombia',)], filename = 'weekplot_Russia-Colombia')
plot_weektime([('Russia',), ('Israel',)], filename = 'weekplot_Russia-Israel')
plot_weektime([('Russia',), ('Cuba',)], filename = 'weekplot_Russia-Cuba')
plot_weektime([('Russia',), ('India',)], filename = 'weekplot_Russia-India')
plot_weektime([('India',), ('India', 'Kolkata')], filename = 'weekplot_India-all-Kolkata')
plot_weektime([('Iraq',), ('Iran',)], filename = 'weekplot_Iran-Iraq')

# China against other countries and against individual Chinese cities.
plot_daytime([('China',), ('South Korea',), ('France',), ('Germany',)], filename = 'dayplot_China-France-Germany-SouthKorea')
plot_daytime([('China',), ('China', 'Beijing'), ('China', 'Shanghai'), ('China', 'Guangzhou Shi')], filename = 'dayplot_China-cities')
plot_weektime([('China',), ('China', 'Dingxi Shi')], filename = 'weekplot_China-DingxiShi')

# Russia / Moscow against the United States and single US cities, plus Japan and Canada.
plot_weektime([('Russia', 'Moskva'), ('United States', 'Los Angeles')], filename = 'weekplot_Moscow-LosAngeles')
plot_weektime([('Russia', 'Moskva'), ('United States', 'Chicago')], filename = 'weekplot_Moscow-Chicago')
plot_weektime([('Russia', ), ('United States', )], filename = 'weekplot_Russia-US')
plot_weektime([('Russia', ), ('United States', )], pull_down = True, filename = 'weekplot_Russia-US-pulldown')
plot_weektime([('Russia', 'Moskva'), ('United States', 'Boston'),], filename = 'weekplot_Moscow-Boston')
plot_weektime([('Japan',)], filename = 'weekplot_Japan')
plot_daytime([('Japan','Chiyoda-ku')], filename = 'dayplot_Japan-Chiyoda-ku')
# The two spellings of the city are plotted as separate series; the filename
# is passed positionally here.
plot_daytime([('Canada', 'Montréal'), ('Canada', 'Montreal')], 'dayplot_Canada-Montreals')

# Disabled experiments, kept for reference.
## plot_weektime([('Russia', 'Moskva'), ('United States', 'New York')])
## plot_weektime([('Russia',), ('France',)], mode='weektime60', filename = 'weekplot_Russia-France')
## plot_weektime([('Russia',), ('India',)], mode='weektime10', filename = 'weekplot10_Russia-India')
## plot_weektime([('Algeria',),])


In [439]:
import time, datetime
import os

def date_by_yday(yday, year=2017):
    """Return midnight of the `yday`-th day of `year` as a naive datetime.

    Parameters
    ----------
    yday : int
        1-based day of the year; values past the end of the year roll over
        into the following year.
    year : int
        Calendar year the day number refers to (defaults to 2017).

    Returns
    -------
    datetime.datetime
        ``datetime(year, 1, 1)`` shifted by ``yday - 1`` whole days.
    """
    # Calendar arithmetic with timedelta rather than adding 86400-second steps
    # to a local timestamp: the latter lands on 23:00 or 01:00 (and possibly on
    # the neighbouring date) once a daylight-saving change lies in between.
    return datetime.datetime(year, 1, 1) + datetime.timedelta(days=yday - 1)

def human_week(week):
    """Format zero-based week number `week` as a 'DD Mon - DD Mon' label.

    The first date is day ``1 + week*7`` of the year and the second date is
    seven days later, both obtained from `date_by_yday`.
    """
    week_start_yday = 1 + week*7
    boundary_ydays = (week_start_yday, week_start_yday + 7)
    labels = tuple(date_by_yday(yday).strftime('%d %b') for yday in boundary_ydays)
    return "%s - %s" % labels

# mode can be one of: week10 / week60 / day10 / day60
def heatmap(heatmap_filename, title=None, mode='week10', output_filename=None):
    """Draw a week-by-time heatmap from a tab-separated file.

    Every row of the file is one week (first row = first week of the year) and
    every column one time slot; the column names become the x labels. The first
    column is repeated at the right edge under a closing '24:00' label.

    Parameters
    ----------
    heatmap_filename : str
        Path of the TSV file to read.
    title : str, optional
        Figure title; defaults to the file name without directory and extension.
    mode : str
        One of ``'week10'``, ``'week60'``, ``'day10'``, ``'day60'``. It selects
        the closing x label, the bottom margin and the x tick spacing.
    output_filename : str, optional
        When truthy, ``iplot`` is additionally given ``image='png'`` and this
        filename.

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values. This is checked before
        the file is read.
    """
    # mode -> (closing x label, bottom margin, x dtick)
    mode_settings = {
        'week10': ('Sun, 24:00', 135, 24), # each 4 hours
        'week60': ('Sun, 24:00', 135, 4),  # each 4 hours
        'day10':  ('24:00', 85, 6),        # each hour
        'day60':  ('24:00', 85, 1),        # each hour
    }
    # Raising a bare string is itself a TypeError, so use a real exception
    # type and fail before any file I/O happens.
    if mode not in mode_settings:
        raise ValueError('Unknown mode')
    closing_label, bottom_margin, dtick = mode_settings[mode]

    if not title:
        title = os.path.splitext(os.path.basename(heatmap_filename))[0]
    spectra = pd.read_csv(heatmap_filename, sep='\t')
    spectra = spectra.iloc[::-1] # the first week of a year goes at image top

    x_header = list(spectra.keys()) + [closing_label]
    # One label per data row, reversed like the rows themselves, so the labels
    # stay aligned with the data even when the file does not hold 53 weeks.
    week_labels = list(reversed([human_week(week) for week in range(len(spectra))]))

    transparent = 'rgba(0,0,0,0)'
    layout = {
        'title': title,
        'titlefont': {'size': 20},
        'margin': {'l':185, 'b': bottom_margin, 't': 50},
        'yaxis': { 'dtick': 1, 'tickfont': {'size': 20} },
        'xaxis': { 'dtick': dtick, 'ticklen': 0, 'tickangle': 270, 'tickfont':{'size': 20} },
        'paper_bgcolor': transparent,
        'plot_bgcolor': transparent,
    }
    data = [{
        'type': 'heatmap', 'colorscale': heatmap_colorscale,
        # Append the first column again to fill the closing x label.
        'z':  np.hstack([spectra.values, spectra.values[:,0].reshape(-1,1)]),
        'x': x_header,
        'y': week_labels,
        'colorbar': { 'thickness': 10, 'ticklen': 0, 'tickfont': {'size': 16} },
    }]
    fig = go.Figure(data=data, layout=layout)
    # The PNG export arguments are only passed when a filename was requested.
    export_options = {'image': 'png', 'filename': output_filename} if output_filename else {}
    iplot(fig, image_width=1280, image_height=1024, **export_options)

In [441]:
# Week-by-time heatmaps for selected countries; each call passes an output
# filename for the PNG export.
heatmap('heatmaps/daytime10/Russia.tsv', mode='day10', output_filename='heatmap_Russia_daytime')
# NOTE(review): a file from the weektime60 directory is drawn with mode='week10'
# (x tick every 24 columns) - confirm this is intended rather than 'week60'.
heatmap('heatmaps/weektime60/Russia.tsv', mode='week10', output_filename='heatmap_Russia_weektime60')

heatmap('heatmaps/daytime10/France.tsv', mode='day10', output_filename='heatmap_France_daytime')
heatmap('heatmaps/daytime10/Germany.tsv', mode='day10', output_filename='heatmap_Germany_daytime')
heatmap('heatmaps/daytime10/South Korea.tsv', mode='day10', output_filename='heatmap_SouthKorea_daytime')

heatmap('heatmaps/daytime10/Iran.tsv', mode='day10', output_filename='heatmap_Iran_daytime')
heatmap('heatmaps/daytime10/Iraq.tsv', mode='day10', output_filename='heatmap_Iraq_daytime')
heatmap('heatmaps/daytime10/Morocco.tsv', mode='day10', output_filename='heatmap_Morocco_daytime')
heatmap('heatmaps/weektime60/Seychelles.tsv', mode='week60', output_filename='heatmap_Seychelles_weektime')

# Disabled experiments, kept for reference.
# heatmap('heatmaps/weektime10/Russia.tsv', mode='week10', output_filename='heatmap_Algeria')
# heatmap('heatmaps/daytime10/Morocco.tsv', mode='day10')
# heatmap('heatmaps/weektime10/Iran.tsv', mode='week10', output_filename=None)
# heatmap('heatmaps/custom/Moscow_daytime60.tsv', mode='day60', output_filename='heatmap_Moscow_daytime60')

In [391]:
# Display-only heatmap (no output filename) read from the spectra/ directory
# rather than heatmaps/.
heatmap('spectra/weektime60_week/Brazil.tsv', mode='week60')