In [186]:
import json
import geopandas as gpd
import pandas as pd
import numpy as np
import geoplot
from shapely.geometry import Point, Polygon
import os
import pytz
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import spacy
from string import punctuation
from collections import Counter
from tqdm import tqdm

In [187]:
# Render all text in Matplotlib-based figures (incl. Geoplot/Geopandas) through LaTeX
plt.rcParams.update({'text.usetex': True})

In [188]:
# spaCy pipeline used by the keyword extraction below
SPACY_MODEL_NAME = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL_NAME)

In [190]:
# Get a list of all .json files saved in two hour intervals.
# The paths are built from the very directory that is listed, so every path
# points at a file that actually exists; sorting makes the order deterministic
# (os.listdir returns entries in arbitrary order).
NEWS_DATA_DIR = "Mapnews-Data"
news_files_paths = sorted(
    os.path.join(NEWS_DATA_DIR, f)
    for f in os.listdir(NEWS_DATA_DIR)
    if f.endswith(".json")
)

In [191]:
# Country outlines from the low-resolution Natural Earth dataset bundled with geopandas
# NOTE(review): `gpd.datasets` availability depends on the installed GeoPandas version — confirm
world_shapes_path = gpd.datasets.get_path('naturalearth_lowres')
world = gpd.read_file(world_shapes_path)

In [192]:
def read_file(path):
    """Read a json news file from a given path and return the parsed content.

    Parameters
    ----------
    path : str or path-like
        Location of the .json file.

    Returns
    -------
    object
        Whatever the JSON document decodes to (list, dict, ...).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # The context manager closes the handle even if reading/parsing fails;
    # JSON text is UTF-8, so do not depend on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

In [193]:
def parse_data(data):
    """Flatten raw news articles into a geodataframe with one row per location mention.

    Every entry of ``data`` must provide ``title``, ``published`` and a
    ``geolocations`` list; each geolocation dict is merged into the row
    (``lng`` and ``lat`` are required to build the point geometry).
    Articles without geolocations contribute no rows.
    """
    # One record per (article, geolocation) pair: title + date + location fields
    records = [
        {"title": article["title"], "date": article["published"], **location}
        for article in data
        for location in article["geolocations"]
    ]
    frame = gpd.GeoDataFrame(records)

    # Build the point geometry (x = longitude, y = latitude)
    frame["geometry"] = frame.apply(lambda row: Point(row.lng, row.lat), axis=1)
    # The location's "type" field is exposed as "loc_type"
    frame = frame.rename(columns={"type": "loc_type"})

    # Parse the publication date (unparseable values are coerced), convert it
    # to UTC and derive the hour of the day as a numeric column
    parsed_dates = pd.to_datetime(frame["date"], errors="coerce")
    frame["date"] = parsed_dates
    frame["date"] = frame["date"].apply(lambda stamp: stamp.astimezone(pytz.utc))
    hour_strings = frame["date"].dt.strftime("%H")
    frame["date_hour"] = pd.to_numeric(hour_strings)

    return frame

In [189]:
def get_keywords(text):
    """Get the keywords from a given text using the spaCy pipeline ``nlp``.

    The text is lower-cased and tokenized; a token is kept when it is neither
    a stop word nor punctuation and is tagged as proper noun, adjective or noun.
    Returns the kept token texts in document order (duplicates included).
    """
    wanted_pos = ['PROPN', 'ADJ', 'NOUN']
    tokens = nlp(text.lower())
    stop_words = nlp.Defaults.stop_words
    return [
        token.text
        for token in tokens
        if not (token.text in stop_words or token.text in punctuation)
        and token.pos_ in wanted_pos
    ]

In [219]:
def get_top_10_locations(data_hb):
    """Return the ten most mentioned locations of a block of news data.

    The ``count`` column is summed per ``name`` and ranked in descending
    order. Returns two parallel lists: the location names and their summed
    counts (at most ten entries each).
    """
    mentions_per_location = data_hb.groupby("name")["count"].sum()
    ranked = mentions_per_location.sort_values(ascending=False)
    leaders = ranked.head(10)
    return list(leaders.index), list(leaders.values)

In [220]:
# Not used (yet)
# Characters that survive the title clean-up: ASCII letters and the space
whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
def get_top_10_keywords(data_hb):
    """Get the top 10 keywords from a block of data / the news' titles.

    Every title is stripped of all characters outside ``whitelist``, the
    cleaned titles are joined into one text, and the keywords extracted by
    ``get_keywords`` are counted.

    Returns two parallel lists: the (title-cased) keywords and their counts,
    most frequent first, at most ten entries each.
    """
    titles_clean = data_hb.title.apply(lambda x: ''.join(filter(whitelist.__contains__, x)))
    titles_clean = " ".join(titles_clean)
    # The keyword extractor in this notebook is `get_keywords`
    # (the previously referenced `get_hotwords` is not defined anywhere).
    top_words = get_keywords(titles_clean)
    # Count once and reuse the ranking for both result lists
    most_common = Counter(top_words).most_common(10)
    top_10_keywords = [word.title() for word, _ in most_common]
    counts = [count for _, count in most_common]
    return top_10_keywords, counts

In [216]:
# To keep the plot from adjusting the viewport to the coordinates
# of the news, a dummy outline is added every time
p = Polygon([[-250, -150], [250, -150], [250, 150], [-250, 150]])
# Pass the polygon as `geometry=` so the active geometry column is set
# explicitly instead of relying on renaming column 0 to "geometry"
outline = gpd.GeoDataFrame(geometry=[p])

def plot_data(hb, progress, df, weights, save_as=None):
    """
    Plot one frame of the news map.

    Parameters
    ----------
    hb : datetime-like
        The hour block being drawn; only used (via ``strftime``) for the
        time label above the progress bar.
    progress : float
        Progress percentage (0-100) shown as the filled part of the bar.
    df : GeoDataFrame
        The news locations active in this hour. Needs a ``geometry`` column
        plus ``count_log`` (point hue/scale) and ``name``/``count``
        (top-10 list).
    weights : indexable of numbers
        One weight per row of ``df`` (looked up by row position); each point
        is repeated ``int(weight + 1)`` times for the density plot.
    save_as : str, optional
        If given, the figure is written to ``"<save_as>.png"``.

    Returns
    -------
    None
        The figure is left open; closing it is up to the caller.
    """

    # Depending on the number of mentions m (already calculated, parameter weights),
    # multiply each location to exist m times
    # Otherwise the kdeplot does not look as good
    repeated_points = [
        point
        for idx, point in enumerate(df.geometry)
        for _ in range(int(weights[idx] + 1))
    ]
    # `geometry=` sets the active geometry column explicitly
    weighted_df = gpd.GeoDataFrame(geometry=repeated_points)

    fig, axs = plt.subplots(2, figsize=(20, 10), gridspec_kw={'height_ratios': [100, 1]})
    # Plot the kdeplot
    geoplot.kdeplot(weighted_df,
                    ax=axs[0],
                    shade=True,
                    n_levels=30,
                    extent=(-180, -90, 180, 90),
                    cmap='viridis')

    # Plot the outline
    geoplot.polyplot(world, ax=axs[0], zorder=1, edgecolor='white')

    # Plot the mentioned points, with their hue on a log scale (otherwise not as clearly visible)
    geoplot.pointplot(df, hue="count_log", scale="count_log", extent=(-180, -90, 180, 90), ax=axs[0], cmap='Wistia')

    # Plot the dummy outline (keeps the viewport fixed)
    outline.boundary.plot(ax=axs[0], zorder=0, facecolor='#440154')

    # Plot top locations of *this* frame's data.
    # Uses the `df` argument; reading the notebook-global `data_hb` here made
    # the function depend on hidden kernel state.
    t, c = get_top_10_locations(df)
    # Reverse so the most mentioned location ends up at the top of the list
    c = c[::-1]
    t = t[::-1]
    for idx, item in enumerate(t):
        axs[0].text(-180, -70+idx*4, "{:4} {}".format(c[idx], item), fontsize=14, color="white")

    # Plot watermark / link
    axs[0].text(-35, -87, r'Data from \textbf{www.mapnews.io}', fontsize=16, color="white")

    # Plot progress
    # Text
    time_now = hb.strftime("%Y-%m-%d %H:%M UTC")
    axs[1].text(50 - 7.5, 0.85, "{:^30}".format(time_now), fontsize=16, color="white")
    # Barplot: full-width background bar, then the progress drawn on top
    axs[1].barh([0], [100], color="#440154")
    axs[1].barh([0], [progress], color="white")
    # Small horizontal margin on both sides of the bar
    of = 3.2
    axs[1].set_xlim([-of + 0, 100 + of])
    axs[1].axis('off')

    # Remove space & set background!
    fig.tight_layout()
    fig.patch.set_facecolor('#440154')

    # If the plot is to be saved, save it.
    if save_as:
        fig.savefig("{}.png".format(save_as), dpi=200, bbox_inches='tight', pad_inches=0.1)

In [None]:
# Load and parse every news file into its own geodataframe
dfs = [parse_data(read_file(path)) for path in news_files_paths]

In [None]:
# Concatenate all files and sort chronologically.
# Sorting by the full timestamp ("date") rather than "date_hour" matters:
# "date_hour" is only the hour of the day (0-23), so with several days of
# data it does not put earlier articles first.
data = pd.concat(dfs)
data = data.sort_values("date")

In [None]:
# De-duplicate: keep only the first occurrence of each (title, location name)
# pair and renumber the rows afterwards
data = data.drop_duplicates(subset=['title', 'name']).reset_index(drop=True)

In [None]:
# Lifetime of a news article on the map, in hours
TIME_UNTIL_REMOVE_HRS = 24
article_lifetime = pd.Timedelta(hours=TIME_UNTIL_REMOVE_HRS)

# Visibility window of each news item: from the publication time truncated to
# the hour (re-parsed from its "%Y-%m-%d %H" string) until one lifetime later
published_hour = data.date.dt.strftime("%Y-%m-%d %H")
data["date_start"] = pd.to_datetime(published_hour)
data["date_end"] = data["date_start"] + article_lifetime

# Log-scaled mention count, for nicer plots
data["count_log"] = 10 * np.log(data["count"])

In [None]:
# Create a series of time blocks from the first to the last day of the data
block_size = 1 #hours
first_day = data.date.min().date()
delta_in_days = (data.date.max().date() - first_day).days + 1
# `periods` has to be an integer: use floor division instead of `24/block_size`,
# which produced a float
n_blocks = (delta_in_days * 24) // block_size
hour_blocks = pd.date_range(first_day, periods=n_blocks, freq='{}H'.format(block_size))

In [None]:
# Start only from the 72nd hour block
hour_blocks_selected = hour_blocks[72:]
max_idx = len(hour_blocks_selected)

# Newly shown articles are ramped up over their first hours instead of
# appearing at full weight immediately
FADE_IN_HRS = 4

# Make sure the output directory for the frames exists
os.makedirs("Plots", exist_ok=True)

for idx, hb in enumerate(hour_blocks_selected):
    # All news items whose visibility window contains this hour block
    data_hb = data[(hb >= data.date_start) & (hb <= data.date_end)]

    print("{}".format(idx), end=" ")

    if data_hb.shape[0] == 0:
        continue

    # Calculate weights:
    # - after the fade-in the weight is the remaining lifetime in hours
    # - during the fade-in it grows linearly from 0 up to the full lifetime
    #   (with a 24 h lifetime and a 4 h fade-in: elapsed hours * 6)
    hours_elapsed_since_break = ((hb - data_hb.date_start) / np.timedelta64(1, 'h')).values
    weights = TIME_UNTIL_REMOVE_HRS - hours_elapsed_since_break
    fading_in = hours_elapsed_since_break <= FADE_IN_HRS
    weights[fading_in] = hours_elapsed_since_break[fading_in] * (TIME_UNTIL_REMOVE_HRS / FADE_IN_HRS)

    # Plot
    progress = idx/max_idx * 100
    plot_data(hb, progress, data_hb, weights, save_as="Plots/{}".format(idx))
    # The frame is on disk now; close it so figures do not pile up in memory
    # over the many iterations of this loop
    plt.close("all")