In [1]:
import os
import pandas as pd
import numpy as np
import folium
import urllib
import json
import socket
from ipwhois import IPWhois
import pycountry
import io
import requests
import xarray as xr
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf
import geoviews.tile_sources as gts
import geopandas
from bokeh.models import HoverTool

from bokeh.palettes import YlOrBr3 as palette

import cartopy
from cartopy import crs as ccrs

from bokeh.tile_providers import STAMEN_TONER
from bokeh.models import WMTSTileSource

hv.notebook_extension('bokeh')

from geolite2 import geolite2
import time

In [2]:
# Root folder of the project's data files, relative to the notebook.
DATA_PATH = "data/"
# NOTE(review): `data` is not referenced by any code visible in this notebook — confirm before removing.
data = 'data/DataFrame/'
# Country-code lookup table read by add_sourceCountry() ('TLD' column) and
# mapping_country23() ('FIPS' column); both map to 'ISO3166-1-Alpha-3'.
COUNTRY_CODE_DATA = DATA_PATH + "country-codes/data/country-codes.csv"

In [54]:
# Column names of the raw export / mentions datasets.
# NOTE(review): neither `col_ex_list` nor `col_men_list` is used by the code visible in this
# notebook, and 'SOURCEURL\n' carries a literal newline in its name — presumably mirroring a raw
# header; confirm before relying on it.
col_ex_list = ['GlobalEventID', 'Day', 'FractionDate', 'ActionGeo_CountryCode', 'ActionGeo_Lat', 'ActionGeo_Long', 
               'GoldsteinScale', 'NumMentions', 'IsRootEvent', 'AvgTone', 'SOURCEURL\n']
col_men_list = ['GlobalEventId', 'MentionTimeDate', 'MentionSourceName', 'Confidence', 'MentionDocTone', 
                'MentionDocLen', 'MentionIdentifier']

# Columns kept in the "lite" files. Earlier versions of these lists also kept
# 'FractionDate', 'IsRootEvent' and 'MentionTimeDate'; the lists below are the current ones.
# `ex_keep` is read by make_exlite(); `men_keep` is read by make_menlite() and load_clean().
ex_keep = ['GlobalEventID', 'Day', 'ActionGeo_CountryCode', 'ActionGeo_Lat', 
           'ActionGeo_Long','GoldsteinScale', 'NumMentions', 'AvgTone']
men_keep = ['GlobalEventId', 'MentionSourceName', 'Confidence', 'MentionDocTone']

def make_exlite(month_n):
    '''
    Write a "lite" copy of one export file: keep only the `ex_keep` columns, downcast the
    numeric columns and save the result as an uncompressed CSV.

    Reads  data_read + 'export_<month_n>.csv.gz' (first column is used as the index).
    Writes data_save + 'export_<month_n>.csv' (without the index).
    `data_read`, `data_save` and `ex_keep` are notebook-level globals that must be defined
    before this function is called.

    Parameters
    ----------
    month_n : int
        Number that appears in the input/output file names.

    Returns
    -------
    None
    '''
    export_df = pd.read_csv(data_read+'export_'+str(month_n)+'.csv.gz',
                            compression='gzip', index_col=0, header=0, low_memory=False)
    # Work on an independent copy so the column assignments below never target a slice.
    export_df = export_df[ex_keep].copy()

    int_list = ['GlobalEventID', 'Day', 'NumMentions']
    fl_list = ['ActionGeo_Lat', 'ActionGeo_Long', 'GoldsteinScale', 'AvgTone']

    # Replace each column outright. `df.loc[:, cols] = ...` writes the values into the existing
    # arrays on recent pandas versions, which keeps the old dtype and discards the downcast.
    # Values that cannot be parsed become NaN (errors='coerce').
    for col in int_list:
        export_df[col] = pd.to_numeric(export_df[col], downcast='integer', errors='coerce')
    for col in fl_list:
        export_df[col] = pd.to_numeric(export_df[col], downcast='float', errors='coerce')

    # 'utf-8' is the canonical codec name (the previous spelling was 'utf=8').
    export_df.to_csv(data_save+'export_'+str(month_n)+'.csv', encoding='utf-8', index=False)

    return None
       
def make_menlite(month_n):
    '''
    Write a "lite" copy of one mentions file: keep only the `men_keep` columns, downcast the
    numeric columns and save the result as an uncompressed CSV.

    Reads  data_read + 'mentions_<month_n>.csv.gz' (first column is used as the index).
    Writes data_save + 'mentions_<month_n>.csv' (without the index).
    `data_read`, `data_save` and `men_keep` are notebook-level globals that must be defined
    before this function is called.

    Parameters
    ----------
    month_n : int
        Number that appears in the input/output file names.

    Returns
    -------
    None
    '''
    mentions_df = pd.read_csv(data_read+'mentions_'+str(month_n)+'.csv.gz', compression='gzip',
                              index_col=0, header=0, low_memory=False)
    # Work on an independent copy so the column assignments below never target a slice.
    mentions_df = mentions_df[men_keep].copy()

    int_list = ['GlobalEventId']
    fl_list = ['Confidence', 'MentionDocTone']

    # Replace each column outright. `df.loc[:, cols] = ...` writes the values into the existing
    # arrays on recent pandas versions, which keeps the old dtype and discards the downcast.
    # Values that cannot be parsed become NaN (errors='coerce').
    for col in int_list:
        mentions_df[col] = pd.to_numeric(mentions_df[col], downcast='integer', errors='coerce')
    for col in fl_list:
        mentions_df[col] = pd.to_numeric(mentions_df[col], downcast='float', errors='coerce')

    # 'utf-8' is the canonical codec name (the previous spelling was 'utf=8').
    mentions_df.to_csv(data_save+'mentions_'+str(month_n)+'.csv', encoding='utf-8', index=False)

    return None
    
def load_clean(month_n, usecols_e=None, usecols_m=None):
    '''
    Load the lite export and mentions files of one month and inner-join them on the event id.

    Reads data_read + 'export_<month_n>.csv' and data_read + 'mentions_<month_n>.csv'
    (`data_read` and `men_keep` are notebook-level globals).

    Parameters
    ----------
    month_n : int
        Number that appears in the file names.
    usecols_e : list of str, optional
        Columns to read from the export file (passed to `pd.read_csv(usecols=...)`).
        Must include 'GlobalEventID' when given.
    usecols_m : list of str, optional
        Columns to read from the mentions file. Must include 'GlobalEventId' and
        'MentionSourceName' when given.

    Returns
    -------
    pandas.DataFrame
        One row per (event, mention) pair whose 'MentionSourceName' is not missing, with the
        event id in the first column, named 'GlobalEventID'.
    '''
    export_df = pd.read_csv(data_read+'export_'+str(month_n)+'.csv', usecols=usecols_e,
                            index_col=None, header=0, low_memory=False)

    # Only the file of `month_n` is loaded. The previous implementation looped over
    # range(month_n, month_n+1) (a single iteration that ignored its loop variable) and
    # concatenated onto an empty frame, which relies on deprecated dtype handling in pandas.
    mentions_df = pd.read_csv(data_read+'mentions_'+str(month_n)+'.csv', usecols=usecols_m,
                              index_col=None, header=0, low_memory=False)
    # Keep only the `men_keep` columns that are present, in `men_keep` order — the same
    # column set the former inner-join concat produced.
    mentions_df = mentions_df[[col for col in men_keep if col in mentions_df.columns]]

    # Inner join on the event id (spelled differently in the two files).
    df_merged = export_df.set_index('GlobalEventID').join(mentions_df.set_index('GlobalEventId'),
                                                          how='inner')

    # Drop mentions without a source name and bring the event id back as a column.
    df_merged = df_merged.dropna(subset=['MentionSourceName']).reset_index()

    # The restored index column may not carry the expected name; force it.
    df_merged = df_merged.rename(columns={df_merged.columns[0]: 'GlobalEventID'})

    return df_merged




def add_sourceCountry(df_source, source_name):
    '''
    Return a copy of `df_source` with a 'SourceCountry' column (ISO alpha-3 code) derived
    from its 'MentionSourceName' column.

    Countries come from three places, in this order of precedence:
      1. a short hand-maintained list inside this function,
      2. the cache file data_read + 'processed/<source_name>.csv'
         (columns 'MentionSourceName' and 'SourceCountry'),
      3. for source names found in neither: the top-level domain looked up in
         COUNTRY_CODE_DATA, then a DNS + GeoLite2 lookup of the host.
    Sources whose country cannot be determined get NaN.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Must contain a 'MentionSourceName' column. Not modified.
    source_name : str
        File name (without '.csv') of the cached source/country table.

    Returns
    -------
    pandas.DataFrame
        Copy of `df_source` with the extra 'SourceCountry' column.
    '''
    def get_map_site():
        file = pd.read_csv(COUNTRY_CODE_DATA)
        return dict(zip(file['TLD'], file['ISO3166-1-Alpha-3'])), dict(zip(file['ISO3166-1-Alpha-3'], file['TLD']))

    def extension_lookup(website):
        # Country of the top-level domain; None when the TLD is unknown or the
        # source name is not a string (e.g. NaN).
        try:
            return site['.' + website.split('.')[-1]]
        except (KeyError, AttributeError, TypeError):
            return None

    def ip_lookup(website):
        # NOTE(review): the default socket timeout applies to new socket objects; it may not
        # bound the DNS resolution done by gethostbyname — confirm.
        socket.setdefaulttimeout(1.5)

        # Best effort: any resolution / database / parsing failure yields NaN.
        # `except Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working
        # during long lookups.
        try:
            ip = socket.gethostbyname(website)
            reader = geolite2.reader()
            output = reader.get(ip)
            results = output['country']['iso_code']
            country = pycountry.countries.get(alpha_2=results)
            return country.alpha_3
        except Exception:
            return np.nan

    def two_way_lookup(x):
        ret = extension_lookup(x)
        if ret is None:
            ret = ip_lookup(x)
        return ret

    site, _ = get_map_site()

    # Creating a copy of df_source
    df_source = df_source.copy()
    # Loading the cached source/country table
    data_source = pd.read_csv(data_read + 'processed/'+source_name +'.csv', index_col=None, header=0, low_memory=False)

    # Adding important sources manually
    source_men_list = ['BBC Monitoring','foxnews.com', 'ctpost.com', 'seattlepi.com','beaumontenterprise.com',
                   'lmtonline.com', 'thisdaylive.com','turkishweekly.net','stamfordadvocate.com',
                   'greenwichtime.com']
    source_men_country = ['GBR', 'USA', 'USA', 'USA', 'USA', 'USA', 'NGA', 'TUR', 'USA', 'USA']
    manual_df = pd.DataFrame({'MentionSourceName': source_men_list,
                              'SourceCountry': source_men_country})
    known_df = pd.concat([data_source, manual_df], sort=False, axis=0, ignore_index=True)

    # One row per source name. The manual entries are appended last, so keep='last' lets them
    # override the cache. De-duplicating whole rows could leave the same name with two different
    # countries, which makes the final `.map` fail on a non-unique index.
    known_df = known_df.drop_duplicates(subset=['MentionSourceName'], keep='last').reset_index(drop=True)
    print('**** source_df shape: ', known_df.shape)

    # Unique source names of the input
    df_sourceName = df_source.loc[:, 'MentionSourceName'].drop_duplicates(keep='first').reset_index(drop=True)
    source_df = pd.merge(pd.DataFrame(df_sourceName),
                         known_df, on=['MentionSourceName'], how='left')

    # Only names absent from BOTH the cache and the manual list are looked up. Testing against
    # the cache alone sent the manual entries through the lookup and overwrote their countries.
    is_new = ~source_df.MentionSourceName.isin(known_df.MentionSourceName)
    print('****** Shape of df_dict: ', source_df[is_new].shape)
    print('****** Starting Apply!')
    if is_new.any():
        # Finding SourceCountries for new news sources
        source_df.loc[is_new, 'SourceCountry'] = source_df.loc[is_new, 'MentionSourceName'].map(two_way_lookup)
    print('******* End of Apply!')
    # Dropping sources without a country
    source_df = source_df.dropna(subset=['SourceCountry'])

    # Name -> country lookup series
    source_df = source_df.set_index('MentionSourceName')['SourceCountry']

    # Adding 'SourceCountry' column
    df_source['SourceCountry'] = df_source['MentionSourceName'].map(source_df)

    return df_source

def mapping_country23(df, column):
    '''
    Return a copy of `df` in which the FIPS country codes of `column` are replaced by
    ISO 3166-1 alpha-3 codes, using the 'FIPS' and 'ISO3166-1-Alpha-3' columns of
    COUNTRY_CODE_DATA. Values that are not a known FIPS code are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Not modified.
    column : str
        Name of the column holding FIPS codes.

    Returns
    -------
    pandas.DataFrame
    '''
    # Making a copy
    df = df.copy()

    # FIPS -> ISO alpha-3 lookup
    country_code_df = pd.read_csv(COUNTRY_CODE_DATA)
    lookup = country_code_df.set_index('FIPS')['ISO3166-1-Alpha-3']
    # Rows without a FIPS code would turn NaN into a replacement key, so missing country codes
    # in `df` would be rewritten to some unrelated ISO code; repeated FIPS codes would make the
    # replacement ambiguous. Keep one entry per real FIPS code.
    lookup = lookup[lookup.index.notna()]
    lookup = lookup[~lookup.index.duplicated(keep='first')]

    # Replacing
    df[column] = df[column].replace(lookup.to_dict())

    return df


import reverse_geocoder as rg

def ll2cc(df):
    '''
    Reverse-geocode the ('ActionGeo_Lat', 'ActionGeo_Long') pairs of `df` with
    `reverse_geocoder` and append the result as 'City' and 'Country' columns.

    Rows with a missing latitude or longitude are dropped and the index is reset.
    The input frame is not modified.
    '''
    lat_col, lon_col = 'ActionGeo_Lat', 'ActionGeo_Long'

    # Copy, drop rows that cannot be geocoded and renumber so the frames align below
    located = df.copy()
    located = located.dropna(subset=[lat_col, lon_col], how='any').reset_index(drop=True)

    # One (latitude, longitude) tuple per remaining row
    coordinates = list(zip(located[lat_col].astype(float), located[lon_col].astype(float)))

    # 'name' is the nearest place name, 'cc' the country code returned by the geocoder
    places = rg.search(coordinates)
    df_coord = pd.DataFrame({'City': [place['name'] for place in places],
                             'Country': [place['cc'] for place in places]},
                            columns=['City', 'Country'])

    # Side-by-side concatenation on the shared 0..n-1 index
    return pd.concat([located, df_coord], axis=1, join='inner')



def cc2ll(df, df_c2g, resolution='high'):
    '''
    Replace the 'ActionGeo_Lat' / 'ActionGeo_Long' values of `df` with the coordinates found
    in the lookup table `df_c2g`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country' (and 'City' for resolution='high') and 'ActionGeo_CountryCode'.
        Not modified.
    df_c2g : pandas.DataFrame
        Lookup table with the key column(s) plus 'ActionGeo_Lat' and 'ActionGeo_Long'.
        When a key occurs more than once, its first row is used.
    resolution : {'high', 'low'}
        'high' matches on ('City', 'Country'); 'low' matches on 'Country' only, so the result
        is whatever single coordinate the table holds per country.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` (original index kept) with the looked-up coordinates; rows without a
        country code or without a match in `df_c2g` are dropped.

    Raises
    ------
    ValueError
        If `resolution` is neither 'high' nor 'low'.
    '''
    if resolution=='high':
        cc_list = ['City', 'Country']
    elif resolution=='low':
        cc_list = ['Country']
    else:
        raise ValueError("Resolution can be set either 'low' or 'high'!")

    # List of Lat-Lon columns
    ll_list = ['ActionGeo_Lat', 'ActionGeo_Long']
    # Making a copy of dataframe
    df = df.copy()

    # One coordinate per key: with unique keys the left merge below returns exactly one row
    # per row of `df`, in the same order.
    lookup = (df_c2g.loc[:, cc_list+ll_list]
                    .drop_duplicates(subset=cc_list, keep='first')
                    .reset_index(drop=True))
    matched = pd.merge(df.loc[:, cc_list], lookup, how='left', on=cc_list, sort=False)

    # Write back by position. The merge result carries a fresh 0..n-1 index, so a label-aligned
    # assignment would mis-assign (or blank out) rows whenever `df` has any other index.
    for col in ll_list:
        df[col] = matched[col].values

    # Removing rows without a country code or without looked-up coordinates
    df = df.dropna(subset=['ActionGeo_CountryCode','ActionGeo_Lat'])

    return df




def point_plot(df, plot_name='Point_plot', resolution='high'):
    '''
    Draw the share of mentions per location as points on a CartoMidnight tile map and save
    the plot with the HoloViews bokeh renderer under './plots/<plot_name>'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'ActionGeo_CountryCode', 'ActionGeo_Lat' and 'ActionGeo_Long'.
        For resolution='low' the rows are counted per location ('GlobalEventID' is required).
        For resolution='high' an existing 'Mentions' column is used as the count; if the frame
        has none, the rows are counted per location exactly as for 'low'.
        Not modified.
    plot_name : str
        File name (without extension) of the saved plot.
    resolution : {'high', 'low'}
        Selects the aggregation described above and the point opacity (0.7 / 0.8).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If `resolution` is neither 'high' nor 'low'.
    '''
    # Validate first so that no work is done for an invalid request
    if resolution not in ('high', 'low'):
        raise ValueError('Resolution can only gets "low" or "high" as values!')

    # Making a copy
    df = df.copy()

    if resolution == 'low' or 'Mentions' not in df.columns:
        # Number of mentions per country code and coordinate
        df = df.groupby(by=['ActionGeo_CountryCode', 'ActionGeo_Lat', 'ActionGeo_Long'])['GlobalEventID']\
               .count().reset_index(name='Mentions')

    # Getting percentage of mentions for each country/city
    df['Mentions'] = df['Mentions']*100.0/df['Mentions'].sum()

    # Preparing dataframe for plotting
    mentions_ds = gv.Dataset(df[['ActionGeo_Long', 'ActionGeo_Lat','Mentions', 'ActionGeo_CountryCode']])
    points = mentions_ds.to(gv.Points, ['ActionGeo_Long', 'ActionGeo_Lat'], ['Mentions', 'ActionGeo_CountryCode'])
    # Plotting with Hover tool
    hover = HoverTool(tooltips=[("Country", "@ActionGeo_CountryCode"),
                                ('Percentage of Mentions', '@Mentions{0.1f} %')])
    # The two resolutions differ only in point opacity
    alpha = 0.7 if resolution == 'high' else 0.8
    p = (gts.CartoMidnight * points.options(width=900, height=500, tools=[hover], size_index=2, size=10,
                                            alpha=alpha, color=palette[2], cmap='YlOrBr'))

    # Using renderer save
    renderer = hv.renderer('bokeh')
    renderer.save(p, './plots/'+plot_name)

    return None



## 0. A typical path for going from raw datasets to plots

In [5]:
# Loading 'df_merged_source'
df_merged = load_clean(1)

df_merged.head()

Unnamed: 0,GlobalEventID,Day,FractionDate,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,GoldsteinScale,NumMentions,IsRootEvent,AvgTone,MentionTimeDate,MentionSourceName,Confidence,MentionDocTone
0,410412347,20140218,2014.1315,SF,-30.3098,25.2971,2.8,5,0,-4.477612,20150218230000,dailymaverick.co.za,50.0,-4.477612
1,410412348,20140218,2014.1315,IN,12.9833,77.5833,1.9,5,1,2.078522,20150218230000,indiatimes.com,50.0,2.078522
2,410412349,20140218,2014.1315,AS,-36.0667,146.483,1.9,1,1,7.517084,20150218230000,voxy.co.nz,10.0,7.517084
3,410412350,20140218,2014.1315,NZ,-41.0,174.0,1.9,2,1,7.517084,20150218230000,voxy.co.nz,20.0,7.517084
4,410412351,20140218,2014.1315,US,44.2394,-114.51,1.9,10,1,0.0,20150218230000,eastidahonews.com,100.0,0.0


In [None]:
df_mergedS = add_sourceCountry(df_merged, 'data_SourceName')

df_mergedS.head()

In [11]:
df_mergedS = mapping_country23(df_mergedS, column='ActionGeo_CountryCode')

df_mergedS.head()

Unnamed: 0,GlobalEventID,Day,FractionDate,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,GoldsteinScale,NumMentions,IsRootEvent,AvgTone,MentionTimeDate,MentionSourceName,Confidence,MentionDocTone,SourceCountry
0,410412347,20140218,2014.1315,ZAF,-30.3098,25.2971,2.8,5,0,-4.477612,20150218230000,dailymaverick.co.za,50.0,-4.477612,ZAF
1,410412348,20140218,2014.1315,IND,12.9833,77.5833,1.9,5,1,2.078522,20150218230000,indiatimes.com,50.0,2.078522,IND
2,410412349,20140218,2014.1315,AUS,-36.0667,146.483,1.9,1,1,7.517084,20150218230000,voxy.co.nz,10.0,7.517084,NZL
3,410412350,20140218,2014.1315,NZL,-41.0,174.0,1.9,2,1,7.517084,20150218230000,voxy.co.nz,20.0,7.517084,NZL
4,410412351,20140218,2014.1315,USA,44.2394,-114.51,1.9,10,1,0.0,20150218230000,eastidahonews.com,100.0,0.0,USA


In [18]:
df_mergedP = ll2cc(df_mergedS)

df_mergedP.head()

Unnamed: 0,GlobalEventID,Day,FractionDate,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,GoldsteinScale,NumMentions,IsRootEvent,AvgTone,MentionTimeDate,MentionSourceName,Confidence,MentionDocTone,SourceCountry,City,Country
0,410412347,20140218,2014.1315,ZAF,-30.3098,25.2971,2.8,5,0,-4.477612,20150218230000,dailymaverick.co.za,50.0,-4.477612,ZAF,Colesberg,ZA
1,410412348,20140218,2014.1315,IND,12.9833,77.5833,1.9,5,1,2.078522,20150218230000,indiatimes.com,50.0,2.078522,IND,Bangalore,IN
2,410412349,20140218,2014.1315,AUS,-36.0667,146.483,1.9,1,1,7.517084,20150218230000,voxy.co.nz,10.0,7.517084,NZL,Corowa,AU
3,410412350,20140218,2014.1315,NZL,-41.0,174.0,1.9,2,1,7.517084,20150218230000,voxy.co.nz,20.0,7.517084,NZL,Picton,NZ
4,410412351,20140218,2014.1315,USA,44.2394,-114.51,1.9,10,1,0.0,20150218230000,eastidahonews.com,100.0,0.0,USA,Challis,US


In [65]:
# cc2ll() takes the coordinate lookup table as its second positional argument; calling it
# without one raises a TypeError.
# NOTE(review): for resolution='low', `df_c2g` should hold one row per 'Country' (the
# per-country table) and must be loaded before this cell runs — confirm.
df_mergedP2 = cc2ll(df_mergedP, df_c2g, resolution='low')

df_mergedP2.head()

Unnamed: 0,GlobalEventID,Day,FractionDate,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,GoldsteinScale,NumMentions,IsRootEvent,AvgTone,MentionTimeDate,MentionSourceName,Confidence,MentionDocTone,SourceCountry,City,Country
0,410412347,20140218,2014.1315,ZAF,-30.559482,22.937506,2.8,5,0,-4.477612,20150218230000,dailymaverick.co.za,50.0,-4.477612,ZAF,Colesberg,ZA
1,410412348,20140218,2014.1315,IND,20.593684,78.96288,1.9,5,1,2.078522,20150218230000,indiatimes.com,50.0,2.078522,IND,Bangalore,IN
2,410412349,20140218,2014.1315,AUS,-25.274398,133.775136,1.9,1,1,7.517084,20150218230000,voxy.co.nz,10.0,7.517084,NZL,Corowa,AU
3,410412350,20140218,2014.1315,NZL,-40.900557,174.885971,1.9,2,1,7.517084,20150218230000,voxy.co.nz,20.0,7.517084,NZL,Picton,NZ
4,410412351,20140218,2014.1315,USA,37.09024,-95.712891,1.9,10,1,0.0,20150218230000,eastidahonews.com,100.0,0.0,USA,Challis,US


In [49]:
df_mergedP2.dropna(subset=['MentionSourceName']).shape

(18920883, 17)

In [None]:
plot_name = 'plot_low'
point_plot(df_mergedP2, plot_name=plot_name, resolution='low')

# Showing the plot
from IPython.display import HTML
HTML(filename='./plots/'+plot_name+'.html')

## 1. Making all 'export_df' and 'mentions_df' lite

In [4]:
# Paths used by the processing functions of this notebook.
# `data_read` / `data_save` are read as globals by make_exlite(), make_menlite(), load_clean(),
# add_sourceCountry(), sourceCount() and df4pointPlot(), so this cell has to run before them.
# NOTE(review): the two F:/ paths are machine-specific absolute paths — adjust for your setup.
DATA_PATH = "data/"
data = 'data/DataFrame/'
data_read = 'F:/ADA2018/data/'
data_save = 'F:/ADA2018/data/'

In [27]:
p_start = 1
p_end = 18
# Read each compressed monthly file, reduce it to its "lite" version and save it again.
# NOTE(review): range(p_start, p_end) stops at p_end - 1 (files 1..17), whereas the other loops
# in this notebook iterate up to and including p_end — confirm which is intended.
for i in range(p_start, p_end):
    make_exlite(i)
    make_menlite(i)

## 2. Preparing Merged Dataframes

In [5]:
def clean_merge(month_n, resolution='high', usecols_e=None, usecols_m=None):
    '''
    Build the cleaned, merged dataframe of one month.

    Pipeline: load_clean -> add_sourceCountry -> mapping_country23 -> ll2cc -> cc2ll,
    followed by a selection of the relevant columns. The coordinate lookup table is taken
    from the notebook-level global `df_c2g`.

    Raises ValueError when `resolution` is neither 'high' nor 'low'.
    '''
    # Guard clause: reject unknown resolutions before any file is read
    if not (resolution=='high' or resolution=='low'):
        raise ValueError('Resolution can only gets "high" or "low" as values!')

    # Export + mentions of the month, joined on the event id
    merged = load_clean(month_n, usecols_e=usecols_e, usecols_m=usecols_m)
    # Country of each news source
    merged = add_sourceCountry(merged, 'data_SourceCountry')
    # 2-letter country codes -> 3-letter codes
    merged = mapping_country23(merged, column='ActionGeo_CountryCode')
    # Snap the noisy Lat-Lon values: coordinates -> (City, Country) -> table coordinates
    merged = ll2cc(merged)
    merged = cc2ll(merged, df_c2g, resolution=resolution)

    # Only the relevant columns are returned
    relevant = ['GlobalEventID','Day', 'ActionGeo_CountryCode', 'ActionGeo_Lat', 'ActionGeo_Long',
                'GoldsteinScale','NumMentions','AvgTone', 'MentionSourceName','Confidence',
                'MentionDocTone','SourceCountry']
    return merged[relevant]


In [6]:
##### Loading df_country2geocode for high resolution
# Coordinate lookup table used by cc2ll() / clean_merge(): one row per ('Country', 'City').
df_c2g = pd.read_csv(DATA_PATH + 'data_country2geo.csv', index_col=0, header=0)
# Renaming the columns to the names cc2ll() merges on
df_c2g.columns = ['Country','City', 'AccentCity', 'ActionGeo_Lat', 'ActionGeo_Long']
# Keeping the first row of every ('Country', 'City') pair
df_c2g = df_c2g.drop_duplicates(subset=['Country', 'City'], keep='first')

##### Loading df_country2geocode for low resolution
# The string literal below is disabled code (it is never executed): it loads the per-country
# table instead of the per-city table above.
'''# Loading 'data_country2geo'
df_c2g = pd.read_csv(DATA_PATH + 'countries_lat-lon.txt', sep='\t',index_col=None, header=0)
df_c2g.columns = ['Country', 'ActionGeo_Lat', 'ActionGeo_Long', 'Name']
# Dropping duplicates
df_c2g = df_c2g.drop_duplicates(subset=['Country'], keep='first')'''

df_c2g.head()

Unnamed: 0_level_0,Country,City,AccentCity,ActionGeo_Lat,ActionGeo_Long
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2986043,AD,Pic de Font Blanca,Pic de Font Blanca,42.64991,1.53335
2994701,AD,Roc Mele,Roc Mélé,42.58765,1.74028
3007683,AD,Pic des Langounelles,Pic des Langounelles,42.61203,1.47364
3017832,AD,Pic de les Abelletes,Pic de les Abelletes,42.52535,1.73343
3017833,AD,Estany de les Abelletes,Estany de les Abelletes,42.52915,1.73362


In [None]:
p_start = 1
p_end = 23
# Columns to read from export_df and mentions_df.
# These re-assign the notebook-level `ex_keep` / `men_keep` lists with the same values.
ex_keep = ['GlobalEventID', 'Day', 'ActionGeo_CountryCode', 'ActionGeo_Lat', 
           'ActionGeo_Long','GoldsteinScale', 'NumMentions', 'AvgTone']
men_keep = ['GlobalEventId', 'MentionSourceName', 'Confidence', 'MentionDocTone']

# Build and save the merged dataframe of every month from p_start to p_end (inclusive);
# clean_merge() runs with its default resolution ('high').
for i in range(p_start, p_end+1):
    print('***** Month {} has started!'.format(i))
    df_merged = clean_merge(i, usecols_e=ex_keep, usecols_m=men_keep)
    # Saving the df_merged
    df_merged.to_csv(data_save+'mergedV2_'+str(i)+'.csv', index=False, encoding='utf-8')

In [19]:
df_merged.head()

Unnamed: 0,GlobalEventID,Day,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,GoldsteinScale,NumMentions,AvgTone,MentionSourceName,Confidence,MentionDocTone,SourceCountry
0,418894570,20140321,GBR,54.64305,-6.74595,-2.0,6,-7.109802,independent.ie,60.0,-7.109802,IRL
1,418894570,20140321,GBR,54.64305,-6.74595,-2.0,6,-7.109802,independent.ie,60.0,-7.109802,IRL
2,418894570,20140321,GBR,54.64305,-6.74595,-2.0,6,-7.109802,independent.ie,60.0,-7.109802,IRL
3,418894570,20140321,GBR,54.64305,-6.74595,-2.0,6,-7.109802,independent.ie,60.0,-7.109802,IRL
4,418894570,20140321,GBR,54.64305,-6.74595,-2.0,6,-7.109802,independent.ie,60.0,-7.109802,IRL


## 2. Creating 'MentionSourceName' occurrence counts and their 'SourceCountry'

In [9]:
def sourceCount(df_sources, month_n):
    '''
    Add the number of mentions per news source found in one monthly mentions file to the
    running totals in `df_sources`.

    Reads data_read + 'mentions_<month_n>.csv.gz' (`data_read` is a notebook-level global).

    Parameters
    ----------
    df_sources : pandas.DataFrame
        Running totals with the columns 'MentionSourceName' and 'Count' (may be empty).
        Not modified.
    month_n : int
        Number that appears in the file name.

    Returns
    -------
    pandas.DataFrame
        Columns 'MentionSourceName' and 'Count' (numeric); sources seen for the first time
        are appended.
    '''
    # Creating a copy of dataframe
    df_sources = df_sources.copy()

    # Read both columns as ordinary columns. With `usecols`, an integer `index_col` is resolved
    # against the selected columns, so index_col=0 moved one of the two requested columns into
    # the index and the groupby below could no longer find it.
    mentions_df = pd.read_csv(data_read+'mentions_'+str(month_n)+'.csv.gz', compression='gzip',
                              index_col=None, header=0, usecols=['MentionSourceName', 'GlobalEventId'],
                              low_memory=False)

    # Counting the occurance of every source in this month
    month_counts = mentions_df.groupby(by=['MentionSourceName'])['GlobalEventId'].count()\
                              .reset_index(name='Occurance')
    # Outer merge so that sources seen for the first time are added
    df_sources = pd.merge(df_sources, month_counts, how='outer', on=['MentionSourceName'], sort=False)
    # Missing counts mean zero. Only the two count columns are filled (not the source names),
    # and 'Count' is made numeric because it is object-typed when the totals start out empty.
    df_sources['Count'] = pd.to_numeric(df_sources['Count']).fillna(0) + df_sources['Occurance'].fillna(0)
    # Removing 'Occurance' column
    df_sources = df_sources[['MentionSourceName', 'Count']]

    return df_sources

def occuranceSource(df_sources=None, p_start=1, p_end=1):
    '''
    Accumulate the mention count of every 'MentionSourceName' over the months
    p_start..p_end (both inclusive) and return the totals, largest first.

    `df_sources` optionally provides existing totals ('MentionSourceName', 'Count')
    to continue from; an empty table is used when it is None.
    '''
    # Start from an empty tally unless previous totals were handed in
    totals = pd.DataFrame(columns=['MentionSourceName', 'Count']) if df_sources is None else df_sources

    # Fold every month of the period into the tally
    for month in range(p_start, p_end+1):
        totals = sourceCount(totals, month)

    # Most frequently mentioned sources first
    return totals.sort_values(by=['Count'], ascending=False)

In [None]:
# Computing MentionSourceName occurance
df_sources = occuranceSource(p_start=1, p_end=3)
# Finding their 'SourceCountry'
# NOTE(review): other cells pass 'data_SourceName' / 'data_SourceCountry'; the spelling here only
# resolves to the same file on a case-insensitive file system — confirm the real file name.
df_sources = add_sourceCountry(df_sources, 'data_sourceName')

# Saving the result ('utf-8' is the canonical codec name; it was spelled 'utf=8' before)
df_sources.to_csv(data_save+'data_sourceOccurance.csv', encoding='utf-8', index=False)

df_sources.head()

## 3. Preparing dataframes for Point plotting

In [16]:

def df4pointPlot(p_start=1, p_end=1):
    '''
    Count the mentions per (country code, source, source country, latitude, longitude) for
    every month from p_start to p_end (inclusive) and return them in one frame.

    Reads data_read + 'merged_<i>.csv' for every month i (`data_read` is a notebook-level
    global).
    NOTE(review): the merge loop in this notebook writes 'mergedV2_<i>.csv' — confirm that
    'merged_<i>.csv' is the intended input.

    Returns
    -------
    pandas.DataFrame
        Columns 'ActionGeo_CountryCode', 'MentionSourceName', 'Month', 'SourceCountry',
        'ActionGeo_Lat', 'ActionGeo_Long', 'Mentions', sorted by 'Mentions' (descending).
        Empty (same columns) when the month range is empty.
    '''
    month_keys = ['ActionGeo_CountryCode', 'MentionSourceName', 'SourceCountry',
                  'ActionGeo_Lat', 'ActionGeo_Long']
    result_keys = ['ActionGeo_CountryCode', 'MentionSourceName', 'Month', 'SourceCountry',
                   'ActionGeo_Lat', 'ActionGeo_Long']

    # Collect the monthly frames and concatenate once at the end; growing a frame inside the
    # loop re-copies all previous months on every iteration.
    monthly = []
    for i in range(p_start, p_end+1):
        print('***** Month {} has started!'.format(i))
        df_merged = pd.read_csv(data_read+'merged_'+str(i)+'.csv', index_col=None, low_memory=False)
        # Number of mentions per location and source in this month
        df_merged = df_merged.groupby(by=month_keys)['GlobalEventID']\
                             .count().reset_index(name='Mentions')
        df_merged['Month'] = i
        monthly.append(df_merged)

    if not monthly:
        # Empty period: return an empty frame instead of failing on the missing 'Mentions' column
        return pd.DataFrame(columns=result_keys + ['Mentions'])

    df_plot = pd.concat(monthly, axis=0, join='outer', ignore_index=True, sort=False)
    # Summing up number of mentions per location, source and month, largest first
    df_plot = df_plot.groupby(by=result_keys)['Mentions']\
                     .sum().reset_index(name='Mentions').sort_values(by=['Mentions'], ascending=False)
    return df_plot
    

In [None]:
# Computing 'MentionSourceName' mentions for each 'ActionGeo_CountryCode', 'ActionGeo_Lat', and 'ActionGeo_Long'
df_pplot = df4pointPlot(p_start=1, p_end=19)

# Saving the Result ('utf-8' is the canonical codec name; it was spelled 'utf=8' before)
df_pplot.to_csv(data_save+'data_pointPlot.csv', encoding='utf-8', index=False)

df_pplot.head()

### Plotting for animation

In [39]:
# Collapse the per-source rows: total mentions per country code, month and coordinate,
# largest first.
# NOTE: this overwrites `df_pplot`, so the per-source columns ('MentionSourceName',
# 'SourceCountry') are no longer available after this cell has run.
df_pplot = df_pplot.groupby(by=['ActionGeo_CountryCode', 'Month', 'ActionGeo_Lat','ActionGeo_Long'])\
                   .agg({'Mentions':'sum'}).reset_index().sort_values(by=['Mentions'], ascending=False)
print(df_pplot.shape)
df_pplot.head()

(95364, 5)


Unnamed: 0,ActionGeo_CountryCode,Month,ActionGeo_Lat,ActionGeo_Long,Mentions
14122,FRA,10,45.59693,5.76591,1013476.0
71776,USA,10,38.89511,-77.03637,683568.0
18192,GRC,15,39.35358,21.09294,680762.0
44186,SYR,10,34.5624,38.28402,678447.0
24700,IRN,12,35.8044,51.4256,676045.0


In [42]:
# `df_plot` is the frame read (as a global) by the animation callback `sine` below.
df_plot = df_pplot.copy()
# Turn the counts into percentages of the grand total over all months and locations
# (not per month).
df_plot.Mentions = df_plot.Mentions*100.0/df_plot.Mentions.sum()

In [47]:
import geoviews.tile_sources as gts
from bokeh.io import show
from bokeh.layouts import layout
from bokeh.models import Slider, Button
from bokeh.palettes import YlOrBr3 as palette
from bokeh.plotting import show, output_notebook, curdoc, output_file

renderer = hv.renderer('bokeh').instance(mode='server')

# Create the holoviews app again
def sine(phase):
    '''
    DynamicMap callback: build the point map for the month given by `phase`.

    Selects the rows of the notebook-level `df_plot` whose 'Month' equals `phase` and
    returns them as geoviews points over the CartoMidnight tiles.
    '''
    plot_columns = ['ActionGeo_Long', 'ActionGeo_Lat', 'Mentions', 'ActionGeo_CountryCode']
    # Rows of the requested month only
    month_rows = df_plot[df_plot['Month']==phase][plot_columns]

    # Longitude/latitude are the key dimensions; mentions and country code ride along as values
    dataset = gv.Dataset(month_rows)
    points = dataset.to(gv.Points, ['ActionGeo_Long','ActionGeo_Lat'], ['Mentions', 'ActionGeo_CountryCode'])

    # Tooltip shown when hovering over a point
    hover = HoverTool(tooltips=[("Country", "@ActionGeo_CountryCode"), ('Percentage of Mentions', '@Mentions{0.1f} %')])
    styled_points = points.options(width=900, height=500, tools=[hover], size_index=2, size=22,
                                   alpha=0.8, color=palette[2],cmap='YlOrBr')
    return gts.CartoMidnight * styled_points

# Stream with a single parameter `phase` (initial value 1.0); modify_doc() pushes the slider
# value into it with stream.event(phase=...).
stream = hv.streams.Stream.define('Phase', phase=1.)()
# Each stream event calls sine(phase) again to rebuild the plot for the selected month.
dmap = hv.DynamicMap(sine, streams=[stream]) #Here we call sine to update plot with the stream (defined above)

# Define valid function for FunctionHandler
# when deploying as script, simply attach to curdoc
def modify_doc(doc):
    """Populate a Bokeh document with the animated map and its controls.

    Renders the global ``dmap`` with the server-mode ``renderer``, adds a month
    slider and a Play/Pause button, and attaches the resulting layout to ``doc``.

    Parameters
    ----------
    doc : Bokeh document
        The document to populate (handed over by ``show(modify_doc, ...)``).

    Returns
    -------
    The same ``doc``, with the layout added as a root.
    """
    # Create HoloViews plot and attach the document
    hvplot = renderer.get_plot(dmap, doc)

    # Range of the month slider
    start, end = 1, 19

    # Handle of the running periodic callback, None while paused. It lives in
    # this closure (see `nonlocal` below) so that each document keeps its own
    # handle instead of sharing one module-level global.
    callback_id = None

    def animate_update():
        # Advance the slider by one month, wrapping around after the last one
        month = slider.value + 1.0
        if month > end:
            month = start
        slider.value = month

    def slider_update(attrname, old, new):
        # Notify the HoloViews stream of the slider update
        stream.event(phase=new)

    slider = Slider(start=start, end=end, value=start, step=1, title="Months (Base Month: February 2015)")
    slider.on_change('value', slider_update)

    def animate():
        # Toggle between playing (periodic slider updates) and paused
        nonlocal callback_id
        if button.label == '► Play':
            button.label = '❚❚ Pause'
            callback_id = doc.add_periodic_callback(animate_update, 500) #This number change the time between each update
        else:
            button.label = '► Play'
            if callback_id is not None:
                doc.remove_periodic_callback(callback_id)
                callback_id = None

    button = Button(label='► Play', width=60)
    button.on_click(animate)

    # Combine the holoviews plot and widgets in a layout
    plot = layout([
    [hvplot.state],
    [slider, button]], sizing_mode='fixed')

    doc.add_root(plot)
    return doc

# To display in the notebook
#output_file("graph.html")
# modify_doc is passed as a callable, so the app is built on the document that show() provides.
# NOTE(review): notebook_url is hard-coded to localhost:8888 - presumably it must match the
# address the notebook is actually served from; confirm before running elsewhere.
show(modify_doc, notebook_url='localhost:8888')

# To display in a script
#doc = modify_doc(curdoc())

from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

# Convert to bokeh figure then save using bokeh
#plot = renderer.get_plot(doc).state

#show()
#html = file_html(doc, CDN, "my plot")

INFO:bokeh.server.server:Starting Bokeh server version 0.13.0 (running on Tornado 5.1.1)


INFO:tornado.access:200 GET /autoload.js?bokeh-autoload-element=2d964d02-6172-4a86-9ccc-56e658f1f972&bokeh-absolute-url=http://localhost:58550&resources=none (::1) 79.83ms
INFO:tornado.access:101 GET /ws?bokeh-protocol-version=1.0&bokeh-session-id=N4wav1L20XqfXC81ctAkTTfgDNzTvkOzpWgHRnHtYtGF (::1) 1.02ms
INFO:bokeh.server.views.ws:WebSocket connection opened
INFO:bokeh.server.views.ws:ServerConnection created


## Plotting the aggregation over 24 months

In [48]:
# Loading the 'data_pointPlot' CSV from the data_save directory.
# This overwrites the df_pplot produced by the earlier cells.
df_pplot = pd.read_csv(data_save+'data_pointPlot.csv', index_col=None, low_memory=False)

df_pplot.head()

Unnamed: 0,ActionGeo_CountryCode,MentionSourceName,Month,SourceCountry,ActionGeo_Lat,ActionGeo_Long,Mentions
0,IND,firstpost.com,1,IND,28.65195,77.23149,79818.0
1,IND,firstpost.com,1,IND,27.82742,81.76565,74397.0
2,ISR,english.wafa.ps,13,PSE,31.52935,35.0938,56552.0
3,USA,wickedlocal.com,12,USA,30.22936,-90.88899,41733.0
4,ISR,english.wafa.ps,13,PSE,31.60998,34.76422,39279.0


In [49]:
# Keep only the rows whose source country is the USA
df_pplot = df_pplot.loc[df_pplot['SourceCountry'] == 'USA']


In [50]:
# Total mentions per (country, latitude, longitude) over all months, largest totals first
df_pplot = (
    df_pplot
    .groupby(['ActionGeo_CountryCode', 'ActionGeo_Lat', 'ActionGeo_Long'])
    .agg({'Mentions': 'sum'})
    .reset_index()
    .sort_values('Mentions', ascending=False)
)
print(df_pplot.shape)
df_pplot.head()

(23531, 4)


Unnamed: 0,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,Mentions
19656,USA,38.89511,-77.03637,7221796.0
17115,USA,33.91373,-82.67403,3750764.0
20428,USA,39.87755,-89.60093,2811558.0
13347,SYR,34.5624,38.28402,2635422.0
15614,USA,31.08351,-97.65974,2103483.0


In [55]:
# Render the aggregated mentions with point_plot (defined earlier in the notebook)
# under the name 'plot_high', requesting the 'high' resolution.
plot_name = 'plot_high'
point_plot(df_pplot, plot_name=plot_name, resolution='high')

# Showing the plot: embed ./plots/<plot_name>.html in the notebook
# (presumably the file point_plot has just written - confirm against point_plot)
from IPython.display import HTML
HTML(filename='./plots/'+plot_name+'.html')

## 4. Creating dataframe for Plotting the Bias

In [96]:
%%opts Polygons (cmap='Spectral')
from bokeh.models import HoverTool

def bias_plot(df_plot, plot_name='Bias_plot'):
    """Save a world choropleth of the average bias per country.

    Parameters
    ----------
    df_plot : pandas.DataFrame
        Needs the columns 'ActionGeo_CountryCode' (matched against the
        'iso_a3' codes of the Natural Earth dataset) and 'AvgBias'. It may
        hold several rows per country (e.g. one per location); they are
        combined with an unweighted mean so that each country gets a single,
        well-defined value.
    plot_name : str
        Name of the output, handed to the renderer as './plots/<plot_name>'
        (the notebook later loads './plots/<plot_name>.html').

    Returns
    -------
    None
    """
    # Loading the geolocations of different countries
    path = geopandas.datasets.get_path('naturalearth_lowres')
    world = geopandas.read_file(path)
    # Dropping unnecessary columns
    world = world.drop(columns=['pop_est', 'gdp_md_est'])

    # One bias value per country code; non-numeric entries become NaN and are
    # ignored by the mean
    bias_values = pd.to_numeric(df_plot['AvgBias'], errors='coerce')
    country_bias = bias_values.groupby(df_plot['ActionGeo_CountryCode']).mean()

    # Countries without any row in df_plot are shown with a bias of 0;
    # countries whose bias could not be computed stay NaN and are dropped below
    has_data = world['iso_a3'].isin(country_bias.index)
    world['AvgBias'] = world['iso_a3'].map(country_bias).where(has_data, 0)
    # droping NAs
    world = world.dropna()

    # Plotting with Hover tool
    hover = HoverTool(tooltips=[("Country", "@iso_a3"), ('Average Bias', '@AvgBias')])

    plot_opts = dict(tools=[hover], width=900, height=600, color_index='AvgBias',colorbar=True, toolbar='above', xaxis=None, yaxis=None, cmap='Spectral')
    p = gv.Polygons(world, vdims=['name', 'AvgBias'], label='Average Bias for U.S. news media around the world').opts(plot=plot_opts).redim.range(Latitude=(-60, 90))
    renderer = hv.renderer('bokeh')
    # Using renderer save
    renderer.save(p, './plots/'+plot_name)

    return None


In [34]:
def df4biasPlot(p_start=1, p_end=1):
    """Compute the average bias per country, source, month and location.

    For every month ``i`` in ``p_start..p_end`` (inclusive) the file
    ``data_read + 'merged_<i>.csv'`` is loaded, the bias of each mention is
    computed as ``MentionDocTone - AvgTone`` and averaged per
    (country, source, latitude, longitude).

    Parameters
    ----------
    p_start : int
        First month number to load.
    p_end : int
        Last month number to load (inclusive).

    Returns
    -------
    pandas.DataFrame
        Columns 'ActionGeo_CountryCode', 'MentionSourceName', 'Month',
        'ActionGeo_Lat', 'ActionGeo_Long', 'AvgBias' (count-weighted average
        bias) and 'Count' (number of mention rows). Empty, with the same
        columns, when the month range is empty.
    """
    location_keys = ['ActionGeo_CountryCode', 'MentionSourceName',
                     'ActionGeo_Lat', 'ActionGeo_Long']
    result_keys = ['ActionGeo_CountryCode', 'MentionSourceName', 'Month',
                   'ActionGeo_Lat', 'ActionGeo_Long']

    # Collect the per-month summaries and concatenate once at the end instead
    # of growing a dataframe inside the loop
    monthly_frames = []
    for i in range(p_start, p_end+1):
        # Loading the merged and cleaned dataframe for month i
        df_merged = pd.read_csv(data_read+'merged_'+str(i)+'.csv', index_col=None, low_memory=False)
        # Making Bias column
        df_merged['Bias'] = df_merged['MentionDocTone']-df_merged['AvgTone']
        # Mean bias and number of rows per country/source/location
        df_month = (df_merged.groupby(by=location_keys)['Bias']
                             .agg(['mean', 'size'])
                             .rename(columns={'mean': 'AvgBias', 'size': 'Count'})
                             .reset_index())
        # Adding new column month
        df_month['Month'] = i
        monthly_frames.append(df_month)

    if not monthly_frames:
        return pd.DataFrame(columns=result_keys + ['AvgBias', 'Count'])

    df_plot = pd.concat(monthly_frames, axis=0, ignore_index=True)

    # Count-weighted average per group, computed from two vectorized sums:
    # sum(AvgBias * Count) / sum(Count)
    df_plot['WeightedBias'] = df_plot['AvgBias']*df_plot['Count']
    df_sums = df_plot.groupby(by=result_keys)[['WeightedBias', 'Count']].sum()
    df_sums['AvgBias'] = df_sums['WeightedBias']/df_sums['Count']

    return df_sums[['AvgBias', 'Count']].reset_index()

In [None]:
# Load a previously saved 'data_biasPlot.csv' from the 'processed/' sub-folder of data_save.
# NOTE(review): the cell that saves this file writes it directly into data_save (without
# 'processed/'), and a later cell reads it from data_read - confirm which location is current.
df_bplot = pd.read_csv(data_save+'processed/'+'data_biasPlot.csv', index_col=None, low_memory=False)

df_bplot.head()

In [None]:
# Computing 'MentionSourceName' average bias for each 'ActionGeo_CountryCode', 'ActionGeo_Lat', and 'ActionGeo_Long'
# over months 1 to 19
df_bplot = df4biasPlot(p_start=1, p_end=19)

# Saving the Result (standard spelling of the encoding name)
df_bplot.to_csv(data_save+'data_biasPlot.csv', encoding='utf-8', index=False)

df_bplot.head()

In [59]:
# Loading the 'data_biasPlot' CSV from the data_read directory
# NOTE(review): the saving cell writes this file under data_save - confirm both point to the same place.
df_bplot = pd.read_csv(data_read+'data_biasPlot.csv', index_col=None, low_memory=False)

print(df_bplot.shape)
df_bplot.head()

(1807993, 7)


Unnamed: 0,ActionGeo_CountryCode,MentionSourceName,Month,ActionGeo_Lat,ActionGeo_Long,AvgBias,Count
0,ABW,13wham.com,14,12.51627,-69.96321,-0.095238,29.0
1,ABW,1stopnews.com,7,12.51627,-69.96321,0.0,29.0
2,ABW,24ora.com,6,12.52398,-70.02703,0.0,28.0
3,ABW,abc3340.com,4,12.51627,-69.96321,0.0,29.0
4,ABW,abc3340.com,14,12.51627,-69.96321,-0.095238,29.0


In [61]:
# Adding Source Country via add_sourceCountry (defined earlier in the notebook).
# Judging by the output below it appends a 'SourceCountry' column, which can be empty
# for some sources; the meaning of the 'data_SourceCountry' argument is not visible
# here - presumably the name of a lookup file; verify against the helper.
df_bplot = add_sourceCountry(df_bplot, 'data_SourceCountry')

df_bplot.head()

**** source_df shape:  (28673, 2)
****** Shape of df_dict:  (0, 2)
****** Starting Apply!
******* End of Apply!


Unnamed: 0,ActionGeo_CountryCode,MentionSourceName,Month,ActionGeo_Lat,ActionGeo_Long,AvgBias,Count,SourceCountry
0,ABW,13wham.com,14,12.51627,-69.96321,-0.095238,29.0,USA
1,ABW,1stopnews.com,7,12.51627,-69.96321,0.0,29.0,USA
2,ABW,24ora.com,6,12.52398,-70.02703,0.0,28.0,
3,ABW,abc3340.com,4,12.51627,-69.96321,0.0,29.0,USA
4,ABW,abc3340.com,14,12.51627,-69.96321,-0.095238,29.0,USA


In [62]:
# Keep only the rows whose source country is the USA
df_bplot = df_bplot.loc[df_bplot['SourceCountry'] == 'USA']

print(df_bplot.shape)
df_bplot.head()

(1119214, 8)


Unnamed: 0,ActionGeo_CountryCode,MentionSourceName,Month,ActionGeo_Lat,ActionGeo_Long,AvgBias,Count,SourceCountry
0,ABW,13wham.com,14,12.51627,-69.96321,-0.095238,29.0,USA
1,ABW,1stopnews.com,7,12.51627,-69.96321,0.0,29.0,USA
3,ABW,abc3340.com,4,12.51627,-69.96321,0.0,29.0,USA
4,ABW,abc3340.com,14,12.51627,-69.96321,-0.095238,29.0,USA
7,ABW,abcnews4.com,14,12.51627,-69.96321,-0.095238,29.0,USA


In [63]:
def get_weightedAvg(x):
    '''
    Return the average of column 'AvgBias' of dataframe 'x', weighted by
    column 'Count': sum(AvgBias * Count) / sum(Count).
    Dataframe 'x' must contain the two columns ['AvgBias', 'Count'].
    '''
    counts = x['Count']
    weighted_total = x['AvgBias'].mul(counts).sum()
    return weighted_total / counts.sum()

In [64]:
# Collapse the months: one group per (country, source, latitude, longitude)
df_grouped = df_bplot.groupby(by=['ActionGeo_CountryCode', 'MentionSourceName',
                              'ActionGeo_Lat', 'ActionGeo_Long'])
# Calculating the weighted average bias (weights = 'Count') of every group;
# reset_index turns the result into a 0..n-1 indexed Series
df_bias= df_grouped.apply(lambda x: get_weightedAvg(x)).reset_index(name='AvgBias')['AvgBias']

# Reducing the df_plot: build a frame with one row per group. The 'count' aggregation only
# creates a placeholder 'AvgBias' column, which is overwritten on the next statement.
df_bplot = df_grouped.agg({'ActionGeo_CountryCode':'count'}).rename(columns={'ActionGeo_CountryCode':'AvgBias'}).reset_index()
# Assigning values to associated columns.
# This relies on both results coming from the same groupby and both indexes having
# been reset, so the rows are matched by their 0..n-1 position.
df_bplot['AvgBias'] = df_bias

df_bplot.head()

Unnamed: 0,ActionGeo_CountryCode,MentionSourceName,ActionGeo_Lat,ActionGeo_Long,AvgBias
0,ABW,13wham.com,12.51627,-69.96321,-0.095238
1,ABW,1stopnews.com,12.51627,-69.96321,0.0
2,ABW,abc3340.com,12.51627,-69.96321,-0.047619
3,ABW,abcnews4.com,12.51627,-69.96321,-0.095238
4,ABW,antiguaobserver.com,12.51627,-69.96321,0.0


In [66]:
# Averaging over the sources: mean bias per (country, latitude, longitude), highest bias first
df_bplot = (
    df_bplot
    .groupby(['ActionGeo_CountryCode', 'ActionGeo_Lat', 'ActionGeo_Long'])
    .agg({'AvgBias': 'mean'})
    .reset_index()
    .sort_values('AvgBias', ascending=False)
)

df_bplot.head()

Unnamed: 0,ActionGeo_CountryCode,ActionGeo_Lat,ActionGeo_Long,AvgBias
12658,RUS,52.81667,79.33333,8.057298
2502,CHN,27.24833,111.73222,6.096059
4518,EST,57.77781,26.0473,4.882763
9960,MEX,29.68333,-108.26667,4.06862
805,AUS,-27.46667,153.26667,3.836207


In [89]:
# Sorted distinct AvgBias values, to inspect the range that will be plotted
np.unique(df_bplot.AvgBias)

array([-6.456141  , -5.37268973, -4.3214736 , ...,  4.882763  ,
        6.0960593 ,  8.05729842])

In [97]:
# Render the per-location bias of US sources as a country map named 'Bias_plot_US_news_media'
plot_name= 'Bias_plot_US_news_media'
bias_plot(df_bplot, plot_name=plot_name)

In [98]:
# Showing the plot: embed ./plots/<plot_name>.html (plot_name is set in the previous cell)
from IPython.display import HTML
HTML(filename='./plots/'+plot_name+'.html')