In [2]:
'''
This notebook includes methods for all the plots in the project
'''

import numpy as np
import pandas as pd
import os
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import impute
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.preprocessing import MinMaxScaler, normalize
from pygam import GAM, s, f, LinearGAM
import geopandas as gpd

import xgboost
import shap
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

### Plots for Feature Importance

In [4]:
def feature_imp_plot_bar(to_plot, methods, file_path, title = None):
    
    ''' This function plots the feature importances of several methods as grouped bars in a single plot.
        
        Input:
        --------------------------
        
        to_plot (Pandas dataframe): one row per method and one column per feature; row i is drawn with
        the label methods[i]
        
        methods (list of strings or Pandas series): names of the methods, in the same order as the rows of to_plot
        
        file_path (str): the path the figure is saved to
        
        title (str): Title of the plot (default is None, in which case no title is drawn)
        
        Output:
        ---------------------------
        None (the figure is saved to file_path and shown; file_path is also printed)
    '''
    
    # Work on a plain list so that the i-th name is always taken by position, even when a
    # Pandas series with a non-default index is passed in.
    methods = list(methods)
    n_features = len(to_plot.columns)
    n_methods = len(methods)
    font_size = 15 if n_features < 20 else 20

    # The figure width (in inches) grows with the number of features.
    fig = plt.figure(figsize=(n_features, 5))
    ax = fig.add_subplot(111)

    # Every feature occupies one unit on the x axis; it is shared by n_methods bars plus the
    # equivalent of four bar widths that stay empty as a gap to the next group.
    bar_width = 1 / (n_methods + 4)
    X_axis = np.arange(n_features)
    for i, method in enumerate(methods):
        ax.bar(X_axis + i * bar_width, to_plot.iloc[i], bar_width, label=method)

    # Bars are centre-aligned, so a group covers the offsets 0 .. (n_methods - 1) * bar_width
    # and its middle lies at half of that span.
    group_center = X_axis + bar_width * (n_methods - 1) / 2
    plt.xticks(group_center, to_plot.columns, rotation='vertical', fontsize = font_size)
    plt.yticks(fontsize = font_size)
    plt.legend(loc='upper left', fontsize= font_size - 0.4 * font_size)
    plt.ylabel('Feature Importance', fontsize = font_size + 0.3 * font_size)
    if title:
        plt.title(title)
    print(file_path)
    plt.savefig(file_path)
    plt.show()
    

In [2]:
def feature_imp_plot_scatter(to_plot, methods, file_path, title = None):
    
    ''' This function plots the feature importances of several methods in a single scatter plot.
        
        Input:
        --------------------------
        
        to_plot (Pandas dataframe): one row per method and one column per feature; row i is drawn with
        the label methods[i]
        
        methods (list of strings or Pandas series): names of the methods, in the same order as the rows of to_plot
        
        file_path (str): the path the figure is saved to
        
        title (str): title of the plot (default None, in which case no title is drawn)
        
        Output:
        ---------------------------
        None (the figure is saved to file_path; plt.show() is not called here)
    '''
    
    # Work on a plain list so that the i-th name is always taken by position, even when a
    # Pandas series with a non-default index is passed in.
    methods = list(methods)
    n_features = len(to_plot.columns)
    font_size = 10 if n_features < 20 else 15

    # The figure width (in inches) grows with the number of features.
    fig = plt.figure(figsize=(n_features, 4))
    ax = fig.add_subplot(111)

    # Points are placed on the same numeric positions the tick labels are attached to below,
    # so markers and labels stay aligned for non-string or repeated column names as well.
    X_axis = np.arange(n_features)
    for i, method in enumerate(methods):
        ax.scatter(X_axis, to_plot.iloc[i], s = 100, label=method)

    plt.xticks(X_axis, to_plot.columns, rotation='vertical', fontsize = font_size)
    plt.yticks(fontsize = font_size)
    plt.legend(loc='upper left', fontsize= font_size - 0.4 * font_size)
    plt.ylabel('Feature Importance', fontsize = font_size + 0.3 * font_size)
    if title:
        plt.title(title)
    plt.savefig(file_path)
    

In [3]:
def single_feature_imp_plot_bar(importances, method, file_path, approach = None):
    
    ''' This function plots the feature importances of a single method in a bar plot.
        
        Input:
        --------------------------
        
        importances (Pandas series): importance of every feature; drawn with importances.plot.bar
        
        method (str): name of the method, used in the title of the plot
        
        file_path (str): the path the figure is saved to
        
        approach (str): approach used to compute the feature importances (default is None). When given,
        the title reads '<approach>_<method>'; the value 'MDI' additionally selects the y label
        'Mean decrease in impurity'
        
        Output:
        ---------------------------
        None (the figure is saved to file_path and shown)
    '''
    fig, ax = plt.subplots(figsize = (15,4))
    importances.plot.bar(ax=ax)

    # approach is optional, so only prefix the method name with it when it was passed
    name_in_title = method if approach is None else approach + '_' + method
    ax.set_title('Feature importances using ' + name_in_title)

    if approach == 'MDI':
        ax.set_ylabel("Mean decrease in impurity")
    else:
        ax.set_ylabel('Feature Importances')
    fig.tight_layout()
    plt.savefig(file_path)
    plt.show()
    

In [5]:
def feature_imp_thr_plot_bar(feature_imp, thr, output_path, file_name = None):
    
    ''' This function draws two grouped bar plots of the feature importances of various methods: one restricted
    to the features that reach a threshold for at least one method, and one with all the features.
        
        Input:
        --------------------------
        
        feature_imp (Pandas dataframe): rows show feature importances for the various methods and columns are
        features; the column 'Method' holds the name of the method of every row
        
        thr (float): a feature is left out of the first plot when its importance is below thr for every method
        
        output_path (str): prefix of the output files; it is concatenated as is, so it has to end with a
        path separator
        
        file_name (str): suffix of the file names (default is None, which adds no suffix)
        
        Output:
        ---------------------------
        None. The plots are written to
        output_path + 'Plots/Feature_importance_bar_<thr without dots>_' + file_name and
        output_path + 'Plots/Feature_importance_bar_all_' + file_name
    '''
    
    # A plain list keeps the method names positional, whatever the index of feature_imp looks like.
    methods = feature_imp['Method'].tolist()
    to_plot = feature_imp.drop(['Method'], axis = 1)

    # Positional boolean mask: True for features that are NOT below thr for all the methods.
    keep = ~(to_plot < thr).all(axis = 0).values
    above_thr = to_plot.loc[:, keep]

    suffix = '' if file_name is None else file_name
    plots_prefix = output_path + 'Plots/Feature_importance_bar_'

    feature_imp_plot_bar(above_thr, methods, plots_prefix + str(thr).replace('.', '') + '_' + suffix)
    feature_imp_plot_bar(to_plot, methods, plots_prefix + 'all_' + suffix)
    

In [6]:
def plot_feature_importance_scatterplot(feature_imp, thr, output_path, file_name = None):
    
    ''' This function draws two scatter plots of the feature importances of various methods: one restricted
    to the features that reach a threshold for at least one method, and one with all the features.
        
        Input:
        --------------------------
        
        feature_imp (Pandas dataframe): rows show feature importances for the various methods and columns are
        features; the column 'Method' holds the name of the method of every row
        
        thr (float): a feature is left out of the first plot when its importance is below thr for every method
        
        output_path (str): prefix of the output files; it is concatenated as is, so it has to end with a
        path separator
        
        file_name (str): suffix of the file names (default is None, which adds no suffix)
        
        Output:
        ---------------------------
        None. The plots are written to
        output_path + 'Plots/Feature_importance_scatter_<thr without dots>_' + file_name and
        output_path + 'Plots/Feature_importance_scatter_all_' + file_name
    '''
    
    # A plain list keeps the method names positional, whatever the index of feature_imp looks like.
    methods = feature_imp['Method'].tolist()
    to_plot = feature_imp.drop(['Method'], axis = 1)

    # Positional boolean mask: True for features that are NOT below thr for all the methods.
    keep = ~(to_plot < thr).all(axis = 0).values
    above_thr = to_plot.loc[:, keep]

    suffix = '' if file_name is None else file_name
    plots_prefix = output_path + 'Plots/Feature_importance_scatter_'

    feature_imp_plot_scatter(above_thr, methods, plots_prefix + str(thr).replace('.', '') + '_' + suffix)
    feature_imp_plot_scatter(to_plot, methods, plots_prefix + 'all_' + suffix)
    

In [7]:
def shap_summary_plot(method, shap_values, X, feature_names, file_path):
    
    ''' Draw the SHAP summary plot for a set of SHAP values, save it and show it.
        
        Input:
        --------------------------
        
        method (str): name of the underlying method used to model the data (not used inside this function)
        
        shap_values (numpy array): SHAP values handed to shap.summary_plot
        
        X (Pandas dataframe or Numpy array): the data the SHAP values belong to, passed as `features`
        
        feature_names (list of str): name of the features
        
        file_path (str): where to save the plot
        
        Output:
        ---------------------------
        None
    '''
    
    # show=False keeps the figure open so that it can be written to disk before it is displayed
    summary_options = dict(features = X, feature_names = feature_names, show = False)
    shap.summary_plot(shap_values, **summary_options)

    plt.savefig(file_path)
    plt.show()
    

In [8]:
def cvd_mortality_single_map_plot(file_path, title, output_path, show_map = False):
    
    ''' This function reads a shape file and plots its 'cvd_mortal' column (cvd mortality rate) as a map
    with fixed classes.
        
        Input:
        --------------------------
        
        file_path (str): where to find the shape file; it has to contain a column named 'cvd_mortal'
                
        title (str): title of the map
        
        output_path (str): where to save the map
        
        show_map (boolean): whether or not to show the map (default is False). When it is not shown,
        the figure is closed after saving
        
        Output:
        ---------------------------
        None
    '''
    data_shp = gpd.read_file(file_path)
    fig, ax = plt.subplots(1, 1, figsize=(10,15))

    # plt.get_cmap is used instead of plt.cm.get_cmap, which recent Matplotlib releases no longer provide.
    # The bins are the fixed class boundaries used to colour the regions.
    data_shp.plot(column='cvd_mortal', ax = ax, legend = True, legend_kwds={'loc': 'lower right'},
                  cmap = plt.get_cmap('magma_r'), scheme='user_defined',
                  classification_kwds={'bins':[3, 4, 5, 6, 8, 10]})
    ax.set_axis_off()
    ax.set_title(title, fontsize = 20)
    plt.savefig(output_path)

    if show_map:
        plt.show()
    else:
        # Release the figure; otherwise it stays registered with pyplot after the call.
        plt.close(fig)

In [3]:
def all_maps_plot(output_path, methods, number_of_columns, color_map = 'YlGn', file_name = 'all_methods_maps',
                  bins = None, output_var = None, legend_loc = None):
    
    ''' This function plots the shape files of several methods as a grid of maps in a single figure.
        
        Input:
        --------------------------
        
        output_path (str): where to save the figure and where to find the shape files; the shape file of a
        method is read from output_path + method + '/' + method + '_shape.shp' and the figure is saved to
        output_path + file_name
        
        methods (list of str): including names of the methods for which we plot the maps
                
        number_of_columns (int): shows the number of maps in a row to plot
        
        color_map (str): the color map to use for plotting (default is YlGn)
        
        file_name (str): name of the plot to save (default is 'all_methods_maps')
        
        bins (list of float): class boundaries to plot the maps with (default is None, in which case a
        continuous colour scale is used and every map gets its own colour bar)
        
        output_var (str): the name of the column to colour the maps by (default is None, in which case the
        last column whose name contains 'mortal' is looked up in the shape files and reused for all maps)
        
        legend_loc (str): where to show the legend of the classes when bins is given; by default at the
        top-left of the first map. If set to 'bottom', the legend goes to the bottom-right of the last map
        
        Output:
        ---------------------------
        None (raises ValueError when methods is empty or number_of_columns is smaller than 1)
    '''
    
    # Imported locally because the import cell of the notebook does not provide this helper.
    from mpl_toolkits.axes_grid1 import make_axes_locatable

    n_maps = len(methods)
    if n_maps == 0 or number_of_columns < 1:
        raise ValueError('all_maps_plot needs at least one method and at least one column')

    # Subplots are organized in a rows x columns grid. Never use more columns than maps, so that the
    # grid agrees with the width of the figure; a partially filled last row counts as a full row.
    n_cols = min(number_of_columns, n_maps)
    n_rows = -(-n_maps // n_cols)

    # 5 x 7 inches per map, on a new figure so that earlier content is never drawn over.
    fig = plt.figure(figsize=(5 * n_cols, 7 * n_rows))
    cmap = plt.get_cmap(color_map)

    # Panel that carries the legend of the classes and the corner of that panel it is anchored to.
    legend_at_bottom = (legend_loc == 'bottom')
    legend_panel = n_maps - 1 if legend_at_bottom else 0
    legend_anchor = (1, 0) if legend_at_bottom else (0, 1)

    for k, method in enumerate(methods):
        # read the shape file of this method
        data_shp = gpd.read_file(output_path + method + '/' + method + '_shape.shp')

        # if not defined, set the output variable to colour the maps by
        if not output_var:
            for column in data_shp.columns:
                if 'mortal' in column:
                    output_var = column

        ax = fig.add_subplot(n_rows, n_cols, k + 1)

        if bins:
            # classed map: all the maps share the bins, so a single legend is enough
            plot_options = {'scheme': 'user_defined', 'classification_kwds': {'bins': bins}}
            if k == legend_panel:
                plot_options['legend'] = True
                plot_options['legend_kwds'] = {'loc': 'lower right', 'bbox_to_anchor': legend_anchor}
        else:
            # continuous map: reserve a narrow axis to the right of the map for its colour bar
            divider = make_axes_locatable(ax)
            cax = divider.append_axes("right", size="5%", pad=0.1)
            plot_options = {'legend': True, 'cax': cax}

        data_shp.plot(column = output_var, ax = ax, cmap = cmap, **plot_options)

        ax.set_axis_off()
        ax.set_title(method.replace('diff_', ''), fontsize= 16)

    plt.savefig(output_path + file_name)
    plt.show()

In [10]:
def annotate_colname(x, **kws):
        
    ''' Write the name of the series x in bold near the top-left corner of the current axes.
    Used to label the diagonal panels of the correlation plot; extra keyword arguments are ignored.
    '''
    current_axes = plt.gca()
    # the position is given in axes coordinates, i.e. relative to the panel and not to the data
    label_options = {
        'xy': (0.05, 0.9),
        'xycoords': current_axes.transAxes,
        'fontweight': 'bold',
    }
    current_axes.annotate(x.name, **label_options)

In [4]:
def corrdot(*args, **kwargs):
    
    ''' This function draws the Pearson correlation coefficient of args[0] and args[1] in the current axes:
    a circle whose size and colour follow the coefficient, with the coefficient written on top in a font
    size that grows with its absolute value. Keyword arguments are accepted but not used.
    
    When the coefficient is undefined (NaN), only the text is drawn, in the smallest font size.
    '''
    # Pearson correlation; pass 'spearman' here instead to plot rank correlations
    corr_r = args[0].corr(args[1], 'pearson')
    # two decimals without the leading zero, e.g. '.50' or '-.50'
    corr_text = f"{corr_r:2.2f}".replace("0.", ".")
    ax = plt.gca()
    ax.set_axis_off()

    if np.isnan(corr_r):
        # NaN must not reach the marker size and the font size computed below
        ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                    ha='center', va='center', fontsize=5)
        return

    marker_size = abs(corr_r) * 10000
    # the colour is taken from the coefficient itself, on a scale fixed to [-1, 1]
    ax.scatter([.5], [.5], marker_size, [corr_r], alpha=0.6, cmap="coolwarm",
               vmin=-1, vmax=1, transform=ax.transAxes)
    font_size = abs(corr_r) * 40 + 5
    ax.annotate(corr_text, [.5, .5,],  xycoords="axes fraction",
                ha='center', va='center', fontsize=font_size)

In [12]:
def results_corr_plot(preds, file_path):

    ''' This function plots a pair grid of the columns of preds: regression scatter plots below the diagonal,
    the distribution and the name of every column on the diagonal, and the correlation coefficients drawn
    by corrdot (Pearson, as corrdot is defined in this notebook) above the diagonal.
        
        Input:
        --------------------------
        
        preds (Pandas dataframe): the columns to compare with each other; presumably the predictions of the
        various methods together with the ground truth
                        
        file_path (str): where to save the plot
        
        Output:
        ---------------------------
        None (the figure is saved to file_path and shown)
        
        Note: sns.set changes the seaborn style for every plot drawn afterwards as well.
    '''
    
    # PairGrid creates its own figure, so no figure is opened beforehand.
    sns.set(style='white', font_scale=2.5)
    g = sns.PairGrid(preds, aspect=1.4, diag_sharey=False)
    
    # A straight regression line is fitted here; regplot's lowess option would draw a fitted line
    # which is not necessarily straight instead.
    g.map_lower(sns.regplot, scatter_kws={'s':10}, line_kws={'color': 'red'})
    g.map_diag(sns.distplot, kde_kws={'color': 'red'})
    g.map_diag(annotate_colname)
    g.map_upper(corrdot)
    plt.savefig(file_path)
    plt.show()