----
# Comparing RS to established measures








In [None]:
from __future__ import division
import numpy as np
import scipy as sc
from itertools import product
import time
import matplotlib.pyplot as plt
import PIL
from numpy import log10
import random
from math import factorial
import warnings
import pandas as pd
import re
import os
import math
from collections import Counter

from scipy.stats import linregress, gaussian_kde, skew
from scipy.optimize import linear_sum_assignment
from scipy.spatial import distance
from scipy import stats

from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import summary_table
import statsmodels.api as sm


warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

pd.set_option('display.max_columns', None)


In [None]:
def generate_cum_dists(n_obs, n_bins, cum=True):
    """Enumerate every way of placing `n_obs` observations into `n_bins` bins.

    Parameters
    ----------
    n_obs : int
        Total number of observations to distribute.
    n_bins : int
        Number of ordered bins.
    cum : bool, optional
        When true (default) each result is yielded as a list of running
        (cumulative) counts; otherwise the raw per-bin counts are yielded
        as a tuple.

    Yields
    ------
    list or tuple
        One distribution per iteration, in lexicographic order of the
        per-bin counts.
    """

    def _compositions(remaining, slots):
        # With a single slot left, everything that remains goes into it.
        if slots == 1:
            yield (remaining,)
            return
        for head in range(remaining + 1):
            for tail in _compositions(remaining - head, slots - 1):
                yield (head,) + tail

    for counts in _compositions(n_obs, n_bins):
        if not cum:
            yield counts
            continue

        # Build the cumulative form with a running total.
        running = 0
        cumulative = []
        for count in counts:
            running += count
            cumulative.append(running)
        yield cumulative

            
def count_pts_within_radius(x, y, radius, scale=0):
    """Count, for each unique (x, y) point, how many points lie within a radius.

    Parameters
    ----------
    x, y : sequence of numbers
        Coordinates of the points; must have the same length.
    radius : number
        Neighbourhood radius, expressed in the untransformed units.
    scale : optional
        When equal to the string ``'sqrt'``, distances are measured between
        square-root-transformed coordinates and compared against
        ``sqrt(radius)``. Any other value (default 0) uses plain Euclidean
        distance on the raw coordinates.

    Returns
    -------
    list of tuple
        One ``(x, y, n_neighbors)`` tuple per unique point. The count covers
        all input points (duplicates and the point itself included). The
        order follows set iteration order and is therefore not guaranteed.
    """
    x = np.array(x)
    y = np.array(y)

    # Collapse duplicates so each distinct location is evaluated only once.
    unique_points = set(zip(x.tolist(), y.tolist()))

    use_sqrt = scale == 'sqrt'
    if use_sqrt:
        # The transformed coordinates do not depend on the focal point,
        # so compute them once rather than on every iteration.
        sqrt_x = np.sqrt(x)
        sqrt_y = np.sqrt(y)
        sq_limit = np.sqrt(radius) ** 2
    else:
        sq_limit = radius ** 2

    count_data = []
    for a, b in unique_points:
        if use_sqrt:
            sq_dist = (sqrt_x - np.sqrt(a)) ** 2 + (sqrt_y - np.sqrt(b)) ** 2
        else:
            sq_dist = (x - a) ** 2 + (y - b) ** 2
        num_neighbors = int(np.count_nonzero(sq_dist <= sq_limit))
        count_data.append((a, b, num_neighbors))
    return count_data



def plot_color_by_pt_dens(x, y, radius, loglog=0, scale=0, plot_obj=None, point_size=10):
    """Scatter-plot x against y, shading each point by its local point density.

    Parameters
    ----------
    x, y : sequence of numbers
        Variables to be plotted.
    radius : number
        Distance within which other points are counted as neighbours.
    loglog : optional
        Not used by this function; kept so existing callers that pass it
        keep working.
    scale : optional
        Forwarded to ``count_pts_within_radius``; ``'sqrt'`` counts
        neighbours in square-root space, any other value in linear space.
    plot_obj : matplotlib axes, optional
        Axes to draw on. A new one is created with ``plt.axes()`` when None.
    point_size : number, optional
        Marker size passed to ``scatter``.

    Returns
    -------
    The axes object that was drawn on.

    Notes
    -----
    Points are drawn in order of increasing neighbour count, so the densest
    points end up on top. The colour value is ``max(c) - c`` with
    ``c = count ** 0.25``, rendered with the ``'Greys_r'`` colormap, so the
    densest points map to the lowest colour value.
    """
    plot_data = count_pts_within_radius(x, y, radius, scale)
    sorted_plot_data = np.array(sorted(plot_data, key=lambda point: point[2]))

    # Identity check: `== None` would invoke the object's own __eq__.
    if plot_obj is None:
        plot_obj = plt.axes()

    xs = sorted_plot_data[:, 0]
    ys = sorted_plot_data[:, 1]

    # First pass: hollow dark outlines so that every point stays visible
    # regardless of the fill colour assigned below.
    plot_obj.scatter(xs,
                     ys,
                     facecolors='none',
                     s=point_size,
                     edgecolors='0.1',
                     linewidths=1.,
                     )

    # Second pass: filled markers coloured by the (compressed, inverted)
    # neighbour count.
    c = np.array(sorted_plot_data[:, 2]) ** 0.25
    c = np.max(c) - c
    plot_obj.scatter(xs,
                     ys,
                     c=c,
                     s=point_size,
                     edgecolors='k',
                     linewidths=0.0,
                     cmap='Greys_r',
                     )

    return plot_obj


In [None]:

def histogram_intersection(p, q):
    """Return the histogram intersection of query `p` with reference `q`.

    The value is the sum of the bin-wise minima of `p` and `q`, divided by
    the sum of `p`. If the result falls outside [0, 1], a diagnostic is
    printed along with both inputs and None is returned.
    """
    shared_mass = np.sum(np.minimum(p, q))
    hi = np.true_divide(shared_mass, np.sum(p))

    out_of_range = hi > 1 or hi < 0
    if not out_of_range:
        return hi

    print('Error, HI =', hi)
    print(p)
    print(q)
    return


def chi_square_distance(p, q):
    """Return the chi-square distance between two histograms.

    Both inputs are normalised to sum to 1, then the distance is
    ``0.5 * sum((p - q)**2 / (p + q))``.

    Parameters
    ----------
    p : sequence of numbers
        Query distribution (bin counts or probabilities).
    q : sequence of numbers
        Reference distribution, same length as `p`.

    Returns
    -------
    float
        The chi-square distance. Bins where ``p + q`` is zero contribute
        nothing, rather than turning the whole result into NaN via 0/0.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    p = p / np.sum(p)
    q = q / np.sum(q)

    total = p + q
    # A bin that is empty in both histograms carries no information; skipping
    # it avoids a 0/0 term that would otherwise poison the sum.
    occupied = total != 0

    diff_sq = (p[occupied] - q[occupied]) ** 2
    return np.sum(diff_sq / total[occupied]) / 2


def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence of `p` from `q`, using natural log.

    Both inputs are normalised to sum to 1 before the divergence
    ``sum(p * log(p / q))`` is evaluated.

    Parameters
    ----------
    p : sequence of numbers
        Query distribution (bin counts or probabilities).
    q : sequence of numbers
        Reference distribution, same length as `p`.

    Returns
    -------
    float
        The divergence. Bins where `p` is zero contribute nothing (the usual
        ``0 * log 0 = 0`` convention). If `q` is zero in a bin where `p` is
        positive, the result is ``inf``.
    """
    # np.array copies, so the in-place normalisation below never touches
    # the caller's data.
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)

    p /= p.sum()
    q /= q.sum()

    # Only sum over bins where p is non-zero; evaluating 0 * log(0 / q)
    # directly gives 0 * -inf = NaN. Using `!= 0` (not `> 0`) lets NaN or
    # negative entries propagate exactly as before.
    support = p != 0
    with np.errstate(divide='ignore', invalid='ignore'):
        terms = p[support] * np.log(p[support] / q[support])

    kl_div = np.sum(terms)
    return kl_div


def earth_movers_distance(p, q):
    """Return the Earth Mover's Distance between two histograms.

    Both inputs are normalised to sum to 1 and converted to cumulative
    distributions P and Q. The distance is the minimum-cost one-to-one
    matching between the values of P and the values of Q under the cost
    ``|P_i - Q_j|``.

    Parameters
    ----------
    p : sequence of numbers
        Query distribution (bin counts or probabilities).
    q : sequence of numbers
        Reference distribution.

    Returns
    -------
    float
        The matching cost described above.

    Notes
    -----
    When both inputs are non-negative and of equal length, P and Q are both
    non-decreasing. For sorted values under an absolute-difference cost the
    in-order matching is optimal, so the result is ``sum(|P - Q|)`` and is
    computed directly in O(k). Other inputs fall back to the general
    assignment solver.
    """
    # np.array copies, so normalising in place is safe for the caller.
    p = np.array(p, dtype=float)
    q = np.array(q, dtype=float)

    # Normalise so that both distributions carry unit mass.
    p /= p.sum()
    q /= q.sum()

    P = np.cumsum(p)
    Q = np.cumsum(q)

    # Fast path: monotone CDFs of the same length.
    if P.shape == Q.shape and np.all(p >= 0) and np.all(q >= 0):
        return np.abs(P - Q).sum()

    # General path: pairwise cost matrix plus optimal assignment.
    C = np.abs(np.subtract.outer(P, Q))
    row_ind, col_ind = linear_sum_assignment(C)
    emd = C[row_ind, col_ind].sum()

    return emd


def kolmogorov_smirnov_distance(p, q):
    """Return the Kolmogorov-Smirnov distance between two histograms.

    `p` is the query distribution and `q` the reference. Each is normalised
    to unit mass and accumulated into a CDF; the result is the largest
    absolute difference between the two CDFs.
    """
    query = np.array(p, dtype=float)
    reference = np.array(q, dtype=float)

    # Unit-mass CDFs of both inputs.
    query_cdf = np.cumsum(query / query.sum())
    reference_cdf = np.cumsum(reference / reference.sum())

    gaps = np.abs(query_cdf - reference_cdf)
    return np.max(gaps)


def rank_probability_score(p, q):
    """Return the ranked probability score between two histograms.

    `p` is the query distribution and `q` the reference. Each is normalised
    to unit mass and accumulated into a CDF; the score is the sum of squared
    differences between the two CDFs.
    """
    query = np.array(p, dtype=float)
    reference = np.array(q, dtype=float)

    # Unit-mass CDFs of both inputs.
    query_cdf = np.cumsum(query / query.sum())
    reference_cdf = np.cumsum(reference / reference.sum())

    cdf_gap = query_cdf - reference_cdf
    return np.sum(cdf_gap ** 2)


def RDS(p, q):
    """Return the relative distributional shift of query `p` from reference `q`.

    For each histogram a shift score S is computed as follows, with k bins,
    n observations and exponent ``z = (k + 1) / k``:

        S = (sum_i (C_i / n) ** z - 1) / (k - 1)

    where ``C_i`` is the cumulative count up to and including bin i.
    The function returns ``S(q) - S(p)``.

    Parameters
    ----------
    p : sequence of numbers
        Query distribution (bin counts).
    q : sequence of numbers
        Reference distribution (bin counts); may have a different number
        of bins than `p`.

    Returns
    -------
    float
        ``S(q) - S(p)``. A single-bin histogram divides by ``k - 1 = 0``
        and yields NaN.
    """

    def _shift_score(counts):
        # One cumulative pass instead of re-summing a growing prefix per bin.
        counts = np.asarray(counts, dtype=float)
        n_bins = len(counts)
        n_obs = counts.sum()

        z = (n_bins + 1) / n_bins
        cum_fraction = np.cumsum(counts) / n_obs
        score = np.sum(cum_fraction ** z) - 1
        return score / (n_bins - 1)

    return _shift_score(q) - _shift_score(p)
    
    
def DS(p):
    """Return the distributional shift score of a single histogram.

    With k bins, n observations and exponent ``z = (k + 1) / k``:

        DS = (sum_i (C_i / n) ** z - 1) / (k - 1)

    where ``C_i`` is the cumulative count up to and including bin i.

    Parameters
    ----------
    p : sequence of numbers
        Bin counts of the histogram.

    Returns
    -------
    float
        The shift score. A single-bin histogram divides by ``k - 1 = 0``
        and yields NaN.
    """
    counts = np.asarray(p, dtype=float)
    p_bins = len(counts)
    p_obs = counts.sum()

    z_p = (p_bins + 1) / p_bins
    # One cumulative pass instead of re-summing a growing prefix per bin.
    cum_fraction = np.cumsum(counts) / p_obs
    Sp = np.sum(cum_fraction ** z_p) - 1
    ds = Sp / (p_bins - 1)
    return ds


In [None]:

def generate_distributions(N, n, k, model, **kwargs):
    """Generate `N` histograms, each built from `n` random draws in `k` bins.

    Parameters
    ----------
    N : int
        Number of histograms to produce.
    n : int
        Number of random values drawn per histogram.
    k : int
        ``bins`` argument handed to ``np.histogram``.
    model : str
        One of 'Poisson', 'Gaussian', 'Negative binomial', 'Lognormal' or
        'Weibull'.
    **kwargs
        Optional model parameters, with defaults: ``lambda_p`` (5) for
        Poisson; ``mean`` (0) and ``std`` (1) for Gaussian; ``r`` (1) and
        ``p`` (0.5) for Negative binomial; ``mean`` (0) and ``sigma`` (1)
        for Lognormal; ``a`` (1) for Weibull.

    Returns
    -------
    list of list of int
        The bin counts of each generated histogram.

    Raises
    ------
    ValueError
        If `model` is not one of the supported names (raised when the first
        sample is drawn).
    """

    def _draw_sample():
        if model == 'Poisson':
            return np.random.poisson(kwargs.get('lambda_p', 5), n)
        if model == 'Gaussian':
            return np.random.normal(kwargs.get('mean', 0), kwargs.get('std', 1), n)
        if model == 'Negative binomial':
            return np.random.negative_binomial(kwargs.get('r', 1), kwargs.get('p', 0.5), n)
        if model == 'Lognormal':
            return np.random.lognormal(kwargs.get('mean', 0), kwargs.get('sigma', 1), n)
        if model == 'Weibull':
            return np.random.weibull(kwargs.get('a', 1), n)
        raise ValueError("Unsupported model type")

    dists = []
    while len(dists) < N:
        counts, _edges = np.histogram(_draw_sample(), bins=k, density=False)
        dists.append(counts.tolist())

    return dists

# Driver: for every generating model, draw histograms for each (n, k) setting,
# compare random pairs of histograms with RDS and six established measures,
# and save one 6 x 6 grid of scatter plots (rows = (n, k) settings,
# columns = measures) per model.

N = 10**4
models = ['Poisson', 'Gaussian', 'Negative binomial', 'Lognormal', 'Weibull']

# n_ls[i] (observations per histogram) is paired with k_ls[i] (number of bins).
n_ls = [10, 100, 10, 100, 100, 1000]
k_ls = [3, 3, 50, 50, 500, 500]

# savefig does not create missing directories, so create the target up front
# rather than failing after all the computation is done.
os.makedirs('Final_Figs/models_and_nk_combos', exist_ok=True)

for m in models:
    fig = plt.figure(figsize=(13, 14))

    fig_num = 0
    for n, k in zip(n_ls, k_ls):

        # Use the (n, k) of this row; the subplot titles report these values.
        dists = generate_distributions(N, n=n, k=k, model=m, lambda_p=5)

        RDS_vals = []
        csd = []
        ks_dists = []
        hist_ints = []
        kl_divs = []
        em_dists = []
        rp_scores = []
        d1_ls = []
        d2_ls = []

        num = 0
        start_time_0 = time.time()
        while num < N:

            # Two distinct histograms drawn without replacement.
            d1, d2 = random.sample(dists, 2)

            d1_ls.append(d1)
            d2_ls.append(d2)

            # Relative distributional shift
            j = RDS(d1, d2)
            RDS_vals.append(j)

            # KS distance
            j = kolmogorov_smirnov_distance(d1, d2)
            ks_dists.append(j)

            # Histogram intersection, stored as a dissimilarity (1 - HI)
            j = histogram_intersection(d1, d2)
            hist_ints.append(1-j)

            # chi-square distance
            j = chi_square_distance(d1, d2)
            csd.append(j)

            # KL divergence
            j = kl_divergence(d1, d2)
            kl_divs.append(j)

            # Rank probability score
            j = rank_probability_score(d1, d2)
            rp_scores.append(j)

            # Earth movers distance
            j = earth_movers_distance(d1, d2)
            em_dists.append(j)

            num += 1

        end_time = time.time()

        # The names below (lists, labs, ind, text_x_vals) are not read again
        # in this cell; they are kept at module level in case other cells
        # rely on them.
        lists = [np.sqrt(csd).tolist(),
                 ks_dists,
                 hist_ints,
                 np.sqrt(kl_divs).tolist(),
                 em_dists,
                 np.sqrt(rp_scores).tolist(),
                 np.abs(RDS_vals),
                 ]

        labs = [r"$\sqrt{CSD}$",
                'KSD',
                '1-HI',
                r"$\sqrt{KLD}$",
                'EMD',
                r"$\sqrt{RPS}$",
                '|RDS|',
                ]
        ind = 1

        # Measures plotted on the x axis, one subplot column each.
        X_lists = [ks_dists,
                   em_dists,
                   np.sqrt(rp_scores).tolist(),
                   np.sqrt(csd).tolist(),
                   hist_ints,
                   np.sqrt(kl_divs).tolist(),
                   ]

        x_labs = ['Kolmogorov-Smirnov distance',
                  'Earth Mover Distance',
                  r"$\sqrt{RPS}$",
                  r"$\sqrt{CSD}$",
                  '1 - Histogram Intersection',
                  r"$\sqrt{KLD}$",
                  ]

        text_x_vals = [0.02, 0.05, 0.02,
                       0.02, 0.02, 0.05]

        for i, x_ls in enumerate(X_lists):
            fig_num += 1

            # Keep only strictly positive, finite values of the measure
            # (NaN and inf both fail this test) with the matching RDS values.
            xv = []
            yv = []
            ct = 0
            for ii, val in enumerate(x_ls):
                if val > 0 and val < 10**10:
                    xv.append(val)
                    yv.append(RDS_vals[ii])

            ax = plt.subplot(6, 6, fig_num)
            if len(xv) >= 2:
                plot_color_by_pt_dens(xv, yv, radius=0.05, loglog=0, plot_obj=ax, point_size=10)
                # The regression is fitted to |RDS|, while the scatter above
                # shows the signed RDS values.
                slope, intercept, r_val, p_val, std_err = linregress(xv, np.abs(yv))
                fitted_vals = slope * np.array(xv) + intercept
                r2_text = str(round(r_val**2, 2))
            else:
                # Nothing usable survived the filter for this measure; leave
                # the panel empty instead of crashing the whole figure.
                r2_text = 'n/a'

            s = 'n=' + str(n) + ', k=' + str(k) + ' ' + r'$r^{2}$' + ' = ' + r2_text
            plt.title(s, fontsize=8)
            plt.xlabel(x_labs[i], fontsize= 8)
            plt.ylabel('RDS', fontsize= 8)
            plt.tick_params(axis='both', labelsize=6)

    fig.patch.set_facecolor('white')
    plt.subplots_adjust(hspace=0.65, wspace=0.55)
    plt.savefig('Final_Figs/models_and_nk_combos/SupFig_'+m+'.jpg', bbox_inches='tight', format='jpg', dpi=600)
    plt.close()
