# Defining distributional shift (DS)

We define distributional shift (DS) as the concentration of frequencies towards the lowest discrete class, which we measure via the sum of cumulative frequencies.

We begin by simply deriving DS as: 

$$DS = (\sum{F}/n - 1)/(k-1)$$<br><br>

We then refine DS to include exponentiated cumulative frequencies:

$$DS = (\sum{F^{z}}/n^{z} - 1)/(k-1)$$<br><br>

Finally, we refine the exponent (*z*) to take fractional values:

$$DS = (\sum{F^{k+1/k}}/n^{k+1/k} - 1)/(k-1)$$<br><br>

Below, we provide a function to allow users to explore the calculation of DS and to vary the number of observations (n_obs) and the number of bins (n_bins).

In [1]:

from __future__ import division
import numpy as np
import scipy as sc
from itertools import product
import time
import matplotlib.pyplot as plt
import PIL
from numpy import log10
import random
from math import factorial
from scipy.stats import linregress, gaussian_kde, skew
from scipy import stats
from scipy.spatial import distance
import warnings
import pandas as pd
import re
import os
import math
from collections import Counter
from sklearn.preprocessing import PolynomialFeatures
from statsmodels.stats.outliers_influence import summary_table
import statsmodels.api as sm
from scipy.optimize import linear_sum_assignment

warnings.filterwarnings('ignore')

%config InlineBackend.figure_formats = ['svg']
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}

pd.set_option('display.max_columns', None)


  from pandas import Int64Index as NumericIndex


In [2]:
def generate_cum_dists(n_obs, n_bins, cum=True):
    """Generate all possible discrete distributions for a given number of 
        observations distributed across a given number of bins
    """
    
    def partitions(n, k):
        if k == 1:
            yield (n,)
        else:
            for i in range(n + 1):
                for result in partitions(n - i, k - 1):
                    yield (i,) + result

    for combo in partitions(n_obs, n_bins):
        if cum:
            cum_dist = [sum(combo[:i + 1]) for i in range(n_bins)]
            yield cum_dist
        else:
            yield combo


In [3]:
n = 5
k = 4

# Generate cumulative distributions one at a time and add to a list
_set = []
for dist in generate_cum_dists(n, k, cum=False):
    _set.append(dist)


DS_ls = []
for i, d in enumerate(_set):

    # get cumulative distribution
    cd = np.array([sum(d[:i+1]) for i in range(len(d))])
    
    # exponentiate cumulative frequencies
    z = 1
    z = (k + 1)/k
    
    DS = str(((sum(cd**z)/(n**z)) - 1) / (k - 1))
    DS = DS[:8]
    
    if sum(cd)/(n) == 2.2:    
        #print(i+1, d, tuple(cd), sum(cd**z)/(n**z))
        print(i+1, d, tuple(cd), DS)

    DS_ls.append(DS)
    
print('\n')
print(len(DS_ls), 'members of the feasible set for n and k')

11 (0, 1, 4, 0) (0, 1, 5, 5) 0.377916
14 (0, 2, 2, 1) (0, 2, 4, 5) 0.358233
16 (0, 3, 0, 2) (0, 3, 3, 5) 0.352044
25 (1, 0, 3, 1) (1, 1, 4, 5) 0.341363
28 (1, 1, 1, 2) (1, 2, 3, 5) 0.326641
37 (2, 0, 0, 3) (2, 2, 2, 5) 0.318108


56 members of the feasible set for n and k


# Figure 1

In [4]:
n_obs = 10
n_bins = 5
fig = plt.figure(figsize=(10.5, 7))
   
################  TOP ROW  ###########

_set = []
for dist in generate_cum_dists(n_obs, n_bins, cum=False):
    _set.append(dist)

plot_num = 1
for z in [2, 4, 7]:
    
    t1 = []
    t2 = []
    for d in _set:

        cd = [sum(d[:i+1]) for i in range(len(d))]
        ncd = np.array(cd)/max(cd)
        t1.append(sum(ncd))

        G = [sum(d[:i+1])**(z) for i in range(len(d))]
        G_ = np.array(G)/(n_obs**z)

        t2.append(sum(G_))

    print('n:', n_obs, 'k:', n_bins)
    print('cardinality:', len(t1), '| no. of unique ΣF/n:', 
          len(list(set(t1))), '| no. of unique ΣF^z/n^z:', len(list(set(t2))), '\n')

    ax = plt.subplot(2, 3, plot_num)
    plt.scatter(t1, t2, s=1, c='k')
    plt.xlabel(r'$\sum{F/n}$', fontsize= 14)
    plt.ylabel(r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$', fontsize= 14)
    
    s = '|A' + r'$_{n' + '=' + str(n_obs) + ', ' + 'k' + '=' + str(n_bins) + '}$' + '| = ' + str(len(_set)) + '\n'
    s += '\nValues of ' + r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$' + '\n = ' + str(len(list(set(t2))))
    
    plt.text(1.01, 3.65, s, fontsize=10)
    plt.tick_params(axis='both', labelsize=10)
    plot_num += 1
    
print('\n')

################  BOTTOM ROW  ###########  

N_obs = [10, 10, 20]
N_bins = [5, 10, 10]
sets = []
for i, n_obs in enumerate(N_obs):
    n_bins = N_bins[i]
    
    _set = []
    for dist in generate_cum_dists(n_obs, n_bins, cum=False):
        _set.append(dist)
    sets.append(_set)


for i, n_obs in enumerate(N_obs):
    n_bins = N_bins[i]
    
    _set = sets[i]
    
    t1 = []
    t2 = []
    for d in _set:

        cd = [sum(d[:ii+1]) for ii in range(len(d))]
        ncd = np.array(cd)/max(cd)
        t1.append(sum(ncd))

        z = (n_bins + 1)/n_bins
        G = [sum(d[:ii+1])**(z) for ii in range(len(d))]
        G_ = np.array(G)/(n_obs**z)
        t2.append(sum(G_))

    print('n:', n_obs, 'k:', n_bins)
    print('cardinality:', len(t1), '| no. of unique ΣF/n:', 
          len(list(set(t1))), '| no. of unique ΣF^z/n^z:', len(list(set(t2))), '\n')
    
    s = '|A' + r'$_{n' + '=' + str(n_obs) + ', ' + 'k' + '=' + str(n_bins) + '}$' + '| = ' + str(len(_set)) + '\n'
    s += '\nValues of ' + r'$\sum{F_{i}^{' + str(z) + '}/n^{' + str(z) + '}}$' + '\n = ' + str(len(_set))
    
    if len(_set) > 10**5: 
        indices = np.random.choice(len(_set), 10**5, replace=False)
        t1 = np.array(t1)
        t2 = np.array(t2)
        t1 = t1[indices]
        t2 = t2[indices]
        t1 = t1.tolist()
        t2 = t2.tolist()
    
    ax = plt.subplot(2, 3, plot_num)
    plt.scatter(t1, t2, s=1, c='k')
    plt.xlabel(r'$\sum{F/n}$', fontsize= 14)
    plt.ylabel(r'$\sum{F^{' + str(z) + '}/n^{' + str(z) + '}}$', fontsize= 14)
    
    if plot_num == 4:
        plt.text(1. * min(t1), 0.73*max(t2), s, fontsize=10)
    elif plot_num == 5:
        plt.text(1. * min(t1), 0.7*max(t2), s, fontsize=10)
    elif plot_num == 6:
        plt.text(0.9 * min(t1), 0.71*max(t2), s, fontsize=10)
        
    plt.tick_params(axis='both', labelsize=10)
    plot_num += 1
    

fig.patch.set_facecolor('white')
plt.subplots_adjust(hspace=0.45, wspace=0.4)
plt.savefig('Final_Figs/manuscript/Fig1.pdf', bbox_inches='tight', format='pdf', dpi=600)
plt.savefig('Final_Figs/manuscript/Fig1.jpg', bbox_inches='tight', format='jpg', dpi=600)
plt.close()

n: 10 k: 5
cardinality: 1001 | no. of unique ΣF/n: 60 | no. of unique ΣF^z/n^z: 355 

n: 10 k: 5
cardinality: 1001 | no. of unique ΣF/n: 60 | no. of unique ΣF^z/n^z: 956 

n: 10 k: 5
cardinality: 1001 | no. of unique ΣF/n: 60 | no. of unique ΣF^z/n^z: 1001 



n: 10 k: 5
cardinality: 1001 | no. of unique ΣF/n: 60 | no. of unique ΣF^z/n^z: 1001 

n: 10 k: 10
cardinality: 92378 | no. of unique ΣF/n: 208 | no. of unique ΣF^z/n^z: 92378 

n: 20 k: 10
cardinality: 10015005 | no. of unique ΣF/n: 491 | no. of unique ΣF^z/n^z: 10015005 

