KL-Divergence:
If two distributions perfectly match, D_{KL} (p||q) = 0 otherwise it can take values between 0 and ∞. Lower the KL divergence value, the better we have matched the true distribution with our approximation.

In [1]:
from scipy.stats import ttest_ind, mannwhitneyu, chisquare, wasserstein_distance
from statsmodels.stats import weightstats as stests
from scipy.spatial.distance import cityblock
from matplotlib import pyplot as plt 
from scipy.spatial import distance
from scipy.special import kl_div,rel_entr
from os.path import isfile, join
from scipy.stats import entropy
from os import listdir
import seaborn as sns
import pandas as pd
import numpy as np

# Set-Up

In [2]:
sources = ['twitter', 'uwcse', 'imdb', 'cora']
targets = ['imdb', 'cora', 'uwcse', 'twitter']

## Visualization

In [None]:
def plot_kde(source_distribution, source_name, target_distribution, target_name):
    fig, ax = plt.subplots(figsize=(12, 6))
    
    if(len(source_distribution) != len(target_distribution)):
        source_distribution, target_distribution = shape_arrays_to_same_length(source_distribution, target_distribution)
        
    sns.kdeplot(data=np.array(source_distribution).cumsum(),
                color='crimson', label=source_name.upper(), fill=True, ax=ax)
    sns.kdeplot(data=np.array(target_distribution).cumsum(),
                color='limegreen', label=target_name.upper(), fill=True, ax=ax)
    ax.legend()
    fig.tight_layout()
    plt.title(f'{source_name.upper()} -> {target_name.upper()}')
    #fig.savefig(f'figures/kde/{source_name}/kde_{source_name}_{target_name}.pdf')
    plt.show()

In [None]:
def plot_histogram(source_distribution, source_name, target_distribution, target_name):
    bins = np.linspace(0, 1, 10)

    if(len(source_distribution) != len(target_distribution)):
        source_distribution, target_distribution = shape_arrays_to_same_length(source_distribution, target_distribution)
    
    plt.hist(source_distribution, bins, alpha=0.5, label=source_name.upper())
    plt.hist(target_distribution, bins, alpha=0.5, label=target_name.upper())
    plt.legend(loc='upper right')
    plt.title(f'{source_name.upper()} -> {target_name.upper()}')
    #plt.savefig(f'figures/hist/{target_name}/hist_{source_name}_{target_name}.pdf')
    plt.show()

## Reshape

In [4]:
def use_histograms(array1,array2):
    if len(array1) > len(array2):
        n_bins = round(np.sqrt(len(array2)))
    else:
        n_bins = round(np.sqrt(len(array1)))
    return np.histogram(array1,bins=n_bins)[0],np.histogram(array2,bins=n_bins)[0]

In [None]:
def shape_arrays_to_same_length_dumb_way(array1, array2):
    size_array_1 = len(array1)
    size_array_2 = len(array2)
    np.random.shuffle(array1)
    np.random.shuffle(array2)
    if(size_array_1 > size_array_2):
        return array1[:size_array_2], array2
    elif(size_array_1 < size_array_2):
        return array1, array2[:size_array_1]

In [None]:
def shape_arrays_to_same_length_truncate(array1, array2):
    size_array_1 = len(array1)
    size_array_2 = len(array2)
    if(size_array_1 > size_array_2):
        np.random.shuffle(array2)
        return np.random.choice(array1, size_array_2), array2
    elif(size_array_1 < size_array_2):
        np.random.shuffle(array1)
        return array1, np.random.choice(array2, size_array_1)

In [None]:
def shape_arrays_to_same_length(array1, array2):
    size_array_1 = len(array1)
    size_array_2 = len(array2)
    if(size_array_1 > size_array_2):
        return array1, np.concatenate((array2,np.zeros(size_array_1-size_array_2)))
    elif(size_array_1 < size_array_2):
        return np.concatenate((array1,np.zeros(size_array_2-size_array_1))), array2

## Probabilities Pre-Processing

In [3]:
def generate_probabilities_array(sample):
    prob, class_ = sample.split(' ')
    prob = float(prob.strip())
    class_ = class_.strip()
    
    if class_ == '0':
        return [prob, 1-prob][1]
    elif class_ == '1':
        return [1-prob, prob][1]
    else:
        raise Exception("Error --")

## Load Functions

In [5]:
def load_source_distribution(path,fold=0):
    
    files = [f for f in listdir(path) if isfile(join(path, f))]
    files = [f'aucTemp_{fold+1}']
    
    N_FOLDS = len(files)
    source_name = path.split('/')[-1].upper()
    #print(f'load_source_distribution: {source_name} has {N_FOLDS} folds.')
    
    if source_name == 'TWITTER' and fold > 1:
        files = ['aucTemp_1']

    source_distribution = []
    for file in files:
        test = open(path + '/' + file, 'r').read().split('\n')
        
        #print(f'load_source_distribution: {file} has {len(test)} examples.')

        if test[-1] == '':
            del test[-1]
        
        #if source_distribution.size == 0:
        #    source_distribution = np.zeros(len(test))

        source_distribution = source_distribution + [generate_probabilities_array(test[i]) for i in range(len(test))]

    #source_distribution /= N_FOLDS
    return np.array(source_distribution)

In [6]:
def load_target_features(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]
    
    for file in files:
        if '.npz' in file:
            test = np.load(path + '/' + file)
            return np.matrix(test['examples'])

In [7]:
def load_target_distribution(path, fold=0):
    files = [f for f in listdir(path) if isfile(join(path, f))]
    
    if '.json' in files[0]:
        test = np.load(path + '/' + files[1])
    else:
        test = np.load(path + '/' + files[0])
            
    #target_name = path.split('/')[-1].upper()
    #print(f'load_target_distribution: {target_name} has {N_FOLDS} folds.')
    
    N_EXAMPLES = len(test['proba'][fold])
    target_distribution = [test['proba'][fold][i][1] for i in range(N_EXAMPLES)]
    
    return np.array(target_distribution)

## Smoothing and Metrics

In [None]:
def lidstone_smoothing(samples,alpha):
    # P(x’/positive)= (number of reviews with x’ and target_outcome=positive + α) / (N + α*k)
    # alpha(α) represents the smoothing parameter,
    # K represents the dimensions(no of features) in the data,
    # N represents the number of reviews with target_outcome=positive
    
    D = dict()
    
    train_unique_feature_values = training_data[training_data['label'] == category][column_name].unique()
    
    data_unique_feature_values = data[column_name].unique()
    
    for feature_value in data_unique_feature_values:
        
        if feature_value not in train_unique_feature_values:
            
            D[feature_value] = alpha/(training_data[training_data['label'] == category].shape[0] + (data_unique_feature_values.shape[0]*alpha))
            
        else:
            
            D[feature_value] = np.count_nonzero(training_data[training_data['label'] == category][column_name] == feature_value)/training_data[training_data['label'] == category].shape[0]
            
    return D

In [None]:
#def laplace_smoothing(samples,alpha):
    
#    for sample in samples:
        

In [None]:
def kl_divergence(p,q):
    current = []
    for i,j in zip(p,q):
        #log_pq = np.exp(p/q)
        if (np.log(i/j)) == 0:
        #if (np.exp(p/q)).all() == 0:
            log_pq = 0
            continue
        else:
            log_pq = np.log(i/j)
            #log_pq = np.exp(p/q)
        current.append(i*log_pq)
    return np.sum(np.array(current))

# Calculate Similarities

In [8]:
def calculate_probabilities(fold):
    results = {}
    distributions = {}

    for target in targets:
        for source in sources:
            if source == target:
                continue
            
            #print(source, target, fold)
            
            #if source == 'twitter' and fold > 1:
            #    continue

            #print(f'Starting experiment {source.upper()} -> {target.upper()}')
            experiment = source.upper() + '_' + target.upper()
            results[experiment] = {}
            #print(f'Experiments for {source.upper()} and {target.upper()}')

            #if source.upper() in distributions:
            #    source_distribution = distributions['source_' + source.upper()][:]
            #else:
                # Load source distributions
            my_source_path = f'distributions-tests/CLLs/{source}'
            source_distribution = load_source_distribution(my_source_path, fold)
                #distributions['source_' + source.upper()] = source_distribution[:]

            #print(f'{source.upper()} has {len(source_distribution)} examples.')

            #if target.upper() in distributions:
            #    target_distribution = distributions['target_' + target.upper()][:]
            #else:
                # Load target distributions
            my_target_path = f'logs/{target}'
            target_distribution = load_target_distribution(my_target_path, fold)
                #distributions['target_' + target.upper()] = target_distribution[:]

            #print(f'{target.upper()} has {len(target_distribution)} examples.')

            if(len(source_distribution) != len(target_distribution)):
                #source_distribution, target_distribution = shape_arrays_to_same_length_dumb_way(source_distribution, target_distribution)
                source_distribution,target_distribution = use_histograms(source_distribution,target_distribution)            
            
            results[experiment]['JSD'] = distance.jensenshannon(source_distribution,target_distribution)**2
            results[experiment]['bckp'] = distance.jensenshannon(source_distribution,target_distribution)
            #ztest, propability_value = stests.ztest(source_distribution, target_distribution, value=146)
            #results[experiment]['z-test'] = float(propability_value)
            #results[experiment]['Euclidean Distance'] = distance.euclidean(source_distribution,target_distribution)
            #results[experiment]['Manhattan Distance'] = cityblock(source_distribution,target_distribution)
            #results[experiment]['Earth Movers Distance'] = wasserstein_distance(source_distribution, target_distribution)
            #stat, p_value = ttest_ind(source_distribution, target_distribution)
            #print(f"{experiment.upper()} -- t-test: statistic={stat:.4f}, p-value={p_value:.4f}")
            #stat, p_value = mannwhitneyu(source_distribution, target_distribution)
            #print(f" Mann–Whitney U Test: statistic={stat:.4f}, p-value={p_value:.4f}")
            #stat, p_value = chisquare(source_distribution, target_distribution)
            #print(f"Chi-squared Test: statistic={stat:.4f}, p-value={p_value:.4f}")

            #del source_distribution
        #del target_distribution
    #del distributions
    return results

# Results

In [9]:
N_FOLDS = 5
for i in range(N_FOLDS):
    results = calculate_probabilities(i)
    
    if i == 0:
        df_add = pd.DataFrame(results)
    else:
        df_results = pd.DataFrame(results)
        df_add = df_add.add(df_results, fill_value = 0)
#df_add = df_add/N_FOLDS

In [None]:
df_add#.sort_values(by=['KL Divergence'], axis = 1)

## Visualization

In [None]:
for source in sources:
    for target in targets:
        if source == target:
            continue
        plot_histogram(distributions['source_' + source.upper()], source, distributions['target_' + target.upper()], target)

In [None]:
for source in sources:
    for target in targets:
        if source == target:
            continue
        plot_kde(distributions['source_' + source.upper()], source, distributions['target_' + target.upper()], target)

## Statistics

### IMDB

In [10]:
df_IMDB = df_add[['CORA_IMDB', 'TWITTER_IMDB', 'UWCSE_IMDB']]

In [11]:
df_IMDB = df_IMDB/5

In [12]:
print(df_IMDB.to_latex(index=True))

\begin{tabular}{lrrr}
\toprule
{} &  CORA\_IMDB &  TWITTER\_IMDB &  UWCSE\_IMDB \\
\midrule
JSD  &   0.574002 &      0.257874 &    0.257545 \\
bckp &   0.757321 &      0.505428 &    0.498705 \\
\bottomrule
\end{tabular}



  print(df_IMDB.to_latex(index=True))


### UWCSE

In [13]:
df_UWCSE = df_add[['CORA_UWCSE', 'IMDB_UWCSE', 'TWITTER_UWCSE']]

In [14]:
df_UWCSE = df_UWCSE / 5

In [15]:
print(df_UWCSE.to_latex(index=True))

\begin{tabular}{lrrr}
\toprule
{} &  CORA\_UWCSE &  IMDB\_UWCSE &  TWITTER\_UWCSE \\
\midrule
JSD  &    0.619087 &    0.123431 &       0.179144 \\
bckp &    0.786714 &    0.350986 &       0.422459 \\
\bottomrule
\end{tabular}



  print(df_UWCSE.to_latex(index=True))


### Cora

In [16]:
df_CORA = df_add[['IMDB_CORA', 'TWITTER_CORA', 'UWCSE_CORA']]

In [17]:
df_CORA = df_CORA / 5

In [18]:
print(df_CORA.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  IMDB\_CORA &  TWITTER\_CORA &  UWCSE\_CORA \\
\midrule
JSD  &   0.278352 &      0.337812 &    0.318033 \\
bckp &   0.527379 &      0.580828 &    0.563612 \\
\bottomrule
\end{tabular}



  print(df_CORA.to_latex())


### Twitter

In [19]:
df_TWITTER = df_add[['CORA_TWITTER', 'IMDB_TWITTER', 'UWCSE_TWITTER']]

In [20]:
df_TWITTER = df_TWITTER / 5

In [21]:
print(df_TWITTER.to_latex())

\begin{tabular}{lrrr}
\toprule
{} &  CORA\_TWITTER &  IMDB\_TWITTER &  UWCSE\_TWITTER \\
\midrule
JSD  &      0.655241 &      0.186627 &       0.388452 \\
bckp &      0.809262 &      0.431531 &       0.615872 \\
\bottomrule
\end{tabular}



  print(df_TWITTER.to_latex())
