# Algorithm:

1) Load the unshuffled latent representation.

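2) Take 1000 random contigs C. For each C:
    Calculate the Pearson correlation between C and all other contigs
    Save the 99.9th percentile of those correlations for later
    Make a histogram of the correlations (remember to subtract 1 from the count of the
    bin containing Pearson = 1; that entry is Pearson(C, C), the self-correlation)
    Add the histogram to sum_histograms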
    
3) Fit KDE:
    Place a normal distribution on each histogram bin, scaled by that bin's value in sum_histograms
    (you don't even need scipy.stats.norm; just precalculate the pdf)
    
4) Manually differentiate KDE - just do (y1 - y0)/(x1 - x0) for each consecutive y, x pair

5) Find the biggest peak

6) Find zeros in diff after biggest peak where it goes from positive to negative

7) If there are not exactly two peaks in total (the biggest + one found above):
       Fall back to taking the 99.9th percentile of Pearson

   Else:
       Determine the standard deviation of each peak, looking at the left half
       for the left peak and the right half for the right peak

       If the two peaks are not a certain number of standard deviations from each other:
           Fall back to taking the 99.9th percentile of Pearson
        

In [49]:
import sys
import os
import argparse
import numpy as np
import random
from math import sqrt

if __package__ is None or __package__ == '':
    import vambtools
    
else:
    import vamb.vambtools as vambtools

In [35]:
# Precomputed Gaussian kernel for the kernel density estimation:
# the PDF of a normal distribution with µ=0, σ=0.015, evaluated at 200 points
# from -0.4975 to 0.4975 in steps of 0.005. The values match the output of
# scipy.stats.norm(scale=0.015).pdf(np.linspace(-0.4975, 0.4975, 200));
# they are hard-coded so that scipy is not needed to build the kernel.
# The table is symmetric: entry i equals entry 199 - i.

normalpdf = np.array([3.60430797e-238, 2.15804846e-233, 1.15623351e-228, 5.54338405e-224,
       2.37820584e-219, 9.12996205e-215, 3.13641528e-210, 9.64146896e-206,
       2.65214895e-201, 6.52826346e-197, 1.43794572e-192, 2.83421224e-188,
       4.99881774e-184, 7.88946029e-180, 1.11422358e-175, 1.40812903e-171,
       1.59241997e-167, 1.61145372e-163, 1.45922787e-159, 1.18242459e-155,
       8.57371116e-152, 5.56300308e-148, 3.22994265e-144, 1.67812914e-140,
       7.80191120e-137, 3.24579947e-133, 1.20833520e-129, 4.02529972e-126,
       1.19992494e-122, 3.20077386e-119, 7.64013375e-116, 1.63189416e-112,
       3.11909121e-109, 5.33469037e-106, 8.16460951e-103, 1.11816702e-099,
       1.37032314e-096, 1.50274180e-093, 1.47465625e-090, 1.29491805e-087,
       1.01751034e-084, 7.15451889e-082, 4.50160201e-079, 2.53453805e-076,
       1.27695499e-073, 5.75701585e-071, 2.32254596e-068, 8.38448255e-066,
       2.70852764e-063, 7.82952357e-061, 2.02526780e-058, 4.68786004e-056,
       9.70983553e-054, 1.79967535e-051, 2.98484319e-049, 4.42990007e-047,
       5.88316998e-045, 6.99155730e-043, 7.43500909e-041, 7.07512543e-039,
       6.02465026e-037, 4.59065437e-035, 3.13013024e-033, 1.90983249e-031,
       1.04273321e-029, 5.09443694e-028, 2.22722572e-026, 8.71319300e-025,
       3.05025039e-023, 9.55517636e-022, 2.67847120e-020, 6.71862360e-019,
       1.50806027e-017, 3.02902003e-016, 5.44415709e-015, 8.75596887e-014,
       1.26015169e-012, 1.62288036e-011, 1.87023094e-010, 1.92863049e-009,
       1.77970441e-008, 1.46957509e-007, 1.08587728e-006, 7.17984003e-006,
       4.24809135e-005, 2.24914774e-004, 1.06558274e-003, 4.51753399e-003,
       1.71380237e-002, 5.81788463e-002, 1.76731730e-001, 4.80406651e-001,
       1.16855337e+000, 2.54350823e+000, 4.95407757e+000, 8.63450638e+000,
       1.34665790e+001, 1.87941250e+001, 2.34710218e+001, 2.62293144e+001,
       2.62293144e+001, 2.34710218e+001, 1.87941250e+001, 1.34665790e+001,
       8.63450638e+000, 4.95407757e+000, 2.54350823e+000, 1.16855337e+000,
       4.80406651e-001, 1.76731730e-001, 5.81788463e-002, 1.71380237e-002,
       4.51753399e-003, 1.06558274e-003, 2.24914774e-004, 4.24809135e-005,
       7.17984003e-006, 1.08587728e-006, 1.46957509e-007, 1.77970441e-008,
       1.92863049e-009, 1.87023094e-010, 1.62288036e-011, 1.26015169e-012,
       8.75596887e-014, 5.44415709e-015, 3.02902003e-016, 1.50806027e-017,
       6.71862360e-019, 2.67847120e-020, 9.55517636e-022, 3.05025039e-023,
       8.71319300e-025, 2.22722572e-026, 5.09443694e-028, 1.04273321e-029,
       1.90983249e-031, 3.13013024e-033, 4.59065437e-035, 6.02465026e-037,
       7.07512543e-039, 7.43500909e-041, 6.99155730e-043, 5.88316998e-045,
       4.42990007e-047, 2.98484319e-049, 1.79967535e-051, 9.70983553e-054,
       4.68786004e-056, 2.02526780e-058, 7.82952357e-061, 2.70852764e-063,
       8.38448255e-066, 2.32254596e-068, 5.75701585e-071, 1.27695499e-073,
       2.53453805e-076, 4.50160201e-079, 7.15451889e-082, 1.01751034e-084,
       1.29491805e-087, 1.47465625e-090, 1.50274180e-093, 1.37032314e-096,
       1.11816702e-099, 8.16460951e-103, 5.33469037e-106, 3.11909121e-109,
       1.63189416e-112, 7.64013375e-116, 3.20077386e-119, 1.19992494e-122,
       4.02529972e-126, 1.20833520e-129, 3.24579947e-133, 7.80191120e-137,
       1.67812914e-140, 3.22994265e-144, 5.56300308e-148, 8.57371116e-152,
       1.18242459e-155, 1.45922787e-159, 1.61145372e-163, 1.59241997e-167,
       1.40812903e-171, 1.11422358e-175, 7.88946029e-180, 4.99881774e-184,
       2.83421224e-188, 1.43794572e-192, 6.52826346e-197, 2.65214895e-201,
       9.64146896e-206, 3.13641528e-210, 9.12996205e-215, 2.37820584e-219,
       5.54338405e-224, 1.15623351e-228, 2.15804846e-233, 3.60430797e-238])

In [12]:
# Load in latent representation
def load_latent(path):
    """Load a tab-separated latent representation and normalise its rows.

    Parameters
    ----------
    path : str or path-like
        Text file of tab-separated numeric values (presumably one row per
        contig -- confirm against the file writer).

    Returns
    -------
    numpy.ndarray
        2-D float32 matrix, normalised in place by ``vambtools.zscore``
        along axis 1 (i.e. row-wise).
    """
    # np.loadtxt's keyword is ``delimiter``; ``sep`` is not accepted and
    # raises TypeError. ``ndmin=2`` keeps a single-row file two-dimensional
    # so that axis 1 exists for the z-scoring below.
    matrix = np.loadtxt(path, delimiter='\t', dtype=np.float32, ndmin=2)

    vambtools.zscore(matrix, axis=1, inplace=True)

    return matrix

In [None]:
def pearson(matrix, rownumber):
    """Return the Pearson correlation of one row against every row.

    Assumes each row of ``matrix`` has already been z-scored (mean 0,
    population standard deviation 1); under that assumption the dot product
    of two rows divided by the number of columns is their Pearson
    correlation.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array of shape (n_rows, n_columns).
    rownumber : int
        Index of the row to correlate against all rows.

    Returns
    -------
    numpy.ndarray
        1-D array of length n_rows; the entry at ``rownumber`` is the row's
        correlation with itself.
    """
    # The original body indexed with an undefined name ``n``; the row to use
    # is the ``rownumber`` argument.
    dotproducts = matrix @ matrix[rownumber]
    pearson_correlations = dotproducts / matrix.shape[1]

    return pearson_correlations

In [22]:
# 2) Take 1000 random contigs C:
#     Calculate Pearson to all other contigs
#     Save the 99.9% percentile value for later
#     Make histogram of pearsons (remember to remove 1 from pearson = 1, that's Pearson(C, C))
#     Add histogram to sum_histograms

def sample_contigs(matrix, nsamples=250):
    """Sample random rows and summarise their Pearson correlations.

    For each sampled row, the Pearson correlation to all *other* rows is
    computed with ``pearson``. The correlations are binned into 400 equal
    bins over [-1, 1] and accumulated, and their 99.9th percentile is
    recorded.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D array whose rows are the items to correlate (expected to be
        z-scored row-wise, as ``pearson`` requires).
    nsamples : int, optional
        Number of rows to sample without replacement. If the matrix has
        fewer rows than this, every row is used once.

    Returns
    -------
    sum_histograms : numpy.ndarray
        Length-400 array with the summed histogram counts.
    topthousands : list of float
        One 99.9th-percentile Pearson value per sampled row.
    """
    length = len(matrix)
    topthousands = list()
    sum_histograms = np.zeros(400)
    bins = np.linspace(-1, 1, 401)

    # random.sample raises ValueError when asked for more items than exist.
    nsamples = min(nsamples, length)

    for row in random.sample(range(length), k=nsamples):
        # Drop the self-correlation Pearson(C, C) by index rather than by
        # decrementing the last bin: float rounding can push it just above 1,
        # outside the histogram range, which would leave a count of -1.
        pearsons = np.delete(pearson(matrix, row), row)

        if pearsons.size == 0:
            # Single-row matrix: there is nothing to correlate against.
            continue

        # np.histogram returns (counts, bin_edges) -- counts come first.
        numbers, _ = np.histogram(pearsons, bins=bins)

        sum_histograms += numbers
        # The percentile is taken over the correlations themselves, not over
        # the histogram counts.
        topthousands.append(np.percentile(pearsons, 99.9))

    return sum_histograms, topthousands

In [37]:
# 3) Fit KDE:
#     Fit a normal distribution to each point in hist multiplied by value of sum_histograms
#     (you don't even need scipy.stats.norm, just precalculate the pdf)

def kernel_density_estimation(numbers):
    """Smooth histogram counts with the precomputed normal kernel.

    A copy of ``normalpdf`` (200 values), scaled by the count, is added at
    the offset of each entry in ``numbers``. The accumulation buffer has 600
    slots; the first and last 100 are trimmed before returning, so 400 input
    bins yield 400 density values.

    Parameters
    ----------
    numbers : sequence of numbers
        Histogram counts, one per bin.

    Returns
    -------
    numpy.ndarray
        The accumulated density with 100 entries removed from each end.
    """
    kernel_width = 200
    padded = np.zeros(600)

    for offset, weight in enumerate(numbers):
        # ``window`` is a view into ``padded``, so the in-place add
        # accumulates directly into the buffer.
        window = padded[offset:offset + kernel_width]
        window += normalpdf * weight

    return padded[100:-100]

In [41]:
# 5) Find the biggest peak
# 6) Find zeros in diff after biggest peak where it goes from positive to negative

def findzeros(density):
    """Locate the biggest density peak and any local maxima to its right.

    The density is assumed to be sampled at 400 evenly spaced x values from
    -0.9975 to 0.9975. Starting right after the global maximum, the
    difference between consecutive samples is followed; wherever the slope
    turns from positive to negative, the x of the sample just before the
    drop (the local maximum) is recorded.

    Parameters
    ----------
    density : numpy.ndarray
        Density values, one per x position.

    Returns
    -------
    list of float
        x position of the biggest peak first, followed by the x positions of
        later peaks in increasing order. Peaks left of the biggest peak are
        not searched for.
    """
    density_xs = np.linspace(-0.9975, 0.9975, 400)
    biggestpeak_index = int(np.argmax(density))
    peaks = [density_xs[biggestpeak_index]]

    slope_is_positive = False
    previous_x = density_xs[biggestpeak_index]
    previous_y = density[biggestpeak_index]

    # Slice from the sample directly after the biggest peak so that every
    # consecutive pair is compared exactly once (no sample is skipped).
    start = biggestpeak_index + 1
    for x, y in zip(density_xs[start:], density[start:]):
        diff = y - previous_y

        if diff > 0:
            slope_is_positive = True

        elif diff < 0:
            if slope_is_positive:
                # The curve was rising and now falls: the previous sample is
                # the local maximum.
                peaks.append(previous_x)

            slope_is_positive = False

        # diff == 0 (a plateau) leaves the slope state untouched.
        previous_x, previous_y = x, y

    return peaks

In [71]:
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline

UsageError: Line magic function `%matplitlib` not found.


In [112]:
def calc_std_leftpeak(xs, ys, peakx):
    """Estimate a peak's standard deviation from its left half.

    Computes the square root of the ``ys``-weighted mean squared distance to
    ``peakx``, using only the points with ``x <= peakx``.

    Parameters
    ----------
    xs : iterable of float
        x positions in ascending order.
    ys : iterable of float
        Weights (e.g. density values) at the corresponding ``xs``.
    peakx : float
        x position of the peak.

    Returns
    -------
    float
        The weighted standard deviation around ``peakx`` of the left half.
    """
    std = 0
    totaln = 0

    for x, y in zip(xs, ys):
        # Stop at the first point beyond the peak. Comparing with ``>``
        # instead of waiting for ``x == peakx`` keeps the right-hand side of
        # the curve out of the sum even when ``peakx`` is not exactly one of
        # the grid points.
        if x > peakx:
            break

        deltax = x - peakx
        std += y * (deltax * deltax)
        totaln += y

    std /= totaln
    return sqrt(std)

In [113]:
a = scipy.stats.norm(scale=0.71).pdf(np.linspace(-5, 5, 1001))

In [114]:
calc_std_leftpeak(np.linspace(-5, 5, 1001), a, 0)

0.7080136554684734

In [None]:
# 7) If there are not two total peaks (biggest + one found above):
#     Fall back to taking the 99.9% percentile of Pearson
    
#     Else:
#         Determine standard dev of peak by looking at the left half for left peak,
#         right half for right peak
        
#         If the two peaks are not a certain number of std devs from each other:
#         Fall back to taking the 99.9% percentile of Pearson

def validatepeaks(peaks, density):
    """Check whether the detected peaks are usable (unfinished).

    Only the number of peaks is checked so far: anything other than exactly
    two peaks gives ``False``. With exactly two peaks the function currently
    returns ``None``, because the standard-deviation separation test of
    step 7 has not been written yet. ``density`` is not used yet.
    """
    # x positions of the density samples; not used yet.
    density_xs = np.linspace(-0.9975, 0.9975, 400)

    if len(peaks) == 2:
        # TODO: determine the standard deviation of each peak and check that
        # the two peaks are far enough apart.
        return None

    return False

In [21]:
np.percentile([5,3,8,2,1,1,8,9,9,6], 99.9)

9.0

In [19]:
random.sample(range(9), k=10)

ValueError: Sample larger than population or is negative

In [4]:
import numpy as np

In [26]:
(-0.5 + -0.495) / 2

-0.4975

In [27]:
np.linspace(-0.4975, 0.4975, 200)

array([-0.4975, -0.4925, -0.4875, -0.4825, -0.4775, -0.4725, -0.4675,
       -0.4625, -0.4575, -0.4525, -0.4475, -0.4425, -0.4375, -0.4325,
       -0.4275, -0.4225, -0.4175, -0.4125, -0.4075, -0.4025, -0.3975,
       -0.3925, -0.3875, -0.3825, -0.3775, -0.3725, -0.3675, -0.3625,
       -0.3575, -0.3525, -0.3475, -0.3425, -0.3375, -0.3325, -0.3275,
       -0.3225, -0.3175, -0.3125, -0.3075, -0.3025, -0.2975, -0.2925,
       -0.2875, -0.2825, -0.2775, -0.2725, -0.2675, -0.2625, -0.2575,
       -0.2525, -0.2475, -0.2425, -0.2375, -0.2325, -0.2275, -0.2225,
       -0.2175, -0.2125, -0.2075, -0.2025, -0.1975, -0.1925, -0.1875,
       -0.1825, -0.1775, -0.1725, -0.1675, -0.1625, -0.1575, -0.1525,
       -0.1475, -0.1425, -0.1375, -0.1325, -0.1275, -0.1225, -0.1175,
       -0.1125, -0.1075, -0.1025, -0.0975, -0.0925, -0.0875, -0.0825,
       -0.0775, -0.0725, -0.0675, -0.0625, -0.0575, -0.0525, -0.0475,
       -0.0425, -0.0375, -0.0325, -0.0275, -0.0225, -0.0175, -0.0125,
       -0.0075, -0.0

In [28]:
scipy.stats.norm(scale=0.015).pdf(np.linspace(-0.4975, 0.4975, 200))

array([3.60430797e-238, 2.15804846e-233, 1.15623351e-228, 5.54338405e-224,
       2.37820584e-219, 9.12996205e-215, 3.13641528e-210, 9.64146896e-206,
       2.65214895e-201, 6.52826346e-197, 1.43794572e-192, 2.83421224e-188,
       4.99881774e-184, 7.88946029e-180, 1.11422358e-175, 1.40812903e-171,
       1.59241997e-167, 1.61145372e-163, 1.45922787e-159, 1.18242459e-155,
       8.57371116e-152, 5.56300308e-148, 3.22994265e-144, 1.67812914e-140,
       7.80191120e-137, 3.24579947e-133, 1.20833520e-129, 4.02529972e-126,
       1.19992494e-122, 3.20077386e-119, 7.64013375e-116, 1.63189416e-112,
       3.11909121e-109, 5.33469037e-106, 8.16460951e-103, 1.11816702e-099,
       1.37032314e-096, 1.50274180e-093, 1.47465625e-090, 1.29491805e-087,
       1.01751034e-084, 7.15451889e-082, 4.50160201e-079, 2.53453805e-076,
       1.27695499e-073, 5.75701585e-071, 2.32254596e-068, 8.38448255e-066,
       2.70852764e-063, 7.82952357e-061, 2.02526780e-058, 4.68786004e-056,
       9.70983553e-054, 1

In [None]:
# Scratch cell: attach a per-CNV tally to each CNV.
# NOTE(review): ``counter`` is created but never used in this cell.
counter = Counter()

for cnv in cnvs:
    # Counts how often each value of field 9 occurs among this CNV's variants
    # (presumably the variant's class label -- confirm against the data).
    cnvcounter = Counter()
    
    for variant in cnv:
        class_ = variant[9]
        
        cnvcounter[class_] += 1
        
    # The tally is appended to the CNV itself, after iteration has finished,
    # so it becomes the CNV's last element.
    cnv.append(cnvcounter)

In [None]:
{'CN1': 1}

In [88]:
counter = Counter()

In [89]:
counter['hej']

0

In [90]:
counter

Counter()

In [None]:
(cnv, class_)

In [87]:
counter = Counter()

dictionary[5] += dictionary.get(5, 0)

In [82]:
# Empty placeholder class; it defines no attributes or methods of its own.
class A:
    ...

In [84]:
# Example record layout. Every entry must be followed by a comma; without the
# one after the first entry the literal does not parse.
a = {'frequency': 0.9,
    'variants': [varinsj, fjklf, fdjsfk],
    'numberof ': 1}

In [72]:
from collections import Counter

In [74]:
a = Counter()

In [75]:
a[0] = 1

In [79]:
a[5] += 1

In [80]:
a

Counter({0: 1, 5: 1})

In [81]:
a + a

Counter({0: 2, 5: 2})