In [0]:
%pyspark
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import glob
from collections import Counter
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import io
import requests
import os
from pyspark import SparkFiles
from pyspark.sql import functions as F
from pyspark.sql.functions import col,collect_list, concat, lit, when, monotonically_increasing_id, concat_ws 
from pyspark.sql.types import StringType

In [1]:
%pyspark

# Occurrence rows belonging to study SD_NMVV8A1Y.
t_ocr = spark.table('occurrences').where(F.col('study_id').isin('SD_NMVV8A1Y'))

# Occurrence columns carried through the rest of the notebook.
select_ocr_cols = [
    'biospecimen_id', 'chromosome', 'start', 'end', 'reference', 'alternate',
    'name', 'hgvsg', 'variant_class', 'is_proband', 'quality', 'info_dp',
]

# Key every occurrence as "chromosome:start:reference:alternate" so it can be
# joined against the consequence and CADD tables.
t_ocr_select = t_ocr.select(select_ocr_cols).withColumn(
    'unique_variant_id',
    concat(
        col("chromosome"), lit(":"),
        col("start"), lit(':'),
        col('reference'), lit(':'),
        col('alternate'),
    ),
)
t_ocr_select.count()

In [2]:
%pyspark

# Occurrences restricted to the Kidney and Urinary Tract Defects Cohort Study ID.
t_ocr_KUTD = spark.table('occurrences').where(
    F.col('study_id').isin('SD_NMVV8A1Y')
)

#bs_id_KUTD = t_ocr_KUTD.select('biospecimen_id').distinct().toPandas()['biospecimen_id']


In [3]:
%pyspark
#t_ocr_CHD = spark \
#    .table('occurrences') \
#    .where((F.col('study_id').isin('SD_PREASA7S'))) # Congenital Heart Defects Cohort Study ID
    
#bs_id_CHD = t_ocr_CHD.select('biospecimen_id').distinct().toPandas()['biospecimen_id']

In [4]:
%pyspark
# High-confidence de novo calls observed in probands.
# NOTE(review): 'is_hi_conf_denovo' is not among the columns selected into
# t_ocr_select, so this filter depends on Spark resolving it from the
# underlying occurrences table -- confirm, or add it to the selected columns.
denovos = t_ocr_select.where(
    (col('is_hi_conf_denovo') == True)
    & (col('is_proband') == True)
)  #  | (col('Impact') == 'High')
denovos.count()

In [5]:
%pyspark
#denovo_bs_id = denovos.select('biospecimen_id').distinct().toPandas()['biospecimen_id']
#denovo_bs_id

In [6]:
%pyspark
# Consequence rows for the study, limited to rows flagged original_canonical.
# `name` is renamed to rsID and `variant_class` is dropped; both names also
# exist on the occurrence columns selected earlier.
t_csq = (
    spark.table('consequences')
    .withColumnRenamed('name', 'rsID')
    .drop('variant_class')
    .where(
        (F.array_contains(F.col('study_ids'), 'SD_NMVV8A1Y'))
        & (F.col('original_canonical') == 'true')
    )
)

# Keep the annotation columns of interest and add the same
# "chromosome:start:reference:alternate" key used for the occurrences.
t_csq_select = t_csq.select(
    [
        'chromosome', 'start', 'reference', 'alternate', 'rsID', 'impact', 'symbol',
        'biotype', 'hgvsc', 'hgvsg', 'feature_type', 'strand', 'consequences',
    ]
).withColumn(
    'unique_variant_id',
    concat(
        col("chromosome"), lit(":"),
        col("start"), lit(':'),
        col('reference'), lit(':'),
        col('alternate'),
    ),
)

#gene_var_map = t_csq_select.select('unique_variant_id','symbol').drop_duplicates(['unique_variant_id'])
t_csq_select.count()

In [7]:
%pyspark

# --- Protein-coding variants that have a score in the CADD "gt20" tables ---

# get all pc vars
all_pcs = t_csq_select.where(col('biotype') == 'protein_coding')

# A variant is handled as an indel when either allele is longer than one base;
# everything else goes down the SNP path.
is_indel = (F.length("reference") > 1) | (F.length("alternate") > 1)
all_pcs_snps = all_pcs.where(~is_indel)
all_pcs_indels = all_pcs.where(is_indel)

# score snps: left-join the SNP score table, then keep only rows that got a score
cadd_snps_gt20 = spark.read.parquet(
    "s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/cadd_scores_gt20/cadd_gt20_snps_2.parquet"
)
all_pcs_snps_gt20_scored = all_pcs_snps.join(
    cadd_snps_gt20.select('unique_variant_id', 'cadd_score'),
    on='unique_variant_id',
    how='left',
)
all_pcs_snps_gt20_scored_nonNUll = all_pcs_snps_gt20_scored.where(col('cadd_score').isNotNull())
#print(all_pcs_snps_gt20_scored_nonNUll.count())

# score indels: same procedure against the indel score table
cadd_indels_gt20 = spark.read.parquet(
    "s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/cadd_scores_gt20/cadd_gt20_gnomAD_all_chroms.parquet"
)
all_pcs_indels_gt20_scored = all_pcs_indels.join(
    cadd_indels_gt20.select('unique_variant_id', 'cadd_score'),
    on='unique_variant_id',
    how='left',
)
all_pcs_indels_gt20_scored_nonNUll = all_pcs_indels_gt20_scored.where(col('cadd_score').isNotNull())
#print(all_pcs_indels_gt20_scored_nonNUll.count())

# union (positional; both frames share the same column order by construction)
all_pcs_gt20_scored = all_pcs_snps_gt20_scored_nonNUll.union(all_pcs_indels_gt20_scored_nonNUll)

# get all proband variants; coordinate/hgvsg columns are dropped here because
# the scored consequence frame supplies them after the join
pbs = (
    t_ocr_select
    .where(col('is_proband') == True)
    .drop('chromosome', 'start', 'reference', 'alternate', 'hgvsg')
)

# join together so we have all proband protein coding variants (not just a unique list of pc vars)
all_pcs_gt20_perPerson = pbs.join(all_pcs_gt20_scored, on='unique_variant_id', how='left')

# proband occurrences with no scored protein-coding match have a null cadd_score; drop them
all_pcs_gt20_perPerson_nonNull = all_pcs_gt20_perPerson.where(col('cadd_score').isNotNull())
#all_pcs_gt20_perPerson_nonNull.show()
#all_pcs_gt20_perPerson_nonNull.count()

# add identifier cols: these rows belong to the gt20 category only
all_pcs_gt20_perPerson_nonNull = (
    all_pcs_gt20_perPerson_nonNull
    .withColumn('High_Impact_boolean', lit(0))
    .withColumn('DeNovo_boolean', lit(0))
    .withColumn('gt20_boolean', lit(1))
)
all_pcs_gt20_perPerson_nonNull.count()

In [8]:
%pyspark
# Annotate the proband de novo calls with consequence information and keep
# only those whose impact is HIGH or MODERATE.
csq_annotations = t_csq_select.select('unique_variant_id', 'impact', 'biotype', 'consequences')
denovos_hiMod = (
    denovos
    .join(csq_annotations, on='unique_variant_id', how='left')
    .where(col('impact').isin('HIGH', 'MODERATE'))
)

#denovos_hiMod = denovos_hiMod.drop_duplicates(['unique_variant_id'])
print(denovos_hiMod.count())
denovos_hiMod.show(5)

In [9]:
%pyspark
# De novo HIGH/MODERATE SNPs: neither allele is longer than one base.
# The frame is reused for every chromosome below (a filter plus several
# count() actions per iteration), so it is cached once instead of being
# recomputed from the source tables each time.
denovos_hiMod_SNPs = denovos_hiMod.where(  ~( (F.length("reference") > 1) | (F.length("alternate") > 1) ) ).cache()

# NOTE(review): '21' is absent from this list, so de novo SNPs on chromosome 21
# never reach deNovo_SNPs_full. Confirm whether that is intentional (e.g. no
# per-chromosome CADD file exists for chr21) before adding it.
chroms = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '22', '3', '4', '5', '6', '7','8', '9', 'X','Y']

for n,CHROM in enumerate(chroms):
    # Per-chromosome CADD table; its columns are positional ("0".."5").
    cadd_chrom = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/cadd_scores/cadd_chrom_files_parquet/chr{CHROM}")

    # The score column is aliased in lowercase so it matches the 'cadd_score'
    # name selected below and used by every other frame in this notebook.
    # (It was previously aliased 'CADD_score', which only resolved because
    # Spark's column resolution is case-insensitive by default.)
    cadd_chrom = cadd_chrom.select(col("0").alias("chromosome"), col("1").alias("start"),col("2").alias("reference"),col("3").alias("alternate"),
                                   col('5').alias("cadd_score")).withColumn('unique_variant_id',concat(col("chromosome"), lit(":"),
                                   col("start"),lit(':'),col('reference'),lit(':'),col('alternate')))

    denovos_hiMod_SNPs_chrom = denovos_hiMod_SNPs.where(col("chromosome") == CHROM)

    # Diagnostic only: both counts are Spark actions (the CADD one scans the
    # whole per-chromosome table).
    print(f"Joining CADD scores for chromosome {CHROM} ({denovos_hiMod_SNPs_chrom.count()} x {cadd_chrom.count()})...",end="")

    # Left join: de novo SNPs without a CADD entry are kept with a null score.
    deNovo_results_chrom = denovos_hiMod_SNPs_chrom.join(cadd_chrom.select('unique_variant_id','cadd_score'), ['unique_variant_id'], "left")

    print(f"Done...Result is {deNovo_results_chrom.count()} rows long.")

    # Accumulate the per-chromosome results into one frame.
    if n == 0: deNovo_SNPs_full =  deNovo_results_chrom
    else: deNovo_SNPs_full = deNovo_SNPs_full.union(deNovo_results_chrom)

In [10]:
%pyspark
# De novo HIGH/MODERATE indels: at least one allele is longer than one base.
denovos_hiMod_indels = denovos_hiMod.where(
    (F.length("reference") > 1) | (F.length("alternate") > 1)
)
# CADD score table read from cadd_gnomad_all.parquet (used for the indel join).
cadd_gnomad_chrom = spark.read.parquet(
    's3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/cadd_scores/cadd_gnomad_all.parquet'
)
'''
for n,CHROM in enumerate(chroms):

    cadd_chrom =  cadd_gnomad_chrom.where(col('chromosome') == CHROM)
    denovos_hiMod_indels_chrom = denovos_hiMod_indels.where(col('chromosome') == CHROM)

    num2join = denovos_hiMod_indels_chrom.count()
    print(f'Joining CADD scores for chromosome {CHROM} ({cadd_chrom.count()} x {denovos_hiMod_indels_chrom.count()})...',end='')
    
    deNovo_results_chrom = denovos_hiMod_indels_chrom.join(cadd_chrom.select('unique_variant_id','cadd_score'), ["unique_variant_id"], "left")
    
    print(f'Done...Result is {deNovo_results_chrom.count()} rows long.')
    
    if n == 0: deNovo_indels_full =  deNovo_results_chrom
    else: deNovo_indels_full = deNovo_indels_full.union(deNovo_results_chrom)'''

In [11]:
%pyspark

# Left-join CADD scores onto the de novo indels in a single pass (no
# per-chromosome loop); indels without a match keep a null cadd_score.
deNovo_indels_full = denovos_hiMod_indels.join(
    cadd_gnomad_chrom.select('unique_variant_id', 'cadd_score'),
    on=["unique_variant_id"],
    how="left",
)
deNovo_indels_full.count()

In [12]:
%pyspark

# union() matches columns by position, so both frames must list the same
# columns in the same order before they are stacked.
assert list(deNovo_indels_full.columns) == list(deNovo_SNPs_full.columns)

# Stack SNPs and indels, then tag every row as belonging to the de novo category.
deNovos_scored = (
    deNovo_SNPs_full
    .union(deNovo_indels_full)
    .withColumn('DeNovo_boolean', lit(1))
)

print(deNovos_scored.count())
deNovos_scored.show(3)

In [13]:
%pyspark
'''   ALREADY FOUND HIGH IIMPACT VARIANTS AND SCORED THEM, SAVED AS ~/tables/hiImpact_scored_KUTD.parquet

highImpact = t_csq_select.where(col('impact') == 'HIGH').drop('hgvsc','hgvsg','feature_type','strand')
print(highImpact.count())
highImpact.show(3)'''

In [14]:
%pyspark

'''   ALREADY FOUND HIGH IIMPACT VARIANTS AND SCORED THEM, SAVED AS ~/tables/hiImpact_scored_KUTD.parquet

highImpact_SNPs = highImpact.where(  ~( (F.length("reference") > 1) | (F.length("alternate") > 1) ) )
chroms = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '22', '3', '4', '5', '6', '7','8', '9', 'X','Y']

for n,CHROM in enumerate(chroms):
    cadd_chrom = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/cadd_scores/cadd_chrom_files_parquet/chr{CHROM}")

    cadd_chrom = cadd_chrom.select(col("0").alias("chromosome"), col("1").alias("start"),col("2").alias("reference"),col("3").alias("alternate"),
                                   col('5').alias("CADD_score")).withColumn('unique_variant_id',concat(col("chromosome"), lit(":"),
                                   col("start"),lit(':'),col('reference'),lit(':'),col('alternate')))
         
    highImpact_SNPs_chrom = highImpact_SNPs.where(col("chromosome") == CHROM)
    hiImpact_results_chrom = highImpact_SNPs_chrom.join(cadd_chrom.select('unique_variant_id','cadd_score'), ['unique_variant_id'], "left")
    if n == 0: hiImpact_SNPs_full =  hiImpact_results_chrom
    else: hiImpact_SNPs_full = hiImpact_SNPs_full.union(hiImpact_results_chrom)
    
print(highImpact_SNPs.count())
print(hiImpact_SNPs_full.count())'''

In [15]:
%pyspark
'''   ALREADY FOUND HIGH IIMPACT VARIANTS AND SCORED THEM, SAVED AS ~/tables/hiImpact_scored_KUTD.parquet

highImpact_indels = highImpact.where(  (F.length("reference") > 1) | (F.length("alternate") > 1)  )
chroms = ['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '22', '3', '4', '5', '6', '7','8', '9', 'X','Y']

for n,CHROM in enumerate(chroms):
    cadd_chrom = spark.read.parquet(f"s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/cadd_scores/cadd_chrom_files_parquet/chr{CHROM}")

    cadd_chrom = cadd_chrom.select(col("0").alias("chromosome"), col("1").alias("start"),col("2").alias("reference"),col("3").alias("alternate"),
                                   col('5').alias("CADD_score")).withColumn('unique_variant_id',concat(col("chromosome"), lit(":"),
                                   col("start"),lit(':'),col('reference'),lit(':'),col('alternate')))
         
    highImpact_indels_chrom = highImpact_indels.where(col("chromosome") == CHROM)
    hiImpact_results_chrom = highImpact_indels_chrom.join(cadd_chrom.select('unique_variant_id','cadd_score'), ['unique_variant_id'], "left")

    if n == 0: hiImpact_indels_full =  hiImpact_results_chrom
    else: hiImpact_indels_full = hiImpact_indels_full.union(hiImpact_results_chrom) 
print(highImpact_indels.count())
print(hiImpact_indels_full.count())
hiImpact_indels_full.where(col('cadd_score').isNull()).count()'''

#hiImpact_scored = hiImpact_SNPs_full.union(hiImpact_indels_full)
#hiImpact_scored.count()

#hiImpact_scored.repartition(200,'unique_variant_id').write.mode('overwrite').parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/hiImpact_scored_KUTD.parquet')

In [16]:
%pyspark
# Load the previously scored high-impact variants saved as hiImpact_scored_KUTD.parquet.
highImpact_CADD_scores = spark.read.parquet(
    's3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/hiImpact_scored_KUTD.parquet'
)
print(highImpact_CADD_scores.count())
highImpact_CADD_scores.show(5)


In [17]:
%pyspark

#highImpact_perPerson_probands_pc = highImpact_perPerson_probands.where( (col('biotype') == 'protein_coding') )
#print(highImpact_perPerson_probands_pc.count())

#high_impact_perPerson_scored = highImpact_CADD_scores.select('unique_variant_id','cadd_score').join(highImpact_perPerson_probands_pc,'unique_variant_id','right')
#print(high_impact_perPerson_scored.count())

# Attach proband occurrences to the pre-scored high-impact variants.
#
# An inner join is used so that only variants actually carried by a proband
# produce rows. The previous right join also kept every high-impact variant
# with no proband occurrence, which yielded rows whose biospecimen_id (and all
# other occurrence columns) were null in a table that is meant to be per
# person; the eQTL branch of this notebook filters such rows out explicitly.
#
# Coordinate columns are dropped from the occurrence side because the scored
# variant table supplies them.
proband_occurrences = t_ocr_select.where(col('is_proband') == True)\
                                  .drop('chromosome','start','end','reference','alternate')
high_impact_perPerson_scored = proband_occurrences.join(highImpact_CADD_scores,'unique_variant_id','inner')

print(high_impact_perPerson_scored.count())

# Category flags: these rows belong to the high-impact category only.
high_impact_perPerson_scored = high_impact_perPerson_scored.withColumn('High_Impact_boolean',lit(1))\
                                                           .withColumn('DeNovo_boolean',lit(0))\
                                                           .withColumn('gt20_boolean',lit(0))


In [18]:
%pyspark

#denovos_scored_kutd = spark.read.parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/denovos_scored_KIDNEY.parquet/')
#print(denovos_scored_kutd.count())
#denovos_scored_kutd.show()


In [19]:
%pyspark
# One gene symbol per variant key (an arbitrary one is kept when a variant has
# several consequence rows).
gene_var_map = (
    t_csq_select
    .select('unique_variant_id', 'symbol')
    .drop_duplicates(['unique_variant_id'])
)

# Add the gene symbol to the scored de novo rows.
deNovos_scored_sym = deNovos_scored.join(gene_var_map, on='unique_variant_id', how='left')
#print(deNovos_scored_sym.count())
#print(deNovos_scored_sym.where(col('symbol').isNull()).count())

In [20]:
%pyspark
# These rows come from the de novo branch, so the other two category flags are 0.
for flag in ('High_Impact_boolean', 'gt20_boolean'):
    deNovos_scored_sym = deNovos_scored_sym.withColumn(flag, lit(0))

# join rsIDs in (the lookup is not de-duplicated, so a variant with several
# consequence rows yields several output rows here)
rsid_lookup = t_csq_select.select('rsID', 'unique_variant_id')
deNovos_scored_sym_rsID = deNovos_scored_sym.join(rsid_lookup, on='unique_variant_id', how='left')

In [21]:
%pyspark
# Remove columns that are not part of the final combined table.
all_pcs_gt20_perPerson_nonNull = all_pcs_gt20_perPerson_nonNull.drop(
    'end', 'feature_type', 'hgvsc', 'strand'
)
deNovos_scored_sym_rsID = deNovos_scored_sym_rsID.drop('end')

In [22]:
%pyspark
# Columns present in the gt20 frame but missing from the de novo frame.
set(all_pcs_gt20_perPerson_nonNull.columns).difference(deNovos_scored_sym_rsID.columns)

In [23]:
%pyspark
# For each category frame print the raw row count followed by the row count
# after de-duplicating on (biospecimen_id, unique_variant_id).
for frame in (
    all_pcs_gt20_perPerson_nonNull,
    deNovos_scored_sym_rsID,
    high_impact_perPerson_scored,
):
    print(frame.count())
    print(frame.drop_duplicates(['biospecimen_id', 'unique_variant_id']).count())

In [24]:
%pyspark
# Keep a single row per (biospecimen, variant) pair in each category frame.
person_variant_key = ['biospecimen_id', 'unique_variant_id']

all_pcs_gt20_perPerson_nonNull = all_pcs_gt20_perPerson_nonNull.drop_duplicates(person_variant_key)
deNovos_scored_sym_rsID = deNovos_scored_sym_rsID.drop_duplicates(person_variant_key)
high_impact_perPerson_scored = high_impact_perPerson_scored.drop_duplicates(person_variant_key)

In [25]:
%pyspark
# Shared column order for the combined table. union() is positional, so every
# frame is projected onto this exact list before being stacked.
c = [
    'unique_variant_id', 'cadd_score', 'biospecimen_id', 'name', 'hgvsg',
    'variant_class', 'is_proband', 'quality', 'info_dp', 'chromosome',
    'start', 'reference', 'alternate', 'impact', 'rsID', 'symbol', 'biotype',
    'consequences', 'DeNovo_boolean', 'High_Impact_boolean', 'gt20_boolean',
]

# de novo rows, then high-impact rows, then CADD gt20 rows
scored_variants = (
    deNovos_scored_sym_rsID.select(c)
    .union(high_impact_perPerson_scored.select(c))
    .select(c)
    .union(all_pcs_gt20_perPerson_nonNull.select(c))
)

In [26]:
%pyspark
# One row per (biospecimen, variant) across all three categories; when a pair
# appears in several categories only one of its rows survives.
scored_variants_dedup = scored_variants.dropDuplicates(
    ['biospecimen_id', 'unique_variant_id']
)
scored_variants_dedup.count()


In [27]:
%pyspark
# Peek at a random ~0.1% sample of the combined table (unseeded, so the rows
# shown differ between runs).
scored_variants.sample(fraction=1/1000).show()

In [28]:
%pyspark
# Row count of the combined (not de-duplicated) de novo / high-impact / gt20 table.
scored_variants.count()

In [29]:
%pyspark
# None of the rows assembled so far come from the eQTL branch, so flag them 0.
scored_variants = scored_variants.withColumn('eqtl_boolean',lit(0))

In [30]:
%pyspark
#scored_variants_dedup.write.mode('overwrite').partitionBy("chromosome").parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/scored_variants_KIDNEY.parquet') # .repartitionByRange(20, "chromosome", "start")

In [31]:
%pyspark
#scored_variants_dedup.repartitionByRange(20, "chromosome", "start").write.mode('overwrite').partitionBy("chromosome").parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/scored_variants_KIDNEY_2.parquet') #

In [32]:
%sh
# List the objects under the CHD_all_unique_variants.parquet prefix in S3.
aws s3 ls s3://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/CHD_all_unique_variants.parquet/


In [33]:
%pyspark
# Load the saved eQTL / high-impact table and keep the rows flagged as eQTLs.
eqtl_highImpact = spark.read.parquet(
    's3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/eqtl_highImpact_scored_july8.parquet'
)
#print(eqtl_highImpact.count())
eqtls = eqtl_highImpact.where(col('eqtl_boolean') == 1)
#print(eqtls.count())
#e = eqtl_highImpact.select('cadd_score').toPandas()
#e.hist(bins=50)
#z.show(plt)

# NOTE: this rebinds `c`, which an earlier cell defines with a longer column
# list; re-running that earlier cell's select after this one would use this
# shorter list.
c = [
    'unique_variant_id', 'cadd_score', 'biospecimen_id', 'name', 'hgvsg',
    'variant_class', 'is_proband', 'quality', 'info_dp', 'chromosome',
    'start', 'reference', 'alternate', 'symbol',
]

# Attach proband occurrences to each eQTL variant. Coordinates are dropped from
# the eQTL side so the occurrence side supplies them; eQTLs carried by no
# proband survive the left join with null occurrence columns.
eqtls_perPerson = (
    eqtls
    .drop('alternate', 'reference', 'start', 'chromosome')
    .join(t_ocr_select.where(col('is_proband') == True), on='unique_variant_id', how='left')
    .drop_duplicates(['biospecimen_id', 'unique_variant_id'])
    .select(c)
)
print(eqtls_perPerson.count())
eqtls_perPerson.show()

In [34]:
%pyspark
# eQTL rows belong to none of the other three categories.
for flag in ('DeNovo_boolean', 'High_Impact_boolean', 'gt20_boolean'):
    eqtls_perPerson = eqtls_perPerson.withColumn(flag, lit(0))  #\.withColumn('eqtl_boolean',lit(1))

In [35]:
%pyspark
# Discard eQTL variants that matched no proband occurrence in the left join.
eqtls_perPerson_dropna = eqtls_perPerson.where(col('biospecimen_id').isNotNull())
eqtls_perPerson_dropna.count()


In [36]:
%pyspark
# Mark every remaining row as belonging to the eQTL category.
eqtls_perPerson_dropna = eqtls_perPerson_dropna.withColumn('eqtl_boolean',lit(1))

In [37]:
%pyspark
# Add the annotation columns the eQTL rows lack so they can be projected onto
# the same column list as the other categories before the final union.
#
# 'rsID' is included here because the eQTL frame was projected onto a column
# list without it, while the final union selects 'rsID' from every frame;
# without this column that select fails with an unresolved-column error.
# It is filled with a typed null (no rsID is available on this branch) rather
# than a placeholder string.
eqtls_perPerson_dropna =  eqtls_perPerson_dropna.withColumn('biotype',lit('NA'))\
                                                    .withColumn('consequences',F.array([]))\
                                                    .withColumn('impact',lit('NA'))\
                                                    .withColumn('rsID',lit(None).cast(StringType()))

In [38]:
%pyspark
# Display one row to eyeball the eQTL frame after the placeholder columns were added.
eqtls_perPerson_dropna.show(1)

In [39]:
%pyspark
#scored_variants = scored_variants_kidney.union(eqtls_perPerson)

# Final column order, including the eQTL flag. union() is positional, so both
# sides are projected onto this list first.
ce = [
    'unique_variant_id', 'cadd_score', 'biospecimen_id', 'name', 'hgvsg',
    'variant_class', 'is_proband', 'quality', 'info_dp', 'chromosome',
    'start', 'reference', 'alternate', 'impact', 'rsID', 'symbol', 'biotype',
    'consequences', 'DeNovo_boolean', 'High_Impact_boolean', 'gt20_boolean',
    'eqtl_boolean',
]

scored_rows = scored_variants.select(ce)
eqtl_rows = eqtls_perPerson_dropna.select(ce)
scored_variants_final = scored_rows.union(eqtl_rows)#.drop_duplicates(['biospecimen_id','unique_variant_id'])
scored_variants_final.count()

In [40]:
%pyspark
# Persist the combined table to S3 as parquet, overwriting any existing copy.
# Rows are repartitioned into 200 partitions by unique_variant_id before writing.
scored_variants_final.repartition(200, "unique_variant_id").write.mode('overwrite').parquet('s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/scored_variants_KIDNEY_w_eQTLs_sep14.parquet') #  .partitionBy("chromosome")

In [41]:
%pyspark


# Re-read the table that was just written so the summaries below run against
# the stored parquet rather than the full upstream lineage.
scored_variants_final = spark.read.parquet(
    "s3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/scored_variants_KIDNEY_w_eQTLs_sep14.parquet"
)
print(scored_variants_final.count())
scored_variants_final.show(3)

In [42]:
%pyspark
# Row totals per category flag.
for label, flag in (
    ('Total De Novo.............', 'DeNovo_boolean'),       # De Novo High/Moderate
    ('Total VEP High Impact.....', 'High_Impact_boolean'),  # VEP High Impact
    ('Total CADD > 20...........', 'gt20_boolean'),         # CADD greater than 20.0
    ('Total eQTL................', 'eqtl_boolean'),         # eQTLs
):
    print(label, scored_variants_final.where(col(flag) == 1).count())



In [43]:
%pyspark
# Number of de novo rows that have a non-null CADD score.
scored_variants_final.where(col('DeNovo_boolean') == 1).select('cadd_score').dropna().count() #.groupby('biospecimen_id').sum('cadd_score').show()

In [44]:
%pyspark
def _pct_missing_cadd(frame, flag):
    """Return the percentage of rows with `flag` == 1 whose cadd_score is null.

    Parameters:
        frame: Spark DataFrame holding `flag` and 'cadd_score' columns.
        flag: name of the 0/1 category column to restrict to.

    Returns:
        100 * (null-score rows / flagged rows) as a float, or NaN when no row
        carries the flag (instead of raising ZeroDivisionError).
    """
    flagged = frame.where(col(flag) == 1)
    total = flagged.count()
    if total == 0:
        # An empty category would otherwise abort the whole cell.
        return float('nan')
    return 100*(flagged.where(col('cadd_score').isNull()).count()/total)


# Percentage of rows lacking a CADD score, per category, in the same order as
# the totals printed above: de novo, high impact, gt20, eQTL.
for flag in ('DeNovo_boolean', 'High_Impact_boolean', 'gt20_boolean', 'eqtl_boolean'):
    print(_pct_missing_cadd(scored_variants_final, flag))


In [45]:
%pyspark
# Load the stored scored_variants_KIDNEY.parquet table (the cells that write
# it are commented out in this notebook, so it must already exist in S3).
scored_variants_kidney = spark.read.parquet(
    's3a://kf-strides-variant-parquet-prd/notebooks/5175e6e3-c3d7-4c19-b51f-6f1ea4dd3700/tables/scored_variants_KIDNEY.parquet/'
)
print(scored_variants_kidney.count())
scored_variants_kidney.show(3)

In [46]:
%pyspark
'''from scipy.stats import gaussian_kde
plt.figure()
density = gaussian_kde(data[0])
xs = np.linspace(0,8,200)
density.covariance_factor = lambda : .25
density._compute_covariance()
plt.plot(xs,density(xs))
plt.show()
z.show(plt)

density = gaussian_kde(data[1])
xs = np.linspace(0,8,200)
density.covariance_factor = lambda : .25
density._compute_covariance()
plt.plot(xs,density(xs),'g')
plt.xlim([0,2])
plt.show()
z.show(plt)'''