# Data Paths and Prep

In [111]:
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats
import sklearn.manifold as sklm
import os
from sklearn.decomposition import PCA
import scanpy as sc
import functions
# Re-import the local `functions` module so that edits made to it are picked up
# without restarting the kernel.
importlib.reload(functions)
# Register the active user with the helper module; this is called before the data load.
# NOTE(review): presumably this selects user-specific data paths — confirm in functions.set_user.
functions.set_user('Laila')
# Load the cleaned data sets; later cells index the result by data-set name.
# NOTE(review): p_threshold=1.3 looks like a cutoff on `_-log10_bestPvalue`
# (1.3 is roughly -log10(0.05)) — confirm in functions.call_data_clean.
data = functions.call_data_clean(p_threshold=1.3)

In [105]:
# Pull the individual tables out of the `data` mapping returned by the loader
# so the following cells can refer to them by short names.
ATAC, RNA, QC = data['ATAC_seq'], data['RNA_seq'], data['QC_metrics']
ATAC_scores, ATACtest = data['ATAC_seq_only_scores'], data['test1']

# Data Cleanup and Normalization
---

### Check Data Sets after cleaning

In [106]:
# --- QC metrics: total number of missing entries ---
qc_nan_total = QC.isna().sum().sum()
print(f"NaN in QC-metrics data set: {qc_nan_total}")

# --- ATAC-seq: infinite values ---
# Per-column count of +/-inf entries; only columns with at least one hit are summed.
ATAC_inf = ATAC.isin([np.inf, -np.inf]).sum()
atac_inf_total = ATAC_inf.loc[ATAC_inf > 0].sum()
print(f"Infinite values in ATAC_seq: {atac_inf_total}")

# --- ATAC-seq: missing values ---
atac_nan_total = ATAC.isna().sum().sum()
print(f"NaN in ATAC_seq data set: {atac_nan_total}")

# Size check: number of QC rows vs. number of ATAC columns.
print(QC.shape[0])
print(ATAC.shape[1])

# Quick look at the test frame: first ten index labels, the column labels,
# then the frame itself as the cell's rich output.
print(ATACtest.index[:10])
print(ATACtest.columns)
ATACtest

NaN in QC-metrics data set: 0
Infinite values in ATAC_seq: 0
NaN in ATAC_seq data set: 430740
176
97
Index(['ImmGenATAC1219.peak_1', 'ImmGenATAC1219.peak_2',
       'ImmGenATAC1219.peak_3', 'ImmGenATAC1219.peak_4',
       'ImmGenATAC1219.peak_5', 'ImmGenATAC1219.peak_6',
       'ImmGenATAC1219.peak_7', 'ImmGenATAC1219.peak_8',
       'ImmGenATAC1219.peak_9', 'ImmGenATAC1219.peak_10'],
      dtype='object', name='ImmGenATAC1219.peakID')
Index(['chrom', 'Summit', 'mm10.60way.phastCons_scores', '_-log10_bestPvalue',
       'Included.in.systematic.analysis', 'TSS', 'genes.within.100Kb',
       'LTHSC.34-.BM', 'LTHSC.34+.BM', 'STHSC.150-.BM', 'MPP4.135+.BM',
       'proB.CLP.BM', 'proB.FrA.BM', 'proB.FrBC.BM', 'preB.FrD.BM', 'B.FrE.BM',
       'B1b.PC', 'B.T1.Sp', 'B.T2.Sp', 'B.T3.Sp', 'B.Sp', 'B.Fem.Sp',
       'B.MZ.Sp', 'B.Fo.Sp', 'B.mem.Sp', 'B.GC.CB.Sp', 'B.GC.CC.Sp', 'B.PB.Sp',
       'B.PC.Sp', 'B.PC.BM', 'preT.DN1.Th', 'preT.DN2a.Th', 'preT.DN2b.Th',
       'preT.DN3.Th', 'T.DN4.Th'

Unnamed: 0_level_0,chrom,Summit,mm10.60way.phastCons_scores,_-log10_bestPvalue,Included.in.systematic.analysis,TSS,genes.within.100Kb,LTHSC.34-.BM,LTHSC.34+.BM,STHSC.150-.BM,...,DC.4+.Sp,DC.8+.Sp,DC.pDC.Sp,DC.103+11b+.SI,DC.103+11b-.SI,FRC.SLN,IAP.SLN,BEC.SLN,LEC.SLN,Ep.MEChi.Th
ImmGenATAC1219.peakID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ImmGenATAC1219.peak_1,chr1,3020786,0.00,0.56,,,,0.41,0.71,0.90,...,0.10,0.10,3.19,1.37,0.52,1.27,0.10,0.57,3.27,1.41
ImmGenATAC1219.peak_2,chr1,3087226,0.00,0.50,,,,0.41,1.64,0.90,...,1.70,0.10,1.41,0.47,0.11,0.92,0.98,2.16,2.34,0.94
ImmGenATAC1219.peak_3,chr1,3120109,0.07,10.80,1.0,,,2.36,0.10,0.90,...,0.87,0.54,2.72,0.95,0.11,63.38,8.92,1.33,1.04,0.11
ImmGenATAC1219.peak_4,chr1,3121485,0.15,3.02,1.0,,,0.41,0.10,0.11,...,0.44,1.83,0.66,0.11,0.92,13.50,0.98,1.28,1.04,0.11
ImmGenATAC1219.peak_5,chr1,3372787,0.03,1.31,,,,0.41,0.10,0.11,...,0.44,0.10,0.66,1.79,0.51,0.92,0.75,1.33,1.61,4.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ImmGenATAC1219.peak_512591,chrY,90812450,0.00,3.99,1.0,,Erdr1,4.37,8.79,3.52,...,3.81,3.34,4.27,6.73,5.53,7.21,5.96,5.17,6.53,6.11
ImmGenATAC1219.peak_512592,chrY,90812906,0.00,3.21,1.0,,Erdr1,0.41,7.41,3.52,...,4.28,5.55,4.15,6.88,7.16,6.21,8.75,6.83,8.14,4.64
ImmGenATAC1219.peak_512593,chrY,90813175,0.00,0.69,,,Erdr1,0.41,0.71,0.11,...,0.44,1.38,1.20,1.37,1.82,1.61,1.29,0.14,1.23,0.50
ImmGenATAC1219.peak_512594,chrY,90813624,0.00,0.60,,,Erdr1,0.41,1.64,0.11,...,0.77,0.97,0.11,1.39,1.39,0.50,0.10,0.14,1.05,0.11


# Exploratory Data
---

### Variance of peak signal

In [109]:
# Row-wise reductions (axis=1) give one value per peak, taken over the numeric
# columns of ATAC_scores.
per_peak = {"axis": 1, "numeric_only": True}

mean_accessibility_per_peak = ATAC_scores.mean(**per_peak)      # mean score per peak
median_accessibility_per_peak = ATAC_scores.median(**per_peak)  # median score per peak
sd_accessibility_per_peak = ATAC_scores.std(**per_peak)         # standard deviation per peak
variance_per_peak = ATAC_scores.var(**per_peak)                 # variance per peak, across columns

# Column-wise reduction (axis=0): variance over all peaks within each column.
variance_within_cell = ATAC_scores.var(axis=0, numeric_only=True)

# Collect the per-peak statistics in a single summary frame.
mean_med_sd_df = pd.DataFrame(
    {
        "Mean Accessibility": mean_accessibility_per_peak,
        "Median Accessibility": median_accessibility_per_peak,
        "Standard Deviation": sd_accessibility_per_peak,
        "Variance across cells": variance_per_peak,
    }
)

print(mean_med_sd_df)
print(variance_within_cell)



                            Mean Accessibility  Median Accessibility  \
ImmGenATAC1219.peakID                                                  
ImmGenATAC1219.peak_3                 1.521000                 0.495   
ImmGenATAC1219.peak_4                 0.874556                 0.530   
ImmGenATAC1219.peak_5                 0.882111                 0.630   
ImmGenATAC1219.peak_6                 1.096556                 0.785   
ImmGenATAC1219.peak_7                 0.944556                 0.680   
...                                        ...                   ...   
ImmGenATAC1219.peak_512589            5.823889                 4.955   
ImmGenATAC1219.peak_512590            5.526444                 4.705   
ImmGenATAC1219.peak_512591            6.936111                 5.775   
ImmGenATAC1219.peak_512592            6.995111                 6.440   
ImmGenATAC1219.peak_512595            1.751778                 1.420   

                            Standard Deviation  Variance across

# Descriptive and Comparative Analysis

# Statistical Analysis and Regression Models

In [16]:
# Sanity check: compare the loaded ATAC-seq table with the raw file behind it.
# NOTE(review): `ATAC_seq` and `ATAC_seq_path` are not assigned in any cell visible
# here — confirm they are defined earlier, otherwise this cell relies on leftover
# kernel state and will fail on Restart & Run All.
print(ATAC_seq.iloc[0, 0])     # value in the first row / first column
print(ATAC_seq.index[:20])     # first twenty index labels

# Echo the first five raw lines of the source file.
with open(ATAC_seq_path) as raw_file:
    for _ in range(5):
        raw_line = raw_file.readline()
        print(raw_line)

ImmGenATAC1219.peak_1
RangeIndex(start=0, stop=20, step=1)
"ImmGenATAC1219.peakID","chrom","Summit","mm10.60way.phastCons_scores","_-log10_bestPvalue","Included.in.systematic.analysis","TSS","genes.within.100Kb","LTHSC.34-.BM","LTHSC.34+.BM","STHSC.150-.BM","MPP4.135+.BM","proB.CLP.BM","proB.FrA.BM","proB.FrBC.BM","preB.FrD.BM","B.FrE.BM","B1b.PC","B.T1.Sp","B.T2.Sp","B.T3.Sp","B.Sp","B.Fem.Sp","B.MZ.Sp","B.Fo.Sp","B.mem.Sp","B.GC.CB.Sp","B.GC.CC.Sp","B.PB.Sp","B.PC.Sp","B.PC.BM","preT.DN1.Th","preT.DN2a.Th","preT.DN2b.Th","preT.DN3.Th","T.DN4.Th","T.ISP.Th","T.DP.Th","T.4.Th","T.8.Th","T.4.Nve.Sp","T.4.Nve.Fem.Sp","T.8.Nve.Sp","T.4.Sp.aCD3+CD40.18hr","Treg.4.FP3+.Nrplo.Co","Treg.4.25hi.Sp","T8.TN.P14.Sp","T8.IEL.LCMV.d7.SI","T8.TE.LCMV.d7.Sp","T8.MP.LCMV.d7.Sp","T8.Tcm.LCMV.d180.Sp","T8.Tem.LCMV.d180.Sp","NKT.Sp","NKT.Sp.LPS.3hr","NKT.Sp.LPS.18hr","NKT.Sp.LPS.3d","Tgd.g2+d17.24a+.Th","Tgd.g2+d17.LN","Tgd.g2+d1.24a+.Th","Tgd.g2+d1.LN","Tgd.g1.1+d1.24a+.Th","Tgd.g1.1+d1.LN","Tgd.Sp","NK

In [30]:
# Fetch the 'ATAC_seq_T' table from the loaded data mapping.
# NOTE(review): judging by the displayed output this is a transposed layout
# (peak IDs as columns) — confirm in functions.call_data_clean.
ATAC_seq_T = data['ATAC_seq_T']

# Bare last expression: rendered as the cell's rich output.
ATAC_seq_T

ImmGenATAC1219.peakID,ImmGenATAC1219.peak_1,ImmGenATAC1219.peak_2,ImmGenATAC1219.peak_3,ImmGenATAC1219.peak_4,ImmGenATAC1219.peak_5,ImmGenATAC1219.peak_6,ImmGenATAC1219.peak_7,ImmGenATAC1219.peak_8,ImmGenATAC1219.peak_9,ImmGenATAC1219.peak_10,...,ImmGenATAC1219.peak_512586,ImmGenATAC1219.peak_512587,ImmGenATAC1219.peak_512588,ImmGenATAC1219.peak_512589,ImmGenATAC1219.peak_512590,ImmGenATAC1219.peak_512591,ImmGenATAC1219.peak_512592,ImmGenATAC1219.peak_512593,ImmGenATAC1219.peak_512594,ImmGenATAC1219.peak_512595
CellType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
chrom,chr1,chr1,chr1,chr1,chr1,chr1,chr1,chr1,chr1,chr1,...,chrY,chrY,chrY,chrY,chrY,chrY,chrY,chrY,chrY,chrY
Summit,3020786,3087226,3120109,3121485,3372787,3399217,3400115,3416260,3434092,3434378,...,90808861,90811022,90811406,90811728,90812084,90812450,90812906,90813175,90813624,90828985
mm10.60way.phastCons_scores,0.0,0.0,0.07,0.15,0.03,0.06,0.44,0.01,0.18,0.15,...,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
_-log10_bestPvalue,0.56,0.5,10.8,3.02,1.31,2.39,2.57,2.57,1.34,1.83,...,6.58,4.35,2.6,2.33,3.12,3.99,3.21,0.69,0.6,1.41
Included.in.systematic.analysis,,,1,1,,1,1,1,,1,...,1,1,1,1,1,1,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FRC.SLN,1.182692,0.941106,6.008541,3.857981,0.941106,0.613532,1.83996,1.863938,0.941106,0.15056,...,3.917432,1.803227,2.636915,2.615887,3.02148,3.037382,2.849999,1.38405,0.584963,0.887525
IAP.SLN,0.137504,0.9855,3.31034,0.9855,0.807355,1.263034,0.807355,0.9855,0.584963,0.807355,...,3.485427,1.83996,2.718088,3.340562,2.192194,2.799087,3.285402,1.195348,0.137504,1.244887
BEC.SLN,0.650765,1.659925,1.22033,1.189034,1.22033,0.925999,1.722466,1.22033,1.545968,1.843984,...,3.638074,2.295723,2.063503,1.682573,1.863938,2.62527,2.969012,0.189034,0.189034,0.650765
LEC.SLN,2.094236,1.739848,1.028569,1.028569,1.38405,1.952334,1.739848,3.051372,1.531069,1.718088,...,4.921722,2.790772,3.153805,2.91265,2.720278,2.91265,3.192194,1.157044,1.035624,0.15056


In [31]:
# Fetch the 'ATAC_seq_w_info' table from the loaded data mapping.
ATAC_w_info = data['ATAC_seq_w_info']

# Plain-text dump of the table.
print(ATAC_w_info)

# Cell output: only the rows that contain at least one missing value.
ATAC_w_info.loc[ATAC_w_info.isna().any(axis='columns')]

                           CellType ImmGenATAC1219.peak_1  \
0                             chrom                  chr1   
1                            Summit               3020786   
2       mm10.60way.phastCons_scores                   0.0   
3                _-log10_bestPvalue                  0.56   
4   Included.in.systematic.analysis                         
..                              ...                   ...   
92                          FRC.SLN              1.182692   
93                          IAP.SLN              0.137504   
94                          BEC.SLN              0.650765   
95                          LEC.SLN              2.094236   
96                      Ep.MEChi.Th              1.269033   

   ImmGenATAC1219.peak_2 ImmGenATAC1219.peak_3 ImmGenATAC1219.peak_4  \
0                   chr1                  chr1                  chr1   
1                3087226               3120109               3121485   
2                    0.0                  0.07     

Unnamed: 0,CellType,ImmGenATAC1219.peak_1,ImmGenATAC1219.peak_2,ImmGenATAC1219.peak_3,ImmGenATAC1219.peak_4,ImmGenATAC1219.peak_5,ImmGenATAC1219.peak_6,ImmGenATAC1219.peak_7,ImmGenATAC1219.peak_8,ImmGenATAC1219.peak_9,...,ImmGenATAC1219.peak_512589,ImmGenATAC1219.peak_512590,ImmGenATAC1219.peak_512591,ImmGenATAC1219.peak_512592,ImmGenATAC1219.peak_512593,ImmGenATAC1219.peak_512594,ImmGenATAC1219.peak_512595,Lineage,CellFamily,Organ
0,chrom,chr1,chr1,chr1,chr1,chr1,chr1,chr1,chr1,chr1,...,chrY,chrY,chrY,chrY,chrY,chrY,chrY,,,
1,Summit,3020786,3087226,3120109,3121485,3372787,3399217,3400115,3416260,3434092,...,90811728,90812084,90812450,90812906,90813175,90813624,90828985,,,
2,mm10.60way.phastCons_scores,0.0,0.0,0.07,0.15,0.03,0.06,0.44,0.01,0.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
3,_-log10_bestPvalue,0.56,0.5,10.8,3.02,1.31,2.39,2.57,2.57,1.34,...,2.33,3.12,3.99,3.21,0.69,0.6,1.41,,,
4,Included.in.systematic.analysis,,,1,1,,1,1,1,,...,1,1,1,1,,,,,,
5,TSS,,,,,,,,,,...,,,,,,,,,,
6,genes.within.100Kb,,,,,,,,,,...,Erdr1,Erdr1,Erdr1,Erdr1,Erdr1,Erdr1,Erdr1,,,
7,LTHSC.34-.BM,0.495695,0.495695,1.748461,0.495695,0.495695,1.748461,0.495695,1.748461,2.424922,...,0.495695,1.748461,2.424922,0.495695,0.495695,0.495695,0.495695,,,
8,LTHSC.34+.BM,0.773996,1.400538,0.137504,0.137504,0.137504,1.400538,0.137504,0.773996,0.137504,...,3.072106,3.291309,3.291309,3.072106,0.773996,1.400538,0.773996,,,
9,STHSC.150-.BM,0.925999,0.925999,0.925999,0.15056,0.15056,1.510962,0.15056,0.925999,0.925999,...,1.510962,0.925999,2.176323,2.176323,0.15056,0.15056,0.925999,,,
