# Creation of UMAP to identify batch effects

Import Packages and Data

In [6]:
#import packages
import pandas as pd
import umap 
import numpy as np
import matplotlib.pyplot as plt

# Pia Kentschke -- all input files sit in one local directory
DATA_DIR = "/Users/piakentschke/Documents/Uni/Data Analysis"

ATAC_seq_path = f"{DATA_DIR}/ImmGenATAC18_AllOCRsInfo.csv"
ATAC_scores_path = f"{DATA_DIR}/ATAC_scores.csv"
RNA_seq_path = f"{DATA_DIR}/RNA-seq_data.csv"
Transcription_exons_path = f"{DATA_DIR}/refFlat.txt"
Cell_population_qc_path = f"{DATA_DIR}/mmc1.xlsx"
Voluntary_path = f"{DATA_DIR}/ImmGenATAC18_AllTFmotifsInOCRs.txt"


# Both ATAC tables are read as comma-separated files with pandas' default
# NA-string detection turned off; the RNA-seq table uses the read_csv defaults.
_atac_csv_options = {"sep": ",", "keep_default_na": False}

ATAC_seq = pd.read_csv(ATAC_seq_path, **_atac_csv_options)
ATAC_scores = pd.read_csv(ATAC_scores_path, **_atac_csv_options)
RNA_seq = pd.read_csv(RNA_seq_path)

# Minimum -log10(p) a peak must reach to be kept (1.3 is roughly p = 0.05)
threshold_pval = 1.3

# Discard peaks whose -log10 best p-value is +infinity
_not_infinite = ATAC_seq['_-log10_bestPvalue'].ne(np.inf)
ATAC_seq = ATAC_seq.loc[_not_infinite]

# Keep only the peaks that reach the -log10(p) threshold
_passes_threshold = ATAC_seq["_-log10_bestPvalue"].ge(threshold_pval)
ATAC_cleaned = ATAC_seq.loc[_passes_threshold]


# Create data subsets
#
# The four column lists below are composed from shared groups so that a sample
# name is only written once. The group names mirror the suffixes of the subset
# variables (ab / act / gd).

# Columns that open every subset
_stem_cols = ['LTHSC.34-.BM', 'LTHSC.34+.BM', 'STHSC.150-.BM', 'MPP4.135+.BM']

# Columns of the "ab" group
_ab_cols = [
    'preT.DN1.Th', 'preT.DN2a.Th', 'preT.DN2b.Th',
    'preT.DN3.Th', 'T.DN4.Th', 'T.ISP.Th', 'T.DP.Th', 'T.4.Th', 'T.8.Th',
    'T.4.Nve.Sp', 'T.4.Nve.Fem.Sp', 'T.8.Nve.Sp', 'T.4.Sp.aCD3+CD40.18hr',
    'Treg.4.FP3+.Nrplo.Co', 'Treg.4.25hi.Sp',
]

# Columns of the "act" group
_act_cols = [
    'T8.TN.P14.Sp',
    'T8.IEL.LCMV.d7.SI', 'T8.TE.LCMV.d7.Sp', 'T8.MP.LCMV.d7.Sp',
    'T8.Tcm.LCMV.d180.Sp', 'T8.Tem.LCMV.d180.Sp', 'NKT.Sp',
    'NKT.Sp.LPS.3hr', 'NKT.Sp.LPS.18hr', 'NKT.Sp.LPS.3d',
]

# Columns of the "gd" group
_gd_cols = [
    'Tgd.g2+d17.24a+.Th', 'Tgd.g2+d17.LN', 'Tgd.g2+d1.24a+.Th',
    'Tgd.g2+d1.LN', 'Tgd.g1.1+d1.24a+.Th', 'Tgd.g1.1+d1.LN', 'Tgd.Sp',
]

# Same contents and column order as the previously hand-written lists
col_ATAC_seq_ab_act_gd_T = _stem_cols + _ab_cols + _act_cols + _gd_cols
col_ATAC_seq_ab_act_T = _stem_cols + _ab_cols + _act_cols
col_ATAC_seq_gd_T = _stem_cols + _gd_cols
col_ATAC_seq_ab_T = _stem_cols + _ab_cols

# Every subset is taken from the p-value-filtered table. Previously only the
# first subset used ATAC_cleaned while the other three were sliced from the
# unfiltered ATAC_seq, so the threshold filter was silently skipped for them.
ATAC_seq_ab_act_gd_T = ATAC_cleaned[col_ATAC_seq_ab_act_gd_T]
ATAC_seq_gd_T = ATAC_cleaned[col_ATAC_seq_gd_T]
ATAC_seq_ab_act_T = ATAC_cleaned[col_ATAC_seq_ab_act_T]
ATAC_seq_ab_T = ATAC_cleaned[col_ATAC_seq_ab_T]

## Create UMAP dimensionality reduction to check for outlier cells


In [None]:
# Initialize the UMAP reducer
umap_reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='euclidean')

# Fit and transform the selected ATAC signal columns using UMAP.
# ATAC_seq_gd_T contains only the sample columns listed in col_ATAC_seq_gd_T
# (there is no leading ID column), so all of them are passed in. The previous
# `.iloc[:, 1:]` dropped the first sample, 'LTHSC.34-.BM', from the embedding.
umap_embedding = umap_reducer.fit_transform(ATAC_seq_gd_T)

# NOTE(review): each row of the input (one peak) becomes one point, and no
# colour is passed to the scatter although the title mentions colouring by
# cell type. If one point per cell type is intended, the table presumably
# needs to be transposed first -- confirm the intended orientation.

# Plot results
plt.scatter(umap_embedding[:, 0], umap_embedding[:, 1])
plt.title('UMAP of ATAC Scores, colored by cell type')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
plt.show()

UMAP for the subset of cell types relevant for αβ and γδ T cell differentiation