In [1]:
import pandas as pd

#cat Homo_sapiens.GRCh38.114.gtf | awk '$3=="gene"' | tr ';' '\t' | tr ' ' '\t' | sed 's/"//g' | cut -f1,4,5,7,10 | awk '{print $1"\t"$2"\t"$3"\t"$5"\t0\t"$4}' > Homo_sapiens.GRCh38.114.bed

# Load the gene-level BED file that was derived from the GTF file.
# The chromosome column is read as str, consistent with the other read_csv
# calls in this notebook: chromosome names mix numeric and non-numeric labels,
# so letting pandas infer the dtype can yield a mixed int/str column.
df = pd.read_csv("Homo_sapiens.GRCh38.114.bed", sep="\t", header=None,
                 names=["chr", "start", "end", "gene_id", "score", "strand"],
                 dtype={"chr": str})

# Create promoter region: +/- 500 bp around the TSS
def get_promoter(row, flank=500):
    """Return the promoter window around a gene's transcription start site.

    The TSS is taken as ``row["start"]`` for "+"-strand genes and as
    ``row["end"]`` for every other strand value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys "chr", "start", "end", "gene_id" and "strand".
    flank : int, default 500
        Number of bases added on each side of the TSS.

    Returns
    -------
    pandas.Series
        ``[chr, window_start, window_end, gene_id, strand]``, where
        ``window_start`` is clamped at 0.
    """
    tss = row["start"] if row["strand"] == "+" else row["end"]
    window_start = max(tss - flank, 0)  # avoid negative coordinates
    window_end = tss + flank
    return pd.Series([row["chr"], window_start, window_end, row["gene_id"], row["strand"]])

# Run the per-gene promoter calculation row by row, then name the resulting
# positional columns.
promoter_column_names = ["chr", "prom_start", "prom_end", "gene_id", "strand"]
promoters = df.apply(get_promoter, axis=1)
promoters.columns = promoter_column_names

# Write the promoter windows as a headerless, tab-separated BED file
# (strand stays in `promoters` but is not written).
bed_fields = ["chr", "prom_start", "prom_end", "gene_id"]
promoters[bed_fields].to_csv(
    "promoters_500bp.bed",
    sep="\t",
    header=False,
    index=False,
)


In [4]:
import pandas as pd

# 1. Collect the cell ids listed in the "cellid" column of the Seurat stat file.
whitelist = pd.read_csv("k562.dis.stat.txt", sep="\t", usecols=["cellid"])
cellids = set(whitelist["cellid"])

# 2. Load the ATAC fragment BED file. It was produced by the shell command:
#    cat k562.allele.flt.M.fragment.unsorted.tsv | sed 's/chr//' | cut -f1-4 > k562.allele.flt.M.fragment.unsorted.bed
# NOTE(review): that command keeps 4 fields, but 5 column names are given
# below - confirm whether the file really carries a "count" column.
atac_cols = ["chr", "start", "end", "barcode", "count"]
atac = pd.read_csv(
    "k562.allele.flt.M.fragment.unsorted.bed",
    sep="\t",
    header=None,
    names=atac_cols,
    dtype={"chr": str},
)

# 3. Keep only the fragments whose barcode is one of the whitelisted cell ids.
barcode_in_whitelist = atac["barcode"].isin(cellids)
atac_filtered = atac[barcode_in_whitelist]

# 4. Write the filtered fragments without header or index.
atac_filtered.to_csv("k562.atac.filtered.bed", sep="\t", header=False, index=False)


In [None]:
# Make a TSV file showing the location of the ATAC signal on chromosomes and in gene promoters

#bedtools intersect -a k562.atac.filtered.bed -b promoters_500bp.bed -wa -wb > atac_mapped_to_genes.tsv


In [6]:
import pandas as pd

# Load the bedtools intersect output: the fragment columns (a) followed by the
# columns of the overlapping promoter (b). Chromosome names are kept as strings.
cols = [
    "chr_a", "start_a", "end_a", "barcode", "count",
    "chr_b", "prom_start", "prom_end", "gene_id",
]
mapped = pd.read_csv(
    "atac_mapped_to_genes.tsv",
    sep="\t",
    header=None,
    names=cols,
    dtype={"chr_a": str, "chr_b": str},
)

# Count the rows (fragment/promoter overlaps) per (barcode, gene) pair, then
# pivot the genes into columns; pairs without any overlap become 0.
overlaps_per_cell_gene = mapped.groupby(["barcode", "gene_id"]).size()
atac_counts = overlaps_per_cell_gene.unstack(fill_value=0)

# Save the cells x genes matrix.
atac_counts.to_csv("atac_counts_matrix.tsv", sep="\t")


In [None]:
# TODO: normalization?

In [None]:
# Now import RNAseq expression and combine RNA with ATAC

In [7]:
import pandas as pd

# Load the expression table, using its first column as the row index.
# NOTE(review): presumably normalised RNA expression per cell (going by the
# file name), with a "Phase" column used further down - confirm.
df_expr_norm = pd.read_csv(
    "k562.allele.flt.M_df_expr_norm.tsv",
    index_col=0,
    sep="\t",
)


In [9]:
# Attach each cell's Phase label to its ATAC counts by joining on the index
# (merge's default inner join keeps only cells present in both tables).
phase_labels = df_expr_norm["Phase"]
atac_with_phase = atac_counts.merge(phase_labels, left_index=True, right_index=True)

# Average the ATAC counts over all cells within each Phase.
cells_grouped_by_phase = atac_with_phase.groupby("Phase")
atac_by_phase = cells_grouped_by_phase.mean()


In [32]:
# Display the per-Phase mean ATAC table (one row per Phase, one column per gene).
atac_by_phase

Unnamed: 0_level_0,ENSG00000000460,ENSG00000000971,ENSG00000001460,ENSG00000001561,ENSG00000002330,ENSG00000002726,ENSG00000002745,ENSG00000002822,ENSG00000002834,ENSG00000002919,...,ENSG00000310504,ENSG00000310507,ENSG00000310511,ENSG00000310515,ENSG00000310516,ENSG00000310525,ENSG00000310529,ENSG00000310564,ENSG00000310566,ENSG00000310567
Phase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
G1,0.00122,0.000697,0.000871,0.000523,0.000871,0.001742,0.000697,0.00331,0.004704,0.002091,...,0.008014,0.000697,0.002265,0.000348,0.000348,0.001916,0.000174,0.002265,0.012718,0.000697
G2M,0.00079,0.000474,0.001422,0.000632,0.000316,0.00079,0.000316,0.005216,0.003003,0.002687,...,0.01059,0.001581,0.002529,0.001739,0.000316,0.002845,0.000158,0.002055,0.01138,0.001106
S,0.001247,0.000249,0.000748,0.000125,0.000624,0.001871,0.000125,0.003741,0.00212,0.002993,...,0.010849,0.000499,0.001372,0.000998,0.000249,0.002494,0.000249,0.00212,0.013717,0.001122


In [18]:
# Genes (columns) present in both the RNA and the ATAC per-phase tables.
# NOTE(review): avg_by_phase is not defined in any cell visible here - confirm
# which cell creates it so the notebook survives Restart & Run All.
common_genes = avg_by_phase.columns.intersection(atac_by_phase.columns)

# Restrict both tables to the shared genes.
# NOTE: this rebinds atac_filtered, which an earlier cell used for the
# barcode-filtered fragment table.
rna_filtered, atac_filtered = (
    table.loc[:, common_genes] for table in (avg_by_phase, atac_by_phase)
)


In [22]:
def _phase_table_to_long(wide, value_name):
    """Melt a wide table with a "Phase" index into (Phase, gene, value) rows."""
    return wide.reset_index().melt(id_vars="Phase", var_name="gene", value_name=value_name)


# Long-format tables: one row per (Phase, gene) pair.
rna_long = _phase_table_to_long(rna_filtered, "RNA_expr")
atac_long = _phase_table_to_long(atac_filtered, "ATAC_access")

# Combine RNA and ATAC values for matching (Phase, gene) pairs in one table.
merged = rna_long.merge(atac_long, on=["Phase", "gene"])


In [30]:
# Write the merged RNA/ATAC table as TSV. The row index is written too
# (as the first column), because `index` is left at its default.
merged.to_csv(
    "k562.allele.flt.M_atacrnamerged.tsv",
    sep="\t",
)

In [31]:
# Display the merged long-format table (Phase, gene, RNA_expr, ATAC_access).
merged

Unnamed: 0,Phase,gene,RNA_expr,ATAC_access
0,G1,ENSG00000000460,0.545158,0.001220
1,G2M,ENSG00000000460,0.819479,0.000790
2,S,ENSG00000000460,0.949821,0.001247
3,G1,ENSG00000000971,0.322822,0.000697
4,G2M,ENSG00000000971,0.227182,0.000474
...,...,...,...,...
15841,G2M,ENSG00000288067,0.000000,0.001106
15842,S,ENSG00000288067,0.008637,0.001122
15843,G1,ENSG00000288156,0.067966,0.000174
15844,G2M,ENSG00000288156,0.021988,0.000474
