# Claudia data CGI tracks

In [11]:
import pandas as pd
import pybedtools
from os import listdir
from Bio import SeqIO

In [12]:
# Load the reference FASTA and the CpG BED file
# Every record yielded by the FASTA parser is kept in memory, in file order.
my_seqlist = list(SeqIO.parse('genome/Mus_musculus.GRCm38.chromosome.1.fa', 'fasta'))

# BedTool wrapper around the BED file that the measured positions are intersected with.
CpG_Ill = pybedtools.BedTool("CpG_Il_mm10.bed")

In [13]:
# Quick sanity check of what was loaded: record count, then id, first 100 bases
# and total length of the first record. Each label is printed on its own line,
# followed by its value on the next line.
first_record = my_seqlist[0]
print("Total number of seqs", len(my_seqlist), sep="\n")
print("ID seq 1", first_record.id, sep="\n")
print("First 100 bp seq 1", first_record.seq[0:100], sep="\n")
print("Total length seq 1", len(first_record), sep="\n")

Total number of seqs
66
ID seq 1
1
First 100 bp seq 1
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
Total length seq 1
195471971


In [14]:
# Obtain CG kmers list
from itertools import product

# Every 6-letter word over the alphabet A/T/C/G, in itertools.product order.
kmers = ["".join(letters) for letters in product('ATCG', repeat=6)]

# Keep only the words whose two central letters (0-based positions 2 and 3) are "CG".
kmers_CG = [word for word in kmers if word[2:4] == "CG"]

In [23]:
# Map each record id to its index in my_seqlist, so a record can be fetched by id.
# If an id occurred twice, the later index would win.
dict_chr_loc = {record.id: index for index, record in enumerate(my_seqlist)}

In [27]:
# Function to analyze k-mers
def kmer_analysis(file_name,folder_input,folder_output):
    """Count methylated / unmethylated calls per CG-centred 6-mer and save them as CSV.

    Reads the header-less, tab-separated file ``folder_input/file_name``. Column 0
    is the sequence id (looked up in ``dict_chr_loc``), column 1 a 1-based
    position on that sequence and column 3 the methylation call. For each row
    the 6-mer around the position is taken from ``my_seqlist`` so that "CG"
    sits at 0-based offsets 2-3:

    * reference base "C": the forward-strand window is used as is;
    * reference base "G": the window is shifted one base left and
      reverse-complemented, so the call is reported in the opposite-strand
      context.

    Rows are skipped when the reference base is neither "C" nor "G", when the
    window would start before the sequence, or when the resulting 6-mer is not
    in ``kmers_CG`` (e.g. it contains "N" or was truncated at the sequence end).
    A call equal to "Z" is counted as ``Met``; any other call as ``Unmet``.

    The table (index ``kmers_CG``; columns ``Met``, ``Unmet``, ``Total``,
    ``Per_met``) is written to ``folder_output/file_name.csv``. ``Per_met`` is
    NaN for 6-mers with no calls. An empty input file yields an all-zero table.

    Parameters
    ----------
    file_name : name of the input file; also used to build the output name.
    folder_input : directory containing ``file_name``.
    folder_output : existing directory that receives ``file_name + ".csv"``.

    Raises
    ------
    KeyError
        If a sequence id in column 0 is not present in ``dict_chr_loc``.
    """
    file_input = str(folder_input) + "/"+ str(file_name)
    print(file_input)
    try:
        df = pd.read_csv(file_input,sep='\t',skiprows=0,header=None, dtype={0: str})
    except pd.errors.EmptyDataError:
        # A file without any rows simply contributes no counts.
        df = pd.DataFrame(columns=[0, 1, 2, 3])

    # Set for O(1) membership tests; plain dicts for counting. The original
    # incremented cells through chained indexing (`.loc[kmer][0] = ...`), which
    # is not guaranteed to write back into the DataFrame.
    valid_kmers = set(kmers_CG)
    met_counts = dict.fromkeys(kmers_CG, 0)
    unmet_counts = dict.fromkeys(kmers_CG, 0)

    for chrom, pos, call in zip(df[0], df[1], df[3]):
        if chrom not in dict_chr_loc:
            raise KeyError(
                "sequence id %r from %s is not present in the loaded genome"
                % (chrom, file_input)
            )
        seq = my_seqlist[dict_chr_loc[chrom]].seq

        base = seq[pos - 1]  # column 1 is 1-based
        if base == "C":
            start, end, reverse = pos - 3, pos + 3, False
        elif base == "G":
            start, end, reverse = pos - 4, pos + 2, True
        else:
            # Not a C/G on the reference: skip instead of silently re-using
            # the window computed for the previous row.
            # NOTE(review): the comparison is case-sensitive, so lower-case
            # (soft-masked) bases are skipped too - confirm this is intended.
            continue

        if start < 0:
            # Too close to the sequence start; a negative slice start would
            # wrap around to the end of the sequence.
            continue

        window = seq[start:end]
        if reverse:
            window = window.reverse_complement()
        kmer = str(window)

        # Membership in kmers_CG implies length 6 with "CG" in the centre.
        if kmer not in valid_kmers:
            continue

        if call == "Z":
            met_counts[kmer] += 1
        else:
            unmet_counts[kmer] += 1

    pd_kmers_CG = pd.DataFrame(
        {
            'Met': [met_counts[kmer] for kmer in kmers_CG],
            'Unmet': [unmet_counts[kmer] for kmer in kmers_CG],
        },
        index=kmers_CG,
        columns=['Met','Unmet'],
    )

    pd_kmers_CG['Total'] = pd_kmers_CG.sum(axis=1)
    pd_kmers_CG['Per_met'] = pd_kmers_CG['Met']*100/pd_kmers_CG['Total']
    file_output = str(folder_output) + "/" + str(file_name) + ".csv"
    pd_kmers_CG.to_csv(file_output)

In [18]:
# Claudia CGI analysis

In [17]:
# Collect the names in the Claudia folder that end with "deduplicated.txt".
files = listdir("Claudia")
inputs = [name for name in files if name.endswith("deduplicated.txt")]

In [19]:
folder_input = "Claudia/"
out_path = "Claudia_CGI/"

# For every input file: build a BED-like table (columns 2, 3, 3, 4 of the input,
# i.e. the same column is used as both start and end), then split the records
# into those that intersect CpG_Ill and those that do not, saving each set.
for input_temp in inputs:
    print(input_temp)
    file_input = folder_input + input_temp
    # The first line of the file is skipped. Column 2 is read as text so that
    # numeric and non-numeric sequence names get one consistent dtype (this also
    # matches the dtype={0: str} used when these outputs are read back by
    # kmer_analysis). NOTE(review): the truncated warning in the saved output is
    # presumably pandas' mixed-type DtypeWarning for this column - confirm.
    df = pd.read_csv(file_input,sep='\t',skiprows=1,header=None,dtype={2: str})
    list_positions = df[[2,3,3,4]].values.tolist()
    temp_bed = pybedtools.BedTool(list_positions)
    temp_CPI = (temp_bed.intersect(CpG_Ill))
    # BedTool subtraction: records of temp_bed that are not in temp_CPI.
    temp_non_CPI = (temp_bed - temp_CPI)

    temp_CPI.saveas(out_path + "CGI_" + input_temp + ".csv")
    temp_non_CPI.saveas(out_path + "non_CGI_" + input_temp + ".csv")

CpG_context_50ugmL_Asc_2i_A_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_12_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_18h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_24h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_30h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_36h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_42h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_A_6h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_B_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_B_12_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_B_18h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_B_24h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmL_Asc_2i_B_30h_fastq.gz_trimmed_bismark_bt2.deduplic

  interactivity=interactivity, compiler=compiler, result=result)


CpG_context_50ugmLAsc_DOX_2i_C_30h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_DOX_2i_C_36h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_DOX_2i_C_42h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_DOX_2i_C_6h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_12h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_18h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_24h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_30h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_36h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_A_6h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_B_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt
CpG_context_50ugmLAsc_Dox_B_12h_fastq.gz_trimmed_bismark

In [21]:
# Collect the names in Claudia_CGI that end with "deduplicated.txt.csv".
files = listdir("Claudia_CGI")
inputs = [name for name in files if name.endswith("deduplicated.txt.csv")]

In [28]:
# Run the k-mer count on every collected file; results go to Claudia_CGI_output.
for cgi_file in inputs:
    kmer_analysis(cgi_file,"Claudia_CGI","Claudia_CGI_output")

Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_12_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_18h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_24h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_30h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_36h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_42h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_A_6h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_B_0h_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_context_50ugmL_Asc_2i_B_12_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Claudia_CGI/CGI_CpG_conte

# Rosie CGI analysis

In [33]:
# Collect the names in the Rosie folder that end with "deduplicated.txt".
files = listdir("Rosie")
inputs = [name for name in files if name.endswith("deduplicated.txt")]

In [34]:
folder_input = "Rosie/"
out_path = "Rosie_CGI/"

# For every input file: build a BED-like table (columns 2, 3, 3, 4 of the input,
# i.e. the same column is used as both start and end), then split the records
# into those that intersect CpG_Ill and those that do not, saving each set.
for input_temp in inputs:
    file_input = folder_input + input_temp
    # The first line of the file is skipped. Column 2 is read as text so that
    # numeric and non-numeric sequence names get one consistent dtype (this also
    # matches the dtype={0: str} used when these outputs are read back by
    # kmer_analysis).
    df = pd.read_csv(file_input,sep='\t',skiprows=1,header=None,dtype={2: str})
    list_positions = df[[2,3,3,4]].values.tolist()
    temp_bed = pybedtools.BedTool(list_positions)
    temp_CPI = (temp_bed.intersect(CpG_Ill))
    # BedTool subtraction: records of temp_bed that are not in temp_CPI.
    temp_non_CPI = (temp_bed - temp_CPI)

    temp_CPI.saveas(out_path + "CGI_" + input_temp + ".csv")
    temp_non_CPI.saveas(out_path + "non_CGI_" + input_temp + ".csv")

In [35]:
# Collect the names in Rosie_CGI that end with "deduplicated.txt.csv".
files = listdir("Rosie_CGI")
inputs = [name for name in files if name.endswith("deduplicated.txt.csv")]

In [38]:
# Run the k-mer count on every collected file; results go to Rosie_CGI_output.
for cgi_file in inputs:
    kmer_analysis(cgi_file,"Rosie_CGI","Rosie_CGI_output")

Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_0h_R1_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_0h_R2_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_0h_R3_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_12h_R1_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_12h_R2_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_12h_R3_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_18h_R1_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_18h_R2_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TKO_5AZA_18h_R3_RG_fastq.gz_trimmed_bismark_bt2.deduplicated.txt.csv
Rosie_CGI/CGI_CpG_context_iSeq009_TET_TK